Logo Search packages:      
Sourcecode: tagcoll version File versions  Download package

tagcoll.cc

/*
 * tagged collection - Experimental programs to test and study tagged collections
 *
 * Copyright (C) 2003,2004,2005  Enrico Zini
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#define APPNAME PACKAGE
#else
#warning No config.h found: using fallback values
#define APPNAME __FILE__
#define VERSION "unknown"
#endif

#include "CommandlineParser.h"

#include <stdio.h>

#include <stdlib.h>     // getenv

#include <errno.h>

#include <tagcoll/stringf.h>
#include <tagcoll/Exception.h>

#include <tagcoll/CardinalityStore.h>
#include <tagcoll/SmartHierarchy.h>

#include <tagcoll/Consumer.h>
#include <tagcoll/Filter.h>
#include <tagcoll/InputMerger.h>
#include <tagcoll/Implications.h>
#include <tagcoll/Filters.h>
#include <tagcoll/Patches.h>
#include <tagcoll/DerivedTags.h>
#include <tagcoll/ItemGrouper.h>

#include <tagcoll/StdioParserInput.h>
#include <tagcoll/TextFormat.h>
#include <tagcoll/Serializer.h>
#include <tagcoll/Expression.h>

#include <algorithm>

using namespace std;
using namespace Tagcoll;

void printItems(const set<string>& items, const string& prefix = "")
{
      for (set<string>::const_iterator i = items.begin();
                  i != items.end(); i++)
      {
            printf("%.*s%.*s\n", PFSTR(prefix), PFSTR(*i));
      }
}

void printNode(HierarchyNode<string, string>* node, string prefix = "")
{
      OpSet<string> items = node->getItems();
      if (!items.empty())
      {
            printItems(items, prefix + ": ");
//    } else {
//          printf("%.*s: no items\n", PFSTR(prefix));
      }
      
      if (prefix.size() > 0 && prefix[prefix.size() - 1] != '/')
            prefix += '/';

      for (HierarchyNode<string, string>::iterator i = node->begin();
                  i != node->end(); i++)
      {
            printNode(*i, prefix + (*i)->tag());
      }
}

OpSet<string> getItems(HierarchyNode<string, string>* node)
{
      OpSet<string> items = node->getItems();
      
      for (HierarchyNode<string, string>::iterator i = node->begin();
                  i != node->end(); i++)
            items += getItems(*i);

      return items;
}

void readCollection(const string& file, Consumer<string, string>& builder)
      throw (FileException, ParserException)
{
      Converter<string, string> conv;

      if (file == "-")
      {
            StdioParserInput input(stdin, "<stdin>");
            TextFormat<string, string>::parse(conv, conv, input, builder);
      }
      else
      {
            StdioParserInput input(file);
            TextFormat<string, string>::parse(conv, conv, input, builder);
      }
}

PatchList<string, string> readPatches(const string& file)
      throw (FileException, ParserException)
{
      Converter<string, string> conv;

      if (file == "-")
      {
            StdioParserInput input(stdin, "<stdin>");
            return TextFormat<string, string>::parsePatch(conv, conv, input);
      }
      else
      {
            StdioParserInput input(file);
            return TextFormat<string, string>::parsePatch(conv, conv, input);
      }
}

void parseDerivedTags(ParserInput& in, DerivedTags& output)
{
      string tag;
      string expr;

      int c;
      enum {TAG, TAGCOLON, SEXPR, EXPR} state = TAG;
      int line = 1;
      while ((c = in.nextChar()) != ParserInput::Eof)
      {
            if (c == '\n')
            {
                  if (tag.size() > 0 && expr.size() > 0)
                        output.add(tag, expr);
                  else
                        fprintf(stderr, "In derived tags file, ignoring incomplete line %d.\n", line);
                  tag = string();
                  expr = string();
                  state = TAG;
                  line++;
            } else
                  switch (state)
                  {
                        // Read item
                        case TAG:
                              switch (c)
                              {
                                    case ':':
                                          state = TAGCOLON;
                                          break;
                                    default:
                                          tag += c;
                                          break;
                              }
                              break;
                        // After colon on item
                        case TAGCOLON:
                              switch (c)
                              {
                                    case ' ':
                                    case '\t':
                                          state = SEXPR;
                                          break;
                                    case ':':
                                          tag += c;
                                          break;
                                    default:
                                          tag += ':';
                                          tag += c;
                                          state = EXPR;
                                          break;
                              }
                              break;
                        // Space before tag
                        case SEXPR:
                              switch (c)
                              {
                                    case ' ':
                                    case '\t':
                                          break;
                                    default:
                                          expr += c;
                                          state = EXPR;
                                          break;
                              }
                              break;
                        // Read tag
                        case EXPR:
                              expr += c;
                              break;
                  }
      }
}

void readDerivedTags(const string& file, DerivedTags& derivedTags)
      throw (FileException)
{
      if (file == "-")
      {
            StdioParserInput input(stdin, "<stdin>");
            parseDerivedTags(input, derivedTags);
      }
      else
      {
            StdioParserInput input(file);
            parseDerivedTags(input, derivedTags);
      }
}

class CommandlineParserWithCommand : public CommandlineParser
{
protected:
      map<string, int> command_map;

public:
      CommandlineParserWithCommand(const std::string& argv0,
                                                      const std::string& cmdline_summary,
                                                      const std::string& description) throw ()
            : CommandlineParser(argv0, cmdline_summary, description)
      {
            add("version", 0, "version", "print the program version, then exit");
      }

      void addCommand(const std::string name, int id) throw ()
      {
            command_map[name] = id;
      }

      int parse(int& argc, const char**& argv) throw ()
      {
            if (!CommandlineParser::parse(argc, argv))
            {
                  printHelp();
                  return false;
            }
            if (get("help").defined())
            {
                  printHelp();
                  exit(0);
            }
            if (get("version").defined())
            {
                  printf("%s ver." VERSION "\n", APPNAME);
                  exit(0);
            }
            if (argc == 1)
            {
                  printHelp();
                  exit(1);
            }

            string command_string = argv[1];
            map<string, int>::const_iterator cmap_i = command_map.find(command_string);
            if (cmap_i == command_map.end())
            {
                  fprintf(stderr, "Invalid command: \"%.*s\"\n", PFSTR(command_string));
                  printHelp();
                  exit(1);
            }

            for (int i = 1; i < argc; i++)
                  argv[i] = argv[i + 1];
            --argc;
            
            return cmap_i->second;
      }
};

class CommandlineArgs
{
protected:
      int argc;
      const char** argv;
      int _next;

public:
      CommandlineArgs(int argc, const char* argv[]) throw () : argc(argc), argv(argv), _next(1) {}

      // Return true if there is another argument left in the list
      bool hasNext() const throw () { return argc >= _next + 1; }

      // Return the next argument in the list
      string next() throw ()
      {
            if (hasNext())
            {
                  return argv[_next++];
            } else {
                  return "-";
            }
      }
};

enum valid_command { COPY, DIFF, RELATED, IMPLICATIONS, HIERARCHY, CLEANHIERARCHY, REVERSE, FINDSPECIALS };

int main(int argc, const char* argv[])
{
      try {
            CommandlineParserWithCommand opts(APPNAME, "[options] [command] [file1|-] [file2|-]",
                        "Perform various operations on a tagged collection\n\n"
                        "Commands are:\n"
                        "  copy         output the collection\n"
                        "  reverse      \"reverse\" the collection, outputting one with items\n"
                        "               associated to tags\n"
                        "  diff         output a tag patch file with the differences between two files\n"
                        "  related      print a list of items related to the given one\n"
                        "  implications compute a list of tag implications\n"
                        "  hierarchy    build a smart hierarchy with the collection data\n"
                        "  cleanhierarchy build a cleaned smart hierarchy with the collection data\n"
                        "  findspecials generate a smart hierarchy and print, for each toplevel tag,\n"
                        "               what are the items that make it toplevel instead of going below\n"
                        "               another tag.\n");

            /*
            opts.add("hierarchy", 's', "smart-hierarchy", "build a smart hierarchy");
            opts.add("implications", 'm', "show-implications", "output a list of tag implications");
            opts.add("copy", 'c', "copy", "output the collection");
            opts.add("diff", 'd', "diff", "output a tag patch file with the differences between two files (requires two file arguments)");
            */

            opts.add("derived", 'e', "derived-tags-from", "use an external list of derived tags", "file");
            opts.add("extimpl", 'i', "implications-from", "use an external list of implications", "file");
            opts.add("rename", 'r', "rename-from", "rename tags using the given mapping list", "file");
            opts.add("patch", 'p', "patch-with", "apply patches from the given tag patch file", "file");

            opts.add("expanded", 'x', "expanded-output", "produce full (and redundant) output data instead of compact");
            opts.add("groupitems", 'g', "group-items", "group items with the same tagset in the output collection");
            opts.add("distance", 'd', "distance", "set the maximum distance to use for the \"related\" command (defaults to 0)", "num");
            opts.add("flatten", 0, "flatten-threshold", "set the number of total items below which a branch is flattened when using the \"hierarchy\" command (defaults to 0, meaning \"don't flatten\")", "num");
            opts.add("filter", 'f', "filter", "filter out the tags with cardinality less than the given value (defaults to not filter; currently only works when building hierarchies)", "num");
            opts.add("untagged-tag", 0, "untagged-tag", "set item name to use for associating untagged items when using the \"reverse\" command.  If not specified, untagged items are not included in the output.", "tag");
            opts.add("remove-unfaceted", 0, "remove-unfaceted", "while parsing, remove all tags with no facet part.");
            opts.add("remove-tags", 0, "remove-tags", "while parsing, remove all tags matching the given tag expression.", "expression");
            
            //opts.add("verbose", 'v', "verbose", "enable verbose output");
            //opts.add("debug", 0, "debug", "enable debugging output (including verbose output)");

            opts.addCommand("copy", (int)COPY);
            opts.addCommand("reverse", (int)REVERSE);
            opts.addCommand("diff", (int)DIFF);
            opts.addCommand("related", (int)RELATED);
            opts.addCommand("implications", (int)IMPLICATIONS);
            opts.addCommand("hierarchy", (int)HIERARCHY);
            opts.addCommand("cleanhierarchy", (int)CLEANHIERARCHY);
            opts.addCommand("findspecials", (int)FINDSPECIALS);

            // Process the commandline
            valid_command cmd = (valid_command)opts.parse(argc, argv);

            CommandlineArgs args(argc, argv);
            
            string item;
            if (cmd == RELATED)
                  item = args.next();
            string file1 = args.next();

            //fprintf(stderr, "argc: %d file: %.*s\n", argc, PFSTR(file));

            // Prepare the input filter chain
            FilterChain<string, string> filters;
            Substitute<string, string> substitutions;
            PatchList<string, string> patches;
            Implications<string> implications;
            DerivedTags derivedTags;

            if (opts.get("rename").defined())
            {
                  readCollection(opts.get("rename").stringVal(), substitutions.substitutions());
                  filters.appendFilter(substitutions);
            }
            if (opts.get("patch").defined())
            {
                  patches = readPatches(opts.get("patch").stringVal());
                  filters.appendFilter(patches);
            }

            if (opts.get("extimpl").defined())
            {
                  readCollection(opts.get("extimpl").stringVal(), implications);
                  // Pack the structure for faster expansions
                  implications.pack();
            }
            if (opts.get("derived").defined())
                  readDerivedTags(opts.get("derived").stringVal(), derivedTags);

            // Intermix implications and derived tags as seems best
            bool compressOutput = (cmd == COPY && !opts.get("expanded").defined());
            bool hasImpl = opts.get("extimpl").defined();
            bool hasDerv = opts.get("derived").defined();

            AddImplied<string, string> addImplied(implications);
            RemoveImplied<string, string> removeImplied(implications);
            AddDerived<string> addDerived(derivedTags);
            RemoveDerived<string> removeDerived(derivedTags);

            if (compressOutput)
            {
                  if (hasDerv)
                  {
                        // Expand implications
                        if (hasImpl) filters.appendFilter(addImplied);

                        // Remove derived tags computing them using the expanded tag set
                        filters.appendFilter(removeDerived);
                  }

                  // Compress implications
                  if (hasImpl) filters.appendFilter(removeImplied);
            } else {
                  // Expand implications
                  if (hasImpl) filters.appendFilter(addImplied);

                  // Add derived tags computing them using the expanded tag set
                  if (hasDerv)
                  {
                        filters.appendFilter(addDerived);

                        // Add further tags implicated by the derived tags
                        if (hasImpl) filters.appendFilter(addImplied);
                  }
            }

            UnfacetedRemover<string> unfacetedRemover("::");
            if (opts.get("remove-unfaceted").defined())
                  filters.appendFilter(unfacetedRemover);

            FilterTagsByExpression<string, string> filterByExpression("");
            if (opts.get("remove-tags").defined())
            {
                  filterByExpression.setExpression(not Expression(opts.get("remove-tags").stringVal())); 
                  filters.appendFilter(filterByExpression);
            }

            InputMerger<string, string> merger;
            filters.setConsumer(merger);
            readCollection(file1, filters);

            // Perform the correct operation
            switch (cmd)
            {
                  case IMPLICATIONS:
                  {
                        CardinalityStore<string, string> coll;
                        merger.output(coll);

                        Implications<string> newImpls;

                        // Find tag implications
                        OpSet<string> allTags = coll.getAllTags();
                        for (OpSet<string>::const_iterator t = allTags.begin();
                                    t != allTags.end(); t++)
                        {
                              OpSet<string> implied = coll.getImpliedBy(*t);
                              if (!implied.empty())
                                    newImpls.consume(*t, implied);
                        }

                        newImpls.pack();
                        
                        Converter<string, string> conv;
                        TextFormat<string, string> output(conv, conv, stdout);
                        if (opts.get("expanded").defined())
                              newImpls.outputFull(output);
                        else
                              newImpls.output(output);
                        break;
                  }
                  case HIERARCHY:
                  {
                        int flattenThreshold = 0;
                        if (opts.get("flatten").defined())
                              flattenThreshold = opts.get("flatten").intVal();

                        CardinalityStore<string, string> coll;
                        merger.output(coll);

                        if (opts.get("filter").defined())
                              coll.removeTagsWithCardinalityLessThan(opts.get("filter").intVal());

                        // Default operation: build the smart hierarchy
                        HierarchyNode<string, string>* root =
                              new SmartHierarchyNode<string, string>("_top", coll, flattenThreshold);
                        printNode(root, "/");
                        break;
                  }
                  case CLEANHIERARCHY:
                  {
                        int flattenThreshold = 0;
                        if (opts.get("flatten").defined())
                              flattenThreshold = opts.get("flatten").intVal();

                        CardinalityStore<string, string> coll;
                        merger.output(coll);

                        if (opts.get("filter").defined())
                              coll.removeTagsWithCardinalityLessThan(opts.get("filter").intVal());

                        // Default operation: build the smart hierarchy
                        HierarchyNode<string, string>* root = new CleanSmartHierarchyNode<string, string>("_top", coll, flattenThreshold);
                        printNode(root, "/");
                        break;
                  }
                  case DIFF:
                  {
                        InputMerger<string, string> merger2;
                        filters.setConsumer(merger2);
                        readCollection(args.next(), filters);

                        PatchList<string, string> newpatches;
                        newpatches.addPatch(merger, merger2);

                        Converter<string, string> conv;
                        TextFormat<string, string>::outputPatch(conv, conv, newpatches, stdout);
                        break;
                  }
                  case RELATED:
                  {
                        int maxdist = 0;
                        if (opts.get("distance").defined())
                              maxdist = opts.get("distance").intVal();

                        // Split the items on commas
                        string splititem;
                        set<string> splititems;
                        for (string::const_iterator c = item.begin();
                                    c != item.end(); c++)
                              if (*c == ',')
                              {
                                    if (!merger.hasItem(splititem))
                                    {
                                          fprintf(stderr, "Item \"%.*s\" does not exist in the collection\n", PFSTR(splititem));
                                          return 1;
                                    }
                                    splititems.insert(splititem);
                                    splititem = string();
                              } else
                                    splititem += *c;
                        if (!merger.hasItem(splititem))
                        {
                              fprintf(stderr, "Item \"%.*s\" does not exist in the collection\n", PFSTR(splititem));
                              return 1;
                        }
                        splititems.insert(splititem);
                        
                        // Get the tagset as the intersection of the tagsets of all input items
                        set<string>::const_iterator i = splititems.begin();
                        OpSet<string> ts = merger.getTags(*i);
                        for (++i; i != splititems.end(); i++)
                              ts = ts ^ merger.getTags(*i);

                        if (ts.empty())
                        {
                              if (splititems.size() > 1)
                                    fprintf(stderr, "The items %.*s are unrelated: cannot find a barycenter to start computing relationships from.\n", PFSTR(item));
                              else
                                    fprintf(stderr, "The items %.*s has no tags attached.\n", PFSTR(item));
                              return 1;
                        }
                        
                        // Build a full TagCollection
                        CardinalityStore<string, string> coll;
                        merger.output(coll);

                        printItems(coll.getItemsExactMatch(ts));

                        if (maxdist)
                        {
                              // Get the related tagsets
                              list< OpSet<string> > rel = coll.getRelatedTagsets(ts, maxdist);

                              // Print the output list
                              for (list< OpSet<string> >::const_iterator i = rel.begin();
                                          i != rel.end(); i++)
                                    printItems(coll.getItemsExactMatch(*i));
                        }
                        break;
                  }
                  case REVERSE:
                  {
                        ItemGrouper<string, string> reverser;
                        merger.output(reverser);

                        /*
                        if (opts.get("untagged-tag").defined())
                              reverser.setUntaggedItemName(opts.get("untagged-tag").stringVal());
                         */

                        Converter<string, string> conv;
                        TextFormat<string, string> writer(conv, conv, stdout);
                        if (opts.get("groupitems").defined())
                        {
                              reverser.outputReversed(writer);
                        } else
                              // FIXME: we need a filter that does the opposite of ItemGrouper
                              reverser.outputReversed(writer);
                        break;
                  }
                  case COPY:
                  {
                        Converter<string, string> conv;
                        TextFormat<string, string> writer(conv, conv, stdout);
                        if (opts.get("groupitems").defined())
                        {
                              ItemGrouper<string, string> grouper;
                              merger.output(grouper);
                              grouper.output(writer);
                        } else
                              merger.output(writer);
                        break;
                  }
                  case FINDSPECIALS:
                  {
                        int flattenThreshold = 0;
                        if (opts.get("flatten").defined())
                              flattenThreshold = opts.get("flatten").intVal();

                        CardinalityStore<string, string> coll;
                        merger.output(coll);

                        if (opts.get("filter").defined())
                              coll.removeTagsWithCardinalityLessThan(opts.get("filter").intVal());

                        // Default operation: build the smart hierarchy
                        SmartHierarchyNode<string, string> root("_top", coll, flattenThreshold);

                        OpSet<string> seen;
                        for (HierarchyNode<string, string>::iterator i = root.begin();
                                    i != root.end(); i++)
                        {
                              OpSet<string> items = getItems(*i);

                              // Find the items in this branch that are not present in
                              // any of the previous ones
                              OpSet<string> newItems;
                              if (!seen.empty())
                              {
                                    for (OpSet<string>::const_iterator j = items.begin();
                                                j != items.end(); j++)
                                    {
                                          OpSet<string> tags = merger.getTags(*j) ^ seen;
                                          if (tags.empty())
                                                newItems += *j;
                                    }

                                    printf("%.*s: %d items, %d special items:\n",
                                                PFSTR((*i)->tag()), items.size(), newItems.size());

                                    int indent = (*i)->tag().size() + 2;
                                    for (OpSet<string>::const_iterator j = newItems.begin(); j != newItems.end(); j++)
                                          printf("%*s%.*s\n", indent, "", PFSTR(*j));
                              }

                              seen += (*i)->tag();
                        }

                        break;
                  }
            }

            return 0;
      }
      catch (Exception& e)
      {
            fprintf(stderr, "%s: %.*s\n", e.type(), PFSTR(e.desc()));
            return 1;
      }
}

// vim:set ts=4 sw=4:

Generated by  Doxygen 1.6.0   Back to index