diff -Nru alpino-corpus-1.4.0/CMakeLists.txt alpino-corpus-2.0.0/CMakeLists.txt --- alpino-corpus-1.4.0/CMakeLists.txt 2012-11-09 15:36:23.000000000 +0000 +++ alpino-corpus-2.0.0/CMakeLists.txt 2012-11-09 15:36:24.000000000 +0000 @@ -1,14 +1,14 @@ cmake_minimum_required(VERSION 2.6) project(AlpinoCorpus) -set (ALPINOCORPUS_VERSION "1.4.0") +set (ALPINOCORPUS_VERSION "2.0.0") if (NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE) set (CMAKE_BUILD_TYPE Release) endif (NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE) if (NOT WIN32) - set(CMAKE_CXX_FLAGS "-pthread -pedantic -Wno-long-long") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -pedantic -Wno-long-long") endif(NOT WIN32) LIST(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES "${CMAKE_INSTALL_PREFIX}/lib" isSystemDir) @@ -50,10 +50,10 @@ option(USE_REMOTE_CORPUS "Enable remote corpus" OFF) if(USE_REMOTE_CORPUS) -find_package(Boost 1.46.1 COMPONENTS system date_time iostreams +find_package(Boost 1.46.1 COMPONENTS system chrono date_time iostreams filesystem thread regex REQUIRED) else() -find_package(Boost 1.46.1 COMPONENTS system date_time filesystem +find_package(Boost 1.46.1 COMPONENTS system chrono date_time filesystem thread regex REQUIRED) endif() if(Boost_FOUND) @@ -137,10 +137,10 @@ include/AlpinoCorpus/IterImpl.hh include/AlpinoCorpus/CompactCorpusReader.hh include/AlpinoCorpus/LexItem.hh - include/AlpinoCorpus/MultiCorpusReader.hh - include/AlpinoCorpus/RecursiveCorpusReader.hh include/AlpinoCorpus/capi.h include/AlpinoCorpus/macros.hh + include/AlpinoCorpus/tr1wrap/memory.hh + include/AlpinoCorpus/util/Either.hh src/CompactCorpusWriterPrivate.hh src/DirectoryCorpusReaderPrivate.hh src/DzIstream.hh @@ -149,7 +149,6 @@ src/DzOstreamBuf.hh src/CompactCorpusReaderPrivate.hh src/FilterIter.hh - src/MultiCorpusReaderPrivate.hh src/StylesheetIter.hh src/XSLTransformer.hh src/parseMacros.hh @@ -178,9 +177,6 @@ src/FilterIter.cpp src/CompactCorpusReader.cpp src/CompactCorpusReaderPrivate.cpp - src/MultiCorpusReader.cpp - src/MultiCorpusReaderPrivate.cpp - src/RecursiveCorpusReader.cpp src/IterImpl.cpp src/StylesheetIter.cpp src/XSLTransformer.cpp @@ -196,13 +192,19 @@ list(APPEND HEADERS include/AlpinoCorpus/DbCorpusReader.hh include/AlpinoCorpus/DbCorpusWriter.hh + include/AlpinoCorpus/MultiCorpusReader.hh + include/AlpinoCorpus/RecursiveCorpusReader.hh src/DbCorpusReaderPrivate.hh + src/MultiCorpusReaderPrivate.hh ) list(APPEND SOURCES src/DbCorpusReader.cpp src/DbCorpusReaderPrivate.cpp src/DbCorpusWriter.cpp + src/MultiCorpusReader.cpp + src/MultiCorpusReaderPrivate.cpp + src/RecursiveCorpusReader.cpp ) endif() @@ -225,13 +227,13 @@ ) set_target_properties(alpino_corpus PROPERTIES VERSION ${ALPINOCORPUS_VERSION} - SOVERSION 1.4) + SOVERSION 2.0) target_link_libraries(alpino_corpus ${Boost_LIBRARIES}) target_link_libraries(alpino_corpus ${ZLIB_LIBRARIES}) +target_link_libraries(alpino_corpus ${XERCESC_LIBRARY}) +target_link_libraries(alpino_corpus ${XQILLA_LIBRARY}) if(USE_DBXML) - target_link_libraries(alpino_corpus ${XERCESC_LIBRARY}) - target_link_libraries(alpino_corpus ${XQILLA_LIBRARY}) target_link_libraries(alpino_corpus ${DBXML_LIBRARY}) endif(USE_DBXML) target_link_libraries(alpino_corpus ${LIBXML2_LIBRARIES}) diff -Nru alpino-corpus-1.4.0/README alpino-corpus-2.0.0/README --- alpino-corpus-1.4.0/README 2012-11-09 15:36:23.000000000 +0000 +++ alpino-corpus-2.0.0/README 2012-11-09 15:36:24.000000000 +0000 @@ -38,6 +38,12 @@ can catch exceptions and translate them to the language's native error reporting method. +Documentation +------------- + +Documentation for this library can be obtained by running 'doxygen' in +the root of the source archive. + Compilation ----------- diff -Nru alpino-corpus-1.4.0/debian/alpino-corpus-util.install alpino-corpus-2.0.0/debian/alpino-corpus-util.install --- alpino-corpus-1.4.0/debian/alpino-corpus-util.install 2012-11-09 15:36:23.000000000 +0000 +++ alpino-corpus-2.0.0/debian/alpino-corpus-util.install 2012-11-09 15:36:24.000000000 +0000 @@ -1,3 +1,5 @@ -usr/bin/alpinocorpus-act +usr/bin/alpinocorpus-create usr/bin/alpinocorpus-stats +usr/bin/alpinocorpus-xpath +usr/bin/alpinocorpus-xquery usr/bin/alpinocorpus-xslt diff -Nru alpino-corpus-1.4.0/debian/changelog alpino-corpus-2.0.0/debian/changelog --- alpino-corpus-1.4.0/debian/changelog 2012-11-09 15:36:23.000000000 +0000 +++ alpino-corpus-2.0.0/debian/changelog 2012-11-09 15:36:24.000000000 +0000 @@ -1,9 +1,27 @@ +alpino-corpus (2.0.0-1precise1) precise; urgency=low + + * New upstream release. + + -- Daniël de Kok Fri, 09 Nov 2012 01:25:40 -0800 + +alpino-corpus (1.4.1-1precise1) precise; urgency=low + + * New upstream release. + + -- Daniël de Kok Thu, 25 Oct 2012 10:37:49 +0200 + alpino-corpus (1.4.0-1precise1) precise; urgency=low * New upstream release. -- Daniël de Kok Fri, 21 Sep 2012 18:10:23 +0200 +alpino-corpus (1.2.0-1precise1) precise; urgency=low + + * Build for precise Quetzal. + + -- Daniël de Kok Sun, 09 Sep 2012 16:52:01 +0200 + alpino-corpus (1.2.0-1) precise; urgency=low * New upstream release. diff -Nru alpino-corpus-1.4.0/debian/compat alpino-corpus-2.0.0/debian/compat --- alpino-corpus-1.4.0/debian/compat 2012-11-09 15:36:23.000000000 +0000 +++ alpino-corpus-2.0.0/debian/compat 2012-11-09 15:36:24.000000000 +0000 @@ -1 +1 @@ -7 +9 diff -Nru alpino-corpus-1.4.0/debian/control alpino-corpus-2.0.0/debian/control --- alpino-corpus-1.4.0/debian/control 2012-11-09 15:36:23.000000000 +0000 +++ alpino-corpus-2.0.0/debian/control 2012-11-09 15:36:24.000000000 +0000 @@ -1,14 +1,14 @@ Source: alpino-corpus Priority: extra Maintainer: Daniël de Kok -Build-Depends: debhelper (>= 7.0.50~), libboost-dev (>= 1.46.1.1), libboost-filesystem-dev (>= 1.46.1.1), libboost-regex-dev(>= 1.46.1.1), libboost-system-dev (>= 1.46.1.1), libboost-thread-dev (>= 1.46.1.1), cmake (>= 2.8.0), cdbs (>= 0.4.62), libxqilla2.2-dev, libxerces-c-dev, libdbxml-dev (>= 2.5.16), libxml2-dev (>= 2.7.8), libxslt1-dev (>= 1.1.26-7), zlib1g-dev (>= 1.2.3) -Standards-Version: 3.9.2 +Build-Depends: cdbs (>= 0.4.100), debhelper (>= 9.20120115), libboost1.48-dev (>= 1.48), libboost-chrono1.48-dev (>= 1.48), libboost-filesystem1.48-dev (>= 1.48), libboost-regex1.48-dev(>= 1.48), libboost-system1.48-dev (>= 1.48), libboost-thread1.48-dev (>= 1.48), cmake (>= 2.8.0), libxqilla2.2-dev, libxerces-c-dev, libdbxml-dev (>= 2.5.16), libxml2-dev (>= 2.7.8), libxslt1-dev (>= 1.1.26-7), zlib1g-dev (>= 1.2.3) +Standards-Version: 3.9.3 Section: libs Homepage: http://github.com/rug-compling/dact Vcs-Git: git://github.com/rug-compling/alpinocorpus.git Vcs-Browser: https://github.com/rug-compling/alpinocorpus -Package: libalpino-corpus1.0 +Package: libalpino-corpus2.0 Section: libs Suggests: libalpino-corpus-dev, alpino-corpus-util Architecture: any @@ -20,7 +20,7 @@ Package: libalpino-corpus-dev Section: libdevel Architecture: any -Depends: libdbxml-dev (>= 2.5.16), zlib1g-dev, libalpino-corpus1.0 (= ${binary:Version}), ${misc:Depends} +Depends: libdbxml-dev (>= 2.5.16), zlib1g-dev, libalpino-corpus2.0 (= ${binary:Version}), ${misc:Depends} Description: Library for processing XML-based treebanks (development headers) AlpinoCorpus is a library for processing XML-based treebanks. It can read XML corpora in various formats and provides querying facilities. diff -Nru alpino-corpus-1.4.0/debian/libalpino-corpus1.0.dirs alpino-corpus-2.0.0/debian/libalpino-corpus1.0.dirs --- alpino-corpus-1.4.0/debian/libalpino-corpus1.0.dirs 2012-11-09 15:36:23.000000000 +0000 +++ alpino-corpus-2.0.0/debian/libalpino-corpus1.0.dirs 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -usr/lib diff -Nru alpino-corpus-1.4.0/debian/libalpino-corpus1.0.install alpino-corpus-2.0.0/debian/libalpino-corpus1.0.install --- alpino-corpus-1.4.0/debian/libalpino-corpus1.0.install 2012-11-09 15:36:23.000000000 +0000 +++ alpino-corpus-2.0.0/debian/libalpino-corpus1.0.install 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -usr/lib/libalpino_corpus.so.* diff -Nru alpino-corpus-1.4.0/debian/libalpino-corpus2.0.dirs alpino-corpus-2.0.0/debian/libalpino-corpus2.0.dirs --- alpino-corpus-1.4.0/debian/libalpino-corpus2.0.dirs 1970-01-01 00:00:00.000000000 +0000 +++ alpino-corpus-2.0.0/debian/libalpino-corpus2.0.dirs 2012-11-09 15:36:24.000000000 +0000 @@ -0,0 +1 @@ +usr/lib diff -Nru alpino-corpus-1.4.0/debian/libalpino-corpus2.0.install alpino-corpus-2.0.0/debian/libalpino-corpus2.0.install --- alpino-corpus-1.4.0/debian/libalpino-corpus2.0.install 1970-01-01 00:00:00.000000000 +0000 +++ alpino-corpus-2.0.0/debian/libalpino-corpus2.0.install 2012-11-09 15:36:24.000000000 +0000 @@ -0,0 +1 @@ +usr/lib/libalpino_corpus.so.* diff -Nru alpino-corpus-1.4.0/include/AlpinoCorpus/CorpusReader.hh alpino-corpus-2.0.0/include/AlpinoCorpus/CorpusReader.hh --- alpino-corpus-1.4.0/include/AlpinoCorpus/CorpusReader.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/include/AlpinoCorpus/CorpusReader.hh 2012-11-08 12:03:40.000000000 +0000 @@ -6,11 +6,12 @@ #include #include -#include +#include #include #include #include +#include #include namespace alpinocorpus { @@ -79,7 +80,7 @@ enum QueryDialect { XPATH, XQUERY }; /** Is a query valid? */ - bool isValidQuery(QueryDialect d, bool variables, std::string const &q) const; + Either isValidQuery(QueryDialect d, bool variables, std::string const &q) const; /** Execute query. The end of the range is given by end(). */ EntryIterator query(QueryDialect d, std::string const &q) const; @@ -119,7 +120,7 @@ virtual EntryIterator runQueryWithStylesheet(QueryDialect d, std::string const &q, std::string const &stylesheet, std::list const &markerQueries) const; - virtual bool validQuery(QueryDialect d, bool variables, std::string const &q) const; + virtual Either validQuery(QueryDialect d, bool variables, std::string const &q) const; }; } diff -Nru alpino-corpus-1.4.0/include/AlpinoCorpus/DbCorpusReader.hh alpino-corpus-2.0.0/include/AlpinoCorpus/DbCorpusReader.hh --- alpino-corpus-1.4.0/include/AlpinoCorpus/DbCorpusReader.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/include/AlpinoCorpus/DbCorpusReader.hh 2012-11-08 12:03:40.000000000 +0000 @@ -22,7 +22,7 @@ virtual ~DbCorpusReader(); private: - bool validQuery(QueryDialect d, bool variables, std::string const &query) const; + Either validQuery(QueryDialect d, bool variables, std::string const &query) const; EntryIterator getEntries() const; std::string getName() const; std::string readEntry(std::string const &) const; diff -Nru alpino-corpus-1.4.0/include/AlpinoCorpus/DbCorpusWriter.hh alpino-corpus-2.0.0/include/AlpinoCorpus/DbCorpusWriter.hh --- alpino-corpus-1.4.0/include/AlpinoCorpus/DbCorpusWriter.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/include/AlpinoCorpus/DbCorpusWriter.hh 2012-11-08 12:03:40.000000000 +0000 @@ -19,7 +19,12 @@ class ALPINO_CORPUS_EXPORT DbCorpusWriter : public CorpusWriter { public: - /** Open path for writing. */ + /** + * Open path for writing. If the overwrite flag is set to + * true, the corpus is removed before it is opened. + * If this flag is set to false, the corpus will not + * be removed, however entries that already exist are overwritten. + */ DbCorpusWriter(std::string const &path, bool overwrite); ~DbCorpusWriter(); diff -Nru alpino-corpus-1.4.0/include/AlpinoCorpus/MultiCorpusReader.hh alpino-corpus-2.0.0/include/AlpinoCorpus/MultiCorpusReader.hh --- alpino-corpus-1.4.0/include/AlpinoCorpus/MultiCorpusReader.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/include/AlpinoCorpus/MultiCorpusReader.hh 2012-11-08 12:03:40.000000000 +0000 @@ -23,7 +23,8 @@ std::string readEntry(std::string const &) const; std::string readEntryMarkQueries(std::string const &entry, std::list const &queries) const; EntryIterator runXPath(std::string const &query) const; - bool validQuery(QueryDialect d, bool variables, std::string const &query) const; + EntryIterator runXQuery(std::string const &query) const; + Either validQuery(QueryDialect d, bool variables, std::string const &query) const; MultiCorpusReaderPrivate *d_private; }; diff -Nru alpino-corpus-1.4.0/include/AlpinoCorpus/RecursiveCorpusReader.hh alpino-corpus-2.0.0/include/AlpinoCorpus/RecursiveCorpusReader.hh --- alpino-corpus-1.4.0/include/AlpinoCorpus/RecursiveCorpusReader.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/include/AlpinoCorpus/RecursiveCorpusReader.hh 2012-11-08 12:03:40.000000000 +0000 @@ -4,6 +4,7 @@ #include #include +#include namespace alpinocorpus { @@ -29,7 +30,8 @@ std::string readEntry(std::string const &) const; std::string readEntryMarkQueries(std::string const &entry, std::list const &queries) const; EntryIterator runXPath(std::string const &query) const; - bool validQuery(QueryDialect d, bool variables, std::string const &query) const; + EntryIterator runXQuery(std::string const &query) const; + Either validQuery(QueryDialect d, bool variables, std::string const &query) const; RecursiveCorpusReaderPrivate *d_private; }; diff -Nru alpino-corpus-1.4.0/include/AlpinoCorpus/RemoteCorpusReader.hh alpino-corpus-2.0.0/include/AlpinoCorpus/RemoteCorpusReader.hh --- alpino-corpus-1.4.0/include/AlpinoCorpus/RemoteCorpusReader.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/include/AlpinoCorpus/RemoteCorpusReader.hh 2012-11-08 12:03:40.000000000 +0000 @@ -89,7 +89,7 @@ virtual ~RemoteCorpusReader(); private: - bool validQuery(QueryDialect d, bool variables, + Either validQuery(QueryDialect d, bool variables, std::string const &query) const; EntryIterator getEntries() const; std::string getName() const; diff -Nru alpino-corpus-1.4.0/include/AlpinoCorpus/macros.hh alpino-corpus-2.0.0/include/AlpinoCorpus/macros.hh --- alpino-corpus-1.4.0/include/AlpinoCorpus/macros.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/include/AlpinoCorpus/macros.hh 2012-11-08 12:03:40.000000000 +0000 @@ -4,12 +4,14 @@ #include #include +#include + namespace alpinocorpus { typedef std::map Macros; -std::string expandMacros(Macros const ¯os, std::string query); -Macros loadMacros(std::string const &filename); +ALPINO_CORPUS_EXPORT std::string expandMacros(Macros const ¯os, std::string query); +ALPINO_CORPUS_EXPORT Macros loadMacros(std::string const &filename); } diff -Nru alpino-corpus-1.4.0/include/AlpinoCorpus/tr1wrap/memory.hh alpino-corpus-2.0.0/include/AlpinoCorpus/tr1wrap/memory.hh --- alpino-corpus-1.4.0/include/AlpinoCorpus/tr1wrap/memory.hh 1970-01-01 00:00:00.000000000 +0000 +++ alpino-corpus-2.0.0/include/AlpinoCorpus/tr1wrap/memory.hh 2012-11-08 12:03:40.000000000 +0000 @@ -0,0 +1,7 @@ +#include + +#ifdef __GLIBCXX__ +#include +#else +#include +#endif diff -Nru alpino-corpus-1.4.0/include/AlpinoCorpus/util/Either.hh alpino-corpus-2.0.0/include/AlpinoCorpus/util/Either.hh --- alpino-corpus-1.4.0/include/AlpinoCorpus/util/Either.hh 1970-01-01 00:00:00.000000000 +0000 +++ alpino-corpus-2.0.0/include/AlpinoCorpus/util/Either.hh 2012-11-08 12:03:40.000000000 +0000 @@ -0,0 +1,84 @@ +#ifndef ALPINOCORPUS_EITHER_HH +#define ALPINOCORPUS_EITHER_HH + +namespace alpinocorpus { + +/** + * Either is an option type that resembles the Either type in Haskell. + * It's used to indicate that computation results in two possible type: + * the left type or the right type (which could have the same type). + * + * Unfortunately, we do not have algebraic data types in C++, so both + * posibilities are encoded in the same type. The isLeft() and isRight() + * methods can be used to probe whether we have a left or right value. + * + * In error handling the convention is that Left L is an error, + * and information about the error is encoded using the type L. + * Right R is a succesful computation with a result of the type + * R. + * + * Note: we could have used boost::variant, but I think it is kinda + * heavyweight. It does more, but also has a more complex API. + */ +template +class Either { +public: + static Either left(L left); + static Either right(R right); + + bool isLeft(); + bool isRight(); + L left(); + R right(); +private: + enum Result { LEFT, RIGHT }; + + Either(Result result, L left, R right) : + d_result(result), d_left(left), d_right(right) {} + + Result d_result; + L d_left; + R d_right; +}; + +template +inline Either Either::left(L left) +{ + return Either(LEFT, left, R()); +} + +template +inline Either Either::right(R right) +{ + return Either(RIGHT, L(), right); +} + +template +inline bool Either::isLeft() +{ + return d_result == LEFT; +} + +template +inline bool Either::isRight() +{ + return d_result == RIGHT; +} + +template +inline L Either::left() +{ + return d_left; +} + +template +inline R Either::right() +{ + return d_right; +} + +struct Empty {}; + +} + +#endif // ALPINOCORPUS_EITHER_HH diff -Nru alpino-corpus-1.4.0/src/CompactCorpusReaderPrivate.cpp alpino-corpus-2.0.0/src/CompactCorpusReaderPrivate.cpp --- alpino-corpus-1.4.0/src/CompactCorpusReaderPrivate.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/CompactCorpusReaderPrivate.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include diff -Nru alpino-corpus-1.4.0/src/CompactCorpusReaderPrivate.hh alpino-corpus-2.0.0/src/CompactCorpusReaderPrivate.hh --- alpino-corpus-1.4.0/src/CompactCorpusReaderPrivate.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/CompactCorpusReaderPrivate.hh 2012-11-08 12:03:40.000000000 +0000 @@ -2,10 +2,12 @@ #define ALPINO_COMPACT_CORPUSREADER_PRIVATE_HH #include -#include -#include #include +#include + +#include + #include #if defined(BOOST_HAS_THREADS) diff -Nru alpino-corpus-1.4.0/src/CompactCorpusWriterPrivate.hh alpino-corpus-2.0.0/src/CompactCorpusWriterPrivate.hh --- alpino-corpus-1.4.0/src/CompactCorpusWriterPrivate.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/CompactCorpusWriterPrivate.hh 2012-11-08 12:03:40.000000000 +0000 @@ -3,7 +3,7 @@ #include -#include +#include #include diff -Nru alpino-corpus-1.4.0/src/CorpusReader.cpp alpino-corpus-2.0.0/src/CorpusReader.cpp --- alpino-corpus-1.4.0/src/CorpusReader.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/CorpusReader.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -3,16 +3,17 @@ #include #include #include -#include #include #include #include +#include #include #include #include #include +#include #include @@ -50,25 +51,23 @@ } - std::vector collectLexicals(xmlDoc *doc, + std::vector collectLexicals( + std::tr1::shared_ptr doc, std::tr1::unordered_map > const &matchDepth) { std::vector items; - xmlXPathContextPtr xpCtx = xmlXPathNewContext(doc); + std::tr1::shared_ptr xpCtx( + xmlXPathNewContext(doc.get()), xmlXPathFreeContext); if (xpCtx == 0) { - //qDebug() << "Could not make XPath context."; return items; } - xmlXPathObjectPtr xpObj = xmlXPathEvalExpression( - toXmlStr("//node[@word]"), xpCtx); - if (xpObj == 0) { - //qDebug() << "Could not make XPath expression to select active nodes."; - xmlXPathFreeContext(xpCtx); + std::tr1::shared_ptr xpObj(xmlXPathEvalExpression( + toXmlStr("//node[@word]"), xpCtx.get()), xmlXPathFreeObject); + if (xpObj == 0) return items; - } xmlNodeSet *nodeSet = xpObj->nodesetval; if (nodeSet != 0) @@ -80,21 +79,22 @@ if (node->type == XML_ELEMENT_NODE) { xmlAttrPtr wordAttr = xmlHasProp(node, toXmlStr("word")); - xmlChar *word = xmlNodeGetContent(wordAttr->children); + std::tr1::shared_ptr word( + xmlNodeGetContent(wordAttr->children), xmlFree); xmlAttrPtr beginAttr = xmlHasProp(node, toXmlStr("begin")); size_t begin = 0; if (beginAttr) { - xmlChar *beginStr = xmlNodeGetContent(beginAttr->children); + std::tr1::shared_ptr beginStr( + xmlNodeGetContent(beginAttr->children), xmlFree); try { - begin = alpinocorpus::util::parseString(fromXmlStr(beginStr)); + begin = alpinocorpus::util::parseString(fromXmlStr(beginStr.get())); } catch (std::invalid_argument &e) { - //qDebug() << e.what(); } } - alpinocorpus::LexItem item = {fromXmlStr(word), begin, std::set() }; + alpinocorpus::LexItem item = {fromXmlStr(word.get()), begin, std::set() }; std::tr1::unordered_map >::const_iterator matchIter = matchDepth.find(node); @@ -239,32 +239,32 @@ } std::string xmlData(read(entry, markers)); - xmlDocPtr doc; - doc = xmlReadMemory(xmlData.c_str(), xmlData.size(), NULL, NULL, 0); + std::tr1::shared_ptr doc( + xmlReadMemory(xmlData.c_str(), xmlData.size(), NULL, NULL, 0), + xmlFreeDoc); + if (doc == NULL) return std::vector(); // We get the sentence node, we should process its children. - xmlNode *sentenceNode = xmlDocGetRootElement(doc); + xmlNode *sentenceNode = xmlDocGetRootElement(doc.get()); if (sentenceNode == NULL) { - xmlFreeDoc(doc); return std::vector(); } - xmlXPathContextPtr xpCtx = xmlXPathNewContext(doc); + std::tr1::shared_ptr xpCtx( + xmlXPathNewContext(doc.get()), xmlXPathFreeContext); + if (xpCtx == 0) { - //qDebug() << "Could not make XPath context."; - xmlFreeDoc(doc); return std::vector(); } - xmlXPathObjectPtr xpObj = xmlXPathEvalExpression( - toXmlStr("//node[@active='1']"), xpCtx); + std::tr1::shared_ptr xpObj( + xmlXPathEvalExpression(toXmlStr("//node[@active='1']"), xpCtx.get()), + xmlXPathFreeObject); if (xpObj == 0) { //qDebug() << "Could not make XPath expression to select active nodes."; - xmlXPathFreeContext(xpCtx); - xmlFreeDoc(doc); return std::vector(); } @@ -280,15 +280,11 @@ std::vector items = collectLexicals(doc, matchDepth); - xmlXPathFreeObject(xpObj); - xmlXPathFreeContext(xpCtx); - xmlFreeDoc(doc); - return items; } - bool CorpusReader::isValidQuery(QueryDialect d, bool variables, std::string const &q) const + Either CorpusReader::isValidQuery(QueryDialect d, bool variables, std::string const &q) const { std::vector queries; boost::split_regex(queries, q, boost::regex("\\+\\|\\+")); @@ -296,10 +292,13 @@ for (std::vector::const_iterator iter = queries.begin(); iter != queries.end(); ++iter) - if (!validQuery(d, variables, *iter)) - return false; + { + Either result = validQuery(d, variables, *iter); + if (result.isLeft()) + return result; + } - return true; + return Either::right(Empty()); } std::string CorpusReader::name() const @@ -449,34 +448,32 @@ return getSize(); } - bool CorpusReader::validQuery(QueryDialect d, bool variables, std::string const &query) const + Either CorpusReader::validQuery(QueryDialect d, bool variables, std::string const &query) const { if (d != XPATH) - return false; + return Either::left("Only XPath2 queries are supported for this corpus."); // XXX - strip/trim if (query.empty()) - return true; + return Either::right(Empty()); // Prepare context - xmlXPathContextPtr ctx = xmlXPathNewContext(0); + + std::tr1::shared_ptr ctx(xmlXPathNewContext(0), + xmlXPathFreeContext); if (!variables) ctx->flags = XML_XPATH_NOVAR; - xmlSetStructuredErrorFunc(ctx, &ignoreStructuredError); + xmlSetStructuredErrorFunc(ctx.get(), &ignoreStructuredError); // Compile expression - xmlXPathCompExprPtr r = xmlXPathCtxtCompile(ctx, - reinterpret_cast(query.c_str())); + std::tr1::shared_ptr r( + xmlXPathCtxtCompile(ctx.get(), reinterpret_cast(query.c_str())), + xmlXPathFreeCompExpr); - if (!r) { - xmlXPathFreeContext(ctx); - return false; - } - - xmlXPathFreeCompExpr(r); - xmlXPathFreeContext(ctx); - - return true; + if (!r) + return Either::left("Invalid expression"); + + return Either::right(Empty()); } CorpusReader::EntryIterator CorpusReader::query(QueryDialect d, diff -Nru alpino-corpus-1.4.0/src/CorpusReaderFactory.cpp alpino-corpus-2.0.0/src/CorpusReaderFactory.cpp --- alpino-corpus-1.4.0/src/CorpusReaderFactory.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/CorpusReaderFactory.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -50,7 +50,11 @@ CorpusReader *CorpusReaderFactory::openRecursive(std::string const &path, bool dactOnly) { +#if defined(USE_DBXML) return new RecursiveCorpusReader(path, dactOnly); +#else + throw OpenError(path); +#endif } bool CorpusReaderFactory::readerAvailable(ReaderType readerType) diff -Nru alpino-corpus-1.4.0/src/DbCorpusReader.cpp alpino-corpus-2.0.0/src/DbCorpusReader.cpp --- alpino-corpus-1.4.0/src/DbCorpusReader.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/DbCorpusReader.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -42,7 +42,7 @@ return d_private->getSize(); } -bool DbCorpusReader::validQuery(QueryDialect d, bool variables, std::string const &query) const +Either DbCorpusReader::validQuery(QueryDialect d, bool variables, std::string const &query) const { return d_private->isValidQuery(d, variables, query); } diff -Nru alpino-corpus-1.4.0/src/DbCorpusReaderPrivate.cpp alpino-corpus-2.0.0/src/DbCorpusReaderPrivate.cpp --- alpino-corpus-1.4.0/src/DbCorpusReaderPrivate.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/DbCorpusReaderPrivate.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -9,6 +9,7 @@ #include #include #include +#include #include "DbCorpusReaderPrivate.hh" #include "util/url.hh" @@ -82,11 +83,16 @@ std::string value; if (v.isNode()) { - db::XmlDocument doc = v.asDocument(); - value = v.getNodeValue(); - name = doc.getName(); - } else if (v.isString()) - value = v.asString(); + try { + db::XmlDocument doc = v.asDocument(); + name = doc.getName(); + } catch (db::XmlException &) { + // Could not use node as a document. Why is there no isDocument() + // method? + } + } + + value = v.asString(); Entry e = {name, value}; @@ -110,7 +116,7 @@ return new QueryIter(*this); } DbCorpusReaderPrivate::DbCorpusReaderPrivate(std::string const &path) - : mgr(), container() + : mgr(db::DBXML_ALLOW_EXTERNAL_ACCESS), container() { try { db::XmlContainerConfig config; @@ -138,16 +144,16 @@ return container.getName(); } -bool DbCorpusReaderPrivate::validQuery(QueryDialect d, bool variables, std::string const &query) const +Either DbCorpusReaderPrivate::validQuery(QueryDialect d, bool variables, std::string const &query) const { try { db::XmlQueryContext ctx = mgr.createQueryContext(); mgr.prepare(query, ctx); } catch (db::XmlException const &e) { - return false; + return Either::left(e.what()); } - return true; + return Either::right(Empty()); } diff -Nru alpino-corpus-1.4.0/src/DbCorpusReaderPrivate.hh alpino-corpus-2.0.0/src/DbCorpusReaderPrivate.hh --- alpino-corpus-1.4.0/src/DbCorpusReaderPrivate.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/DbCorpusReaderPrivate.hh 2012-11-08 12:03:40.000000000 +0000 @@ -55,7 +55,7 @@ { return const_cast(container).getNumDocuments(); } - bool validQuery(QueryDialect d, bool variables, std::string const &query) const; + Either validQuery(QueryDialect d, bool variables, std::string const &query) const; std::string readEntry(std::string const &) const; EntryIterator runXPath(std::string const &) const; EntryIterator runXQuery(std::string const &) const; diff -Nru alpino-corpus-1.4.0/src/DbCorpusWriter.cpp alpino-corpus-2.0.0/src/DbCorpusWriter.cpp --- alpino-corpus-1.4.0/src/DbCorpusWriter.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/DbCorpusWriter.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -74,8 +74,13 @@ d_container = d_mgr.createContainer(path, config, db::XmlContainer ::NodeContainer); - } else - d_container = d_mgr.openContainer(path, config); + } else { + if (bf::exists(path)) + d_container = d_mgr.openContainer(path, config); + else + d_container = d_mgr.createContainer(path, config, + db::XmlContainer::NodeContainer); + } } catch (db::XmlException const &e) { throw OpenError(path, e.what()); } diff -Nru alpino-corpus-1.4.0/src/DzIstream.hh alpino-corpus-2.0.0/src/DzIstream.hh --- alpino-corpus-1.4.0/src/DzIstream.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/DzIstream.hh 2012-11-08 12:03:40.000000000 +0000 @@ -4,7 +4,7 @@ #include #include -#include +#include #include "DzIstreamBuf.hh" diff -Nru alpino-corpus-1.4.0/src/FilterIter.cpp alpino-corpus-2.0.0/src/FilterIter.cpp --- alpino-corpus-1.4.0/src/FilterIter.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/FilterIter.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -1,7 +1,7 @@ #include #include -#include +#include #include #include @@ -15,21 +15,6 @@ #include namespace { - struct Globals { - Globals(); - virtual ~Globals(); - }; - - static Globals s_globals; - - Globals::Globals() { - XQillaPlatformUtils::initialize(); - } - - Globals::~Globals() { - XQillaPlatformUtils::terminate(); - } - static XQilla s_xqilla; } diff -Nru alpino-corpus-1.4.0/src/FilterIter.hh alpino-corpus-2.0.0/src/FilterIter.hh --- alpino-corpus-1.4.0/src/FilterIter.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/FilterIter.hh 2012-11-08 12:03:40.000000000 +0000 @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include diff -Nru alpino-corpus-1.4.0/src/IterImpl.cpp alpino-corpus-2.0.0/src/IterImpl.cpp --- alpino-corpus-1.4.0/src/IterImpl.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/IterImpl.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -3,6 +3,12 @@ #include +#ifdef _MSC_VER +#include +#define INFINITY (DBL_MAX+DBL_MAX) +#define NAN (INFINITY-INFINITY) +#endif + namespace alpinocorpus { bool IterImpl::hasProgress() { diff -Nru alpino-corpus-1.4.0/src/MultiCorpusReader.cpp alpino-corpus-2.0.0/src/MultiCorpusReader.cpp --- alpino-corpus-1.4.0/src/MultiCorpusReader.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/MultiCorpusReader.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -38,7 +38,7 @@ d_private->push_back(name, reader, recursive); } -bool MultiCorpusReader::validQuery(QueryDialect d, bool variables, std::string const &query) const +Either MultiCorpusReader::validQuery(QueryDialect d, bool variables, std::string const &query) const { return d_private->isValidQuery(d, variables, query); } @@ -59,5 +59,10 @@ return d_private->query(XPATH, query); } +CorpusReader::EntryIterator MultiCorpusReader::runXQuery(std::string const &query) const +{ + return d_private->query(XQUERY, query); +} + } diff -Nru alpino-corpus-1.4.0/src/MultiCorpusReaderPrivate.cpp alpino-corpus-2.0.0/src/MultiCorpusReaderPrivate.cpp --- alpino-corpus-1.4.0/src/MultiCorpusReaderPrivate.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/MultiCorpusReaderPrivate.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -32,6 +33,15 @@ MultiCorpusReaderPrivate::MultiCorpusReaderPrivate() { + DbXml::XmlContainerConfig config; + config.setReadOnly(false); + + // Create an in-memory container. + d_container = d_mgr.createContainer("", config, + DbXml::XmlContainer::NodeContainer); + + // Default container name. + d_container.addAlias("corpus"); } MultiCorpusReaderPrivate::~MultiCorpusReaderPrivate() @@ -134,7 +144,13 @@ CorpusReader::EntryIterator MultiCorpusReaderPrivate::runXPath( std::string const &query) const { - return EntryIterator(new MultiIter(d_corporaMap, query)); + return EntryIterator(new MultiIter(d_corporaMap, query, CorpusReader::XPATH)); +} + +CorpusReader::EntryIterator MultiCorpusReaderPrivate::runXQuery( + std::string const &query) const +{ + return EntryIterator(new MultiIter(d_corporaMap, query, CorpusReader::XQUERY)); } // Iteration over MultiCorpusReaders @@ -158,7 +174,9 @@ MultiCorpusReaderPrivate::MultiIter::MultiIter( Corpora const &corpora, - std::string const &query) : d_hasQuery(true), d_interrupted(false) + std::string const &query, + CorpusReader::QueryDialect dialect) : + d_hasQuery(true), d_query(query), d_dialect(dialect), d_interrupted(false) { #if defined(BOOST_HAS_THREADS) d_currentIterMutex.reset(new boost::mutex); @@ -170,8 +188,6 @@ d_iters.push_back(ReaderIter(iter->first, iter->second.first, iter->second.second)); - d_query = query; - // Initial number of 'iterators'. d_totalIters = d_iters.size(); } @@ -253,7 +269,7 @@ try { if (d_hasQuery) - d_currentIter.reset(new EntryIterator(reader->query(CorpusReader::XPATH, d_query))); + d_currentIter.reset(new EntryIterator(reader->query(d_dialect, d_query))); else d_currentIter.reset(new EntryIterator(reader->entries())); } catch (std::runtime_error &e) @@ -277,16 +293,16 @@ } #ifdef USE_DBXML -bool MultiCorpusReaderPrivate::validQuery(QueryDialect d, bool variables, +Either MultiCorpusReaderPrivate::validQuery(QueryDialect d, bool variables, std::string const &query) const { try { DbXml::XmlQueryContext ctx = d_mgr.createQueryContext(); d_mgr.prepare(query, ctx); } catch (DbXml::XmlException const &e) { - return false; + return Either::left(e.what()); } - return true; + return Either::right(Empty()); #endif } diff -Nru alpino-corpus-1.4.0/src/MultiCorpusReaderPrivate.hh alpino-corpus-2.0.0/src/MultiCorpusReaderPrivate.hh --- alpino-corpus-1.4.0/src/MultiCorpusReaderPrivate.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/MultiCorpusReaderPrivate.hh 2012-11-08 12:03:40.000000000 +0000 @@ -3,8 +3,9 @@ #include #include +#include + #include -#include #include #include @@ -47,7 +48,7 @@ public: MultiIter(Corpora const &corpora); MultiIter(Corpora const &corpora, - std::string const &query); + std::string const &query, CorpusReader::QueryDialect dialect); ~MultiIter(); IterImpl *copy() const; void nextIterator(); @@ -69,6 +70,7 @@ std::string d_currentName; bool d_hasQuery; std::string d_query; + CorpusReader::QueryDialect d_dialect; bool d_interrupted; }; @@ -83,9 +85,13 @@ bool recursive = false); std::string readEntry(std::string const &) const; std::string readEntryMarkQueries(std::string const &entry, std::list const &queries) const; + +protected: + EntryIterator runXPath(std::string const &query) const; + EntryIterator runXQuery(std::string const &query) const; #ifdef USE_DBXML - bool validQuery(QueryDialect d, bool variables, std::string const &query) const; + Either validQuery(QueryDialect d, bool variables, std::string const &query) const; #endif private: @@ -97,6 +103,7 @@ Corpora d_corporaMap; #ifdef USE_DBXML mutable DbXml::XmlManager d_mgr; + DbXml::XmlContainer d_container; #endif }; diff -Nru alpino-corpus-1.4.0/src/RecursiveCorpusReader.cpp alpino-corpus-2.0.0/src/RecursiveCorpusReader.cpp --- alpino-corpus-1.4.0/src/RecursiveCorpusReader.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/RecursiveCorpusReader.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -1,7 +1,7 @@ #include #include -#include +#include #include @@ -27,7 +27,8 @@ std::string readEntry(std::string const &) const; std::string readEntryMarkQueries(std::string const &entry, std::list const &queries) const; EntryIterator runXPath(std::string const &query) const; - bool validQuery(QueryDialect d, bool variables, std::string const &query) const; + EntryIterator runXQuery(std::string const &query) const; + Either validQuery(QueryDialect d, bool variables, std::string const &query) const; private: bf::path d_directory; @@ -62,7 +63,7 @@ return d_private->getSize(); } -bool RecursiveCorpusReader::validQuery(QueryDialect d, bool variables, std::string const &query) const +Either RecursiveCorpusReader::validQuery(QueryDialect d, bool variables, std::string const &query) const { return d_private->isValidQuery(d, variables, query); } @@ -83,6 +84,11 @@ return d_private->query(XPATH, query); } +CorpusReader::EntryIterator RecursiveCorpusReader::runXQuery(std::string const &query) const +{ + return d_private->query(XQUERY, query); +} + // Implementation of the private interface RecursiveCorpusReaderPrivate::RecursiveCorpusReaderPrivate(std::string const &directory, @@ -152,11 +158,17 @@ return d_multiReader->query(CorpusReader::XPATH, query); } -bool RecursiveCorpusReaderPrivate::validQuery(QueryDialect d, bool variables, +CorpusReader::EntryIterator RecursiveCorpusReaderPrivate::runXQuery( + std::string const &query) const +{ + return d_multiReader->query(CorpusReader::XQUERY, query); +} + +Either RecursiveCorpusReaderPrivate::validQuery(QueryDialect d, bool variables, std::string const &query) const { if (!d_multiReader) - return false; + return Either::left("No reader available."); return d_multiReader->isValidQuery(d, variables, query); } diff -Nru alpino-corpus-1.4.0/src/RemoteCorpusReader.cpp alpino-corpus-2.0.0/src/RemoteCorpusReader.cpp --- alpino-corpus-1.4.0/src/RemoteCorpusReader.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/RemoteCorpusReader.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -73,7 +73,7 @@ return d_private->runXQuery(query); } - bool RemoteCorpusReader::validQuery(QueryDialect d, bool variables, + Either RemoteCorpusReader::validQuery(QueryDialect d, bool variables, std::string const &query) const { return d_private->isValidQuery(d, variables, query); diff -Nru alpino-corpus-1.4.0/src/RemoteCorpusReaderPrivate.cpp alpino-corpus-2.0.0/src/RemoteCorpusReaderPrivate.cpp --- alpino-corpus-1.4.0/src/RemoteCorpusReaderPrivate.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/RemoteCorpusReaderPrivate.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -14,6 +14,8 @@ #include #include +#include + #include "RemoteCorpusReaderPrivate.hh" #include "util/GetUrl.hh" #include "util/parseString.hh" @@ -93,7 +95,7 @@ throw std::runtime_error("RemoteCorpusReader: size is unknown"); } - bool RemoteCorpusReaderPrivate::validQuery(QueryDialect d, bool variables, + Either RemoteCorpusReaderPrivate::validQuery(QueryDialect d, bool variables, std::string const &query) const { #ifdef USE_DBXML @@ -101,15 +103,18 @@ DbXml::XmlQueryContext ctx = mgr.createQueryContext(); mgr.prepare(query, ctx); } catch (DbXml::XmlException const &e) { - return false; + return Either::left(e.what()); } - return true; + return Either::right(Empty()); #else util::GetUrl p(d_url + "/validQuery?query=" + util::toPercentEncoding(query)); std::string result = p.body(); boost::to_lower(result); boost::trim(result); - return (result == "true" || result == "yes" || result == "1"); + if (result == "true" || result == "yes" || result == "1") + return Either::right(Empty()); + else + return Either::left("Query could not be validated by the server."); #endif } diff -Nru alpino-corpus-1.4.0/src/RemoteCorpusReaderPrivate.hh alpino-corpus-2.0.0/src/RemoteCorpusReaderPrivate.hh --- alpino-corpus-1.4.0/src/RemoteCorpusReaderPrivate.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/RemoteCorpusReaderPrivate.hh 2012-11-08 12:03:40.000000000 +0000 @@ -4,7 +4,7 @@ #include #include -#include +#include #include #ifdef USE_DBXML @@ -55,7 +55,7 @@ virtual EntryIterator getEntries() const; virtual std::string getName() const; virtual size_t getSize() const; - bool validQuery(QueryDialect d, bool variables, + Either validQuery(QueryDialect d, bool variables, std::string const &query) const; virtual std::string readEntry(std::string const &filename) const; virtual std::string readEntryMarkQueries(std::string const &entry, diff -Nru alpino-corpus-1.4.0/src/XSLTransformer.cpp alpino-corpus-2.0.0/src/XSLTransformer.cpp --- alpino-corpus-1.4.0/src/XSLTransformer.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/XSLTransformer.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -10,6 +10,7 @@ } #include "AlpinoCorpus/Error.hh" +#include #include "XSLTransformer.hh" @@ -22,64 +23,55 @@ XSLTransformer::~XSLTransformer() { - xsltFreeStylesheet(d_xslPtr); } void XSLTransformer::initWithStylesheet(std::string const &xsl) { xmlDocPtr xslDoc = xmlReadMemory(xsl.c_str(), xsl.size(), 0, 0, XSLT_PARSE_OPTIONS); - d_xslPtr = xsltParseStylesheetDoc(xslDoc); + d_xslPtr.reset(xsltParseStylesheetDoc(xslDoc), xsltFreeStylesheet); } std::string XSLTransformer::transform(std::string const &xml) const { // Read XML data intro an xmlDoc. - xmlDocPtr doc = xmlReadMemory(xml.c_str(), xml.size(), 0, 0, 0); + std::tr1::shared_ptr doc( + xmlReadMemory(xml.c_str(), xml.size(), 0, 0, 0), + xmlFreeDoc); if (!doc) throw Error("XSLTransformer::transform: Could not open XML data"); - xsltTransformContextPtr ctx = xsltNewTransformContext(d_xslPtr, doc); - xsltSetCtxtParseOptions(ctx, XSLT_PARSE_OPTIONS); + std::tr1::shared_ptr ctx( + xsltNewTransformContext(d_xslPtr.get(), doc.get()), + xsltFreeTransformContext); + xsltSetCtxtParseOptions(ctx.get(), XSLT_PARSE_OPTIONS); // Transform... - xmlDocPtr res = xsltApplyStylesheetUser(d_xslPtr, doc, NULL, NULL, - NULL, ctx); + std::tr1::shared_ptr res( + xsltApplyStylesheetUser(d_xslPtr.get(), doc.get(), NULL, NULL, NULL, ctx.get()), + xmlFreeDoc); if (!res) { - xsltFreeTransformContext(ctx); - xmlFreeDoc(doc); throw Error("XSLTransformer::transform: Could not apply transformation!"); } else if (ctx->state != XSLT_STATE_OK) { - xsltFreeTransformContext(ctx); - xmlFreeDoc(res); - xmlFreeDoc(doc); throw Error("XSLTransformer::transform: Transformation error, check your query!"); } - xsltFreeTransformContext(ctx); - - xmlChar *output = 0; + xmlChar *bareOutput = 0; int outputLen = -1; - xsltSaveResultToString(&output, &outputLen, res, d_xslPtr); + xsltSaveResultToString(&bareOutput, &outputLen, res.get(), d_xslPtr.get()); + std::tr1::shared_ptr output(bareOutput, xmlFree); if (!output) { - xmlFreeDoc(res); - xmlFreeDoc(doc); throw Error("Could not apply stylesheet!"); } - std::string result(reinterpret_cast(output)); - - // Deallocate memory used for libxml2/libxslt. - xmlFree(output); - xmlFreeDoc(res); - xmlFreeDoc(doc); + std::string result(reinterpret_cast(output.get())); return result; } diff -Nru alpino-corpus-1.4.0/src/XSLTransformer.hh alpino-corpus-2.0.0/src/XSLTransformer.hh --- alpino-corpus-1.4.0/src/XSLTransformer.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/XSLTransformer.hh 2012-11-08 12:03:40.000000000 +0000 @@ -3,6 +3,8 @@ #include +#include + extern "C" { #include } @@ -20,7 +22,7 @@ XSLTransformer &operator=(XSLTransformer const &other); void initWithStylesheet(std::string const &xslt); - xsltStylesheetPtr d_xslPtr; + std::tr1::shared_ptr d_xslPtr; }; } diff -Nru alpino-corpus-1.4.0/src/capi.cpp alpino-corpus-2.0.0/src/capi.cpp --- alpino-corpus-1.4.0/src/capi.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/capi.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -14,6 +14,8 @@ #include #include +#include + extern "C" { #include @@ -67,7 +69,8 @@ alpinocorpus_reader alpinocorpus_open_recursive(char const *path) { - alpinocorpus::CorpusReader *reader; +#if defined(USE_DBXML) + alpinocorpus::CorpusReader *reader = 0; try { reader = alpinocorpus::CorpusReaderFactory::openRecursive(path); @@ -76,6 +79,9 @@ } return new alpinocorpus_reader_t(reader); +#else + return NULL; +#endif } void alpinocorpus_close(alpinocorpus_reader reader) @@ -87,7 +93,7 @@ int alpinocorpus_is_valid_query(alpinocorpus_reader reader, char const *query) { return int(reader->corpusReader->isValidQuery( - alpinocorpus::CorpusReader::XPATH, false, query)); + alpinocorpus::CorpusReader::XPATH, false, query).isRight()); } alpinocorpus_iter alpinocorpus_entry_iter(alpinocorpus_reader corpus) @@ -198,8 +204,8 @@ alpinocorpus_entry ce = reinterpret_cast( std::malloc(sizeof(alpinocorpus_entry_t))); - ce->name = strndup(e.name.c_str(), e.name.size()); - ce->contents = strndup(e.contents.c_str(), e.contents.size()); + ce->name = strdup(e.name.c_str()); + ce->contents = strdup(e.contents.c_str()); return ce; } diff -Nru alpino-corpus-1.4.0/src/parseMacros.cpp alpino-corpus-2.0.0/src/parseMacros.cpp --- alpino-corpus-1.4.0/src/parseMacros.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/parseMacros.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -35,89 +35,89 @@ }; static const char _macros_key_offsets[] = { - 0, 0, 12, 13, 14, 26, 31, 36, - 37, 38, 40, 42, 44, 46, 48, 50, - 52, 54, 66, 79, 91, 104 + 0, 0, 12, 13, 25, 30, 35, 36, + 37, 39, 41, 43, 45, 47, 49, 51, + 53, 65, 78, 90, 103 }; static const char _macros_trans_keys[] = { 13, 32, 35, 95, 9, 10, 48, 57, - 65, 90, 97, 122, 10, 10, 13, 32, - 61, 95, 9, 10, 48, 57, 65, 90, - 97, 122, 13, 32, 61, 9, 10, 13, - 32, 34, 9, 10, 34, 34, 34, 37, - 34, 37, 34, 37, 34, 37, 34, 37, - 34, 37, 34, 37, 34, 37, 13, 32, + 65, 90, 97, 122, 10, 13, 32, 61, + 95, 9, 10, 48, 57, 65, 90, 97, + 122, 13, 32, 61, 9, 10, 13, 32, + 34, 9, 10, 34, 34, 34, 37, 34, + 37, 34, 37, 34, 37, 34, 37, 34, + 37, 34, 37, 34, 37, 13, 32, 35, + 95, 9, 10, 48, 57, 65, 90, 97, + 122, 13, 32, 34, 35, 95, 9, 10, + 48, 57, 65, 90, 97, 122, 13, 32, 35, 95, 9, 10, 48, 57, 65, 90, 97, 122, 13, 32, 34, 35, 95, 9, 10, 48, 57, 65, 90, 97, 122, 13, 32, 35, 95, 9, 10, 48, 57, 65, - 90, 97, 122, 13, 32, 34, 35, 95, - 9, 10, 48, 57, 65, 90, 97, 122, - 13, 32, 35, 95, 9, 10, 48, 57, - 65, 90, 97, 122, 0 + 90, 97, 122, 0 }; static const char _macros_single_lengths[] = { - 0, 4, 1, 1, 4, 3, 3, 1, - 1, 2, 2, 2, 2, 2, 2, 2, - 2, 4, 5, 4, 5, 4 + 0, 4, 1, 4, 3, 3, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, + 4, 5, 4, 5, 4 }; static const char _macros_range_lengths[] = { - 0, 4, 0, 0, 4, 1, 1, 0, + 0, 4, 0, 4, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 4, 4, 4, 4, 4 + 4, 4, 4, 4, 4 }; static const char _macros_index_offsets[] = { - 0, 0, 9, 11, 13, 22, 27, 32, - 34, 36, 39, 42, 45, 48, 51, 54, - 57, 60, 69, 79, 88, 98 + 0, 0, 9, 11, 20, 25, 30, 32, + 34, 37, 40, 43, 46, 49, 52, 55, + 58, 67, 77, 86, 96 }; static const char _macros_indicies[] = { 0, 0, 2, 3, 0, 3, 3, 3, - 1, 1, 4, 0, 4, 5, 5, 6, - 3, 5, 3, 3, 3, 1, 7, 7, - 8, 7, 1, 8, 8, 9, 8, 1, - 10, 1, 11, 1, 13, 14, 12, 16, - 17, 15, 18, 17, 15, 19, 17, 15, - 21, 1, 20, 23, 24, 22, 25, 24, - 22, 1, 24, 22, 0, 0, 2, 3, - 0, 3, 3, 3, 1, 26, 26, 27, - 28, 29, 26, 29, 29, 29, 1, 30, - 30, 2, 3, 30, 3, 3, 3, 1, - 26, 26, 31, 28, 29, 26, 29, 29, - 29, 1, 26, 26, 28, 29, 26, 29, - 29, 29, 1, 0 + 1, 0, 2, 4, 4, 5, 3, 4, + 3, 3, 3, 1, 6, 6, 7, 6, + 1, 7, 7, 8, 7, 1, 9, 1, + 10, 1, 12, 13, 11, 15, 16, 14, + 17, 16, 14, 18, 16, 14, 20, 1, + 19, 22, 23, 21, 24, 23, 21, 1, + 23, 21, 0, 0, 2, 3, 0, 3, + 3, 3, 1, 25, 25, 26, 27, 28, + 25, 28, 28, 28, 1, 29, 29, 2, + 3, 29, 3, 3, 3, 1, 25, 25, + 30, 27, 28, 25, 28, 28, 28, 1, + 25, 25, 27, 28, 25, 28, 28, 28, + 1, 0 }; static const char _macros_trans_targs[] = { - 1, 0, 2, 4, 3, 5, 6, 5, - 6, 7, 8, 9, 10, 11, 13, 10, - 11, 13, 12, 18, 14, 15, 14, 15, - 10, 16, 19, 20, 2, 4, 19, 21 + 1, 0, 2, 3, 4, 5, 4, 5, + 6, 7, 8, 9, 10, 12, 9, 10, + 12, 11, 17, 13, 14, 13, 14, 9, + 15, 18, 19, 2, 3, 18, 20 }; static const char _macros_trans_actions[] = { - 0, 0, 0, 1, 0, 3, 3, 0, - 0, 0, 0, 0, 5, 5, 5, 0, - 0, 0, 0, 0, 9, 9, 0, 0, - 11, 0, 7, 0, 7, 13, 0, 0 + 0, 0, 0, 1, 3, 3, 0, 0, + 0, 0, 0, 5, 5, 5, 0, 0, + 0, 0, 0, 9, 9, 0, 0, 11, + 0, 7, 0, 7, 13, 0, 0 }; static const char _macros_eof_actions[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 7, 0, 7, 7 + 0, 7, 0, 7, 7 }; -static const int macros_start = 17; -static const int macros_first_final = 17; +static const int macros_start = 16; +static const int macros_first_final = 16; static const int macros_error = 0; -static const int macros_en_main = 17; +static const int macros_en_main = 16; #line 30 "src/parseMacros.rl" @@ -371,7 +371,10 @@ std::string context = str.substr(ctxBegin, ctxEnd - ctxBegin); err << context << std::endl; - size_t cursor = errPos - ctxBegin; + size_t cursor = 0; + if (errPos > ctxBegin) + cursor = errPos - ctxBegin; + for (size_t i = 0; i < cursor; ++i) err << " "; err << "^"; diff -Nru alpino-corpus-1.4.0/src/parseMacros.rl alpino-corpus-2.0.0/src/parseMacros.rl --- alpino-corpus-1.4.0/src/parseMacros.rl 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/src/parseMacros.rl 2012-11-08 12:03:40.000000000 +0000 @@ -118,7 +118,7 @@ separator = "\"\"\""; whitespace = [\n\r\t ]+; - comment = '#' [^\n]+ '\n'; + comment = '#' [^\n]* '\n'; key = ([A-Za-z0-9_]+) $ str_char % key; @@ -150,7 +150,10 @@ std::string context = str.substr(ctxBegin, ctxEnd - ctxBegin); err << context << std::endl; - size_t cursor = errPos - ctxBegin; + size_t cursor = 0; + if (errPos > ctxBegin) + cursor = errPos - ctxBegin; + for (size_t i = 0; i < cursor; ++i) err << " "; err << "^"; diff -Nru alpino-corpus-1.4.0/util/CMakeLists.txt alpino-corpus-2.0.0/util/CMakeLists.txt --- alpino-corpus-1.4.0/util/CMakeLists.txt 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/util/CMakeLists.txt 2012-11-08 12:03:40.000000000 +0000 @@ -1,3 +1,5 @@ +if(USE_DBXML) + find_package(LibXslt REQUIRED) if(LIBXSLT_FOUND) include_directories(${LIBXSLT_INCLUDE_DIR}) @@ -21,17 +23,41 @@ common/util.cpp ) -set(ACT_SOURCES - act/main.cpp +set(CREATE_SOURCES + create/main.cpp +) + +add_executable(alpinocorpus-create + ${CREATE_SOURCES} + ${COMMON_HEADERS} + ${COMMON_SOURCES} +) + +target_link_libraries(alpinocorpus-create alpino_corpus) + +set(XPATH_SOURCES + xpath/main.cpp +) + +add_executable(alpinocorpus-xpath + ${XPATH_SOURCES} + ${COMMON_HEADERS} + ${COMMON_SOURCES} +) + +target_link_libraries(alpinocorpus-xpath alpino_corpus) + +set(XQUERY_SOURCES + xquery/main.cpp ) -add_executable(alpinocorpus-act - ${ACT_SOURCES} +add_executable(alpinocorpus-xquery + ${XQUERY_SOURCES} ${COMMON_HEADERS} ${COMMON_SOURCES} ) -target_link_libraries(alpinocorpus-act alpino_corpus) +target_link_libraries(alpinocorpus-xquery alpino_corpus) set(XSLT_HEADERS common/Stylesheet.hh @@ -69,14 +95,14 @@ alpino_corpus ) -install(TARGETS alpinocorpus-act +install( + TARGETS + alpinocorpus-create + alpinocorpus-stats + alpinocorpus-xpath + alpinocorpus-xquery + alpinocorpus-xslt RUNTIME DESTINATION bin ) -install(TARGETS alpinocorpus-xslt - RUNTIME DESTINATION bin -) - -install(TARGETS alpinocorpus-stats - RUNTIME DESTINATION bin -) +endif(USE_DBXML) \ No newline at end of file diff -Nru alpino-corpus-1.4.0/util/act/main.cpp alpino-corpus-2.0.0/util/act/main.cpp --- alpino-corpus-1.4.0/util/act/main.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/util/act/main.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,296 +0,0 @@ -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include - -#if defined(USE_DBXML) - #include -#endif - -#include -#include -#include - -#include -#include - -#include -#include -#include - -using alpinocorpus::CorpusReader; -using alpinocorpus::CorpusWriter; -using alpinocorpus::CompactCorpusWriter; -using alpinocorpus::Entry; -using alpinocorpus::LexItem; - -#if defined(USE_DBXML) -using alpinocorpus::DbCorpusWriter; -#endif - -namespace bf = boost::filesystem; -namespace tr1 = std::tr1; - -void listCorpus(tr1::shared_ptr reader, - std::string const &query, bool bracketed = false) -{ - CorpusReader::EntryIterator i; - - if (query.empty()) - i = reader->entries(); - else - i = reader->query(CorpusReader::XPATH, query); - - NotEqualsPrevious pred; - - tr1::unordered_set seen; - while (i.hasNext()) - { - Entry entry = i.next(*reader); - if (seen.find(entry.name) == seen.end()) { - std::cout << entry.name; - - if (bracketed) { - std::cout << " "; - - std::vector items = reader->sentence(entry.name, query); - - size_t prevDepth = 0; - for (std::vector::const_iterator itemIter = items.begin(); - itemIter != items.end(); ++itemIter) - { - size_t depth = itemIter->matches.size(); - - if (depth != prevDepth) { - if (depth == 0) - std::cout << "\033[0;22m"; - else if (depth == 1) - std::cout << "\033[38;5;99m"; - else if (depth == 2) - std::cout << "\033[38;5;111m"; - else if (depth == 3) - std::cout << "\033[38;5;123m"; - else if (depth == 4) - std::cout << "\033[38;5;121m"; - else - std::cout << "\033[38;5;119m"; - } - - std::cout << itemIter->word; - - std::vector::const_iterator next = itemIter + 1; - if (next != items.end() && next->matches.size() < depth) - std::cout << "\033[0;22m"; - - std::cout << " "; - - prevDepth = depth; - } - - std::cout << "\033[0;22m" << std::endl; - } - - std::cout << std::endl; - seen.insert(entry.name); - } - } -} - -void readEntry(tr1::shared_ptr reader, std::string const &entry) -{ - std::cout << reader->read(entry); -} - -void usage(std::string const &programName) -{ - std::cerr << "Usage: " << programName << " [OPTION] treebanks" << - std::endl << std::endl << - " -c filename\tCreate a compact corpus archive" << std::endl << -#if defined(USE_DBXML) - " -d filename\tCreate a Dact dbxml archive" << std::endl << -#endif - " -g entry\tPrint a treebank entry to stdout" << std::endl << - " -l\t\tList the entries of a treebank" << std::endl << - " -m filename\tLoad macro file" << std::endl << - " -q query\tFilter the treebank using the given query" << std::endl << - " -s\t\tInclude a bracketed sentence" << std::endl << - " -r\t\tProcess a directory of corpora recursively" << std::endl << std::endl; -} - -void writeCorpus(tr1::shared_ptr reader, - tr1::shared_ptr writer, - std::string const &query) -{ - CorpusReader::EntryIterator i; - if (query.empty()) - i = reader->entries(); - else - i = reader->query(CorpusReader::XPATH, query); - - // We need to be *really* sure when writing a corpus that an entry was not written - // before. So, we'll use a set, rather than a basic filter. - tr1::unordered_set seen; - while (i.hasNext()) { - Entry e = i.next(*reader); - - if (seen.find(e.name) == seen.end()) { - writer->write(e.name, reader->read(e.name)); - seen.insert(e.name); - } else - std::cerr << "Duplicate entry: " << e.name << std::endl; - } -} - -int main(int argc, char *argv[]) -{ - boost::scoped_ptr opts; - try { - opts.reset(new ProgramOptions(argc, const_cast(argv), - "c:d:g:lm:q:rs")); - } catch (std::exception &e) { - std::cerr << e.what() << std::endl; - return 1; - } - - if (opts->arguments().size() == 0) - { - usage(opts->programName()); - return 1; - } - - size_t cmdCount = 0; - char const commands[] = "cdgl"; - for (size_t i = 0; i < sizeof(commands); ++i) - if (opts->option(commands[i])) - ++cmdCount; - - if (cmdCount > 1) { - std::cerr << opts->programName() << - ": the '-c', 'd', '-g', and '-l' options cannot be used simultaneously." << - std::endl; - return 1; - } - - if (cmdCount == 0) { - std::cerr << opts->programName() << - ": one of the '-c', 'd', -g' or '-l' option should be used." << - std::endl; - return 1; - } - - tr1::shared_ptr reader; - try { - if (opts->arguments().size() == 1) - reader = tr1::shared_ptr( - openCorpus(opts->arguments().at(0), opts->option('r'))); - else - reader = tr1::shared_ptr( - openCorpora(opts->arguments().begin(), - opts->arguments().end(), opts->option('r'))); - } catch (std::runtime_error &e) { - std::cerr << "Could not open corpus: " << e.what() << std::endl; - return 1; - } - - alpinocorpus::Macros macros; - if (opts->option('m')) { - std::string macrosFn = opts->optionValue('m'); - try { - macros = alpinocorpus::loadMacros(macrosFn); - } catch (std::runtime_error &e) { - std::cerr << e.what() << std::endl; - return 1; - } - } - - std::string query; - if (opts->option('q')) { - query = alpinocorpus::expandMacros(macros, opts->optionValue('q')); - - if (!reader->isValidQuery(CorpusReader::XPATH, false, query)) { - std::cerr << "Invalid (or unwanted) query: " << query << std::endl; - return 1; - } - } - - if (opts->option('d')) { - try { - std::string treebankOut = opts->optionValue('d').c_str(); - - // XXX - needs a more sophisticated check now, the output treebank - // could also be in the search path of a recursive reader. - for (std::vector::const_iterator iter = - opts->arguments().begin(); iter != opts->arguments().end(); - ++iter) - if (bf::equivalent(treebankOut, *iter)) - throw std::runtime_error("Attempting to write to the source treebank."); - -#if defined(USE_DBXML) - tr1::shared_ptr wr(new DbCorpusWriter(treebankOut, true)); - writeCorpus(reader, wr, query); -#else - throw std::runtime_error("AlpinoCorpus was compiled without DBXML support."); -#endif // defined(USE_DBXML) - - } catch (std::runtime_error const &e) { - std::cerr << opts->programName() << - ": error creating Dact treebank: " << e.what() << std::endl; - return 1; - } - } - else if (opts->option('c')) { - try { - std::string treebankOut = opts->optionValue('c').c_str(); - - // XXX - needs a more sophisticated check now, the output treebank - // could also be in the search path of a recursive reader. - for (std::vector::const_iterator iter = - opts->arguments().begin(); iter != opts->arguments().end(); - ++iter) - if (bf::equivalent(treebankOut, *iter)) - throw std::runtime_error("Attempting to write to the source treebank."); - - tr1::shared_ptr wr(new CompactCorpusWriter(treebankOut)); - writeCorpus(reader, wr, query); - - } catch (std::runtime_error const &e) { - std::cerr << opts->programName() << - ": error creating compact corpus: " << e.what() << std::endl; - return 1; - } - } - else if (opts->option('g')) { - try { - readEntry(reader, opts->optionValue('g')); - } catch (std::runtime_error const &e) { - std::cerr << opts->programName() << - ": error reading entry: " << e.what() << std::endl; - return 1; - } - } - else if (opts->option('l')) { - try { - listCorpus(reader, query, opts->option('s')); - } catch (std::runtime_error const &e) { - std::cerr << opts->programName() << - ": error listing treebank: " << e.what() << std::endl; - return 1; - } - } - - return 0; -} diff -Nru alpino-corpus-1.4.0/util/common/EqualsPrevious.hh alpino-corpus-2.0.0/util/common/EqualsPrevious.hh --- alpino-corpus-1.4.0/util/common/EqualsPrevious.hh 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/util/common/EqualsPrevious.hh 2012-11-08 12:03:40.000000000 +0000 @@ -3,7 +3,7 @@ #include -#include +#include template class EqualsPrevious : public std::unary_function diff -Nru alpino-corpus-1.4.0/util/common/util.cpp alpino-corpus-2.0.0/util/common/util.cpp --- alpino-corpus-1.4.0/util/common/util.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/util/common/util.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -17,7 +17,7 @@ CorpusReader* openCorpus(std::string const &path, bool recursive) { - if (recursive) + if (recursive && bf::is_directory(bf::path(path))) return CorpusReaderFactory::openRecursive(path, false); else return CorpusReaderFactory::open(path); @@ -36,6 +36,9 @@ // If we are dealing with a directory, and the path ends with a trailing // slash, we remove the slash. bf::path p = bf::path(*iter); + + bool isDir = bf::is_directory(p); + if (bf::is_directory(p) && iter->rfind('/') == iter->size() - 1) p = bf::path(iter->substr(0, iter->size() - 1)); @@ -43,9 +46,10 @@ p.replace_extension(""); // Use the last path component as the corpus name. - std::string name = p.filename().generic_string(); + + std::string name = isDir ? p.generic_string() : p.filename().generic_string(); - readers->push_back(name, *iter, recursive); + readers->push_back(name, *iter, recursive && isDir); } return readers; diff -Nru alpino-corpus-1.4.0/util/create/main.cpp alpino-corpus-2.0.0/util/create/main.cpp --- alpino-corpus-1.4.0/util/create/main.cpp 1970-01-01 00:00:00.000000000 +0000 +++ alpino-corpus-2.0.0/util/create/main.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#if defined(USE_DBXML) + #include +#endif + +#include +#include +#include + +#include +#include + +#include +#include +#include + +using alpinocorpus::CorpusReader; +using alpinocorpus::CorpusWriter; +using alpinocorpus::CompactCorpusWriter; +using alpinocorpus::Either; +using alpinocorpus::Entry; +using alpinocorpus::LexItem; + +#if defined(USE_DBXML) +using alpinocorpus::DbCorpusWriter; +#endif + +namespace bf = boost::filesystem; +namespace tr1 = std::tr1; + +void usage(std::string const &programName) +{ + std::cerr << "Usage: " << programName << " [OPTION] treebanks" << + std::endl << std::endl << + " -c filename\tCreate a compact corpus archive" << std::endl << +#if defined(USE_DBXML) + " -d filename\tCreate a Dact dbxml archive" << std::endl << +#endif + " -m filename\tLoad macro file" << std::endl << + " -q query\tFilter the treebank using the given query" << std::endl << + " -r\t\tProcess a directory of corpora recursively" << std::endl << std::endl; +} + +void writeCorpus(tr1::shared_ptr reader, + tr1::shared_ptr writer, + std::string const &query) +{ + CorpusReader::EntryIterator i; + if (query.empty()) + i = reader->entries(); + else + i = reader->query(CorpusReader::XPATH, query); + + // We need to be *really* sure when writing a corpus that an entry was not written + // before. So, we'll use a set, rather than a basic filter. + tr1::unordered_set seen; + while (i.hasNext()) { + Entry e = i.next(*reader); + + if (seen.find(e.name) == seen.end()) { + writer->write(e.name, reader->read(e.name)); + seen.insert(e.name); + } else + std::cerr << "Duplicate entry: " << e.name << std::endl; + } +} + +int main(int argc, char *argv[]) +{ + boost::scoped_ptr opts; + try { + opts.reset(new ProgramOptions(argc, const_cast(argv), + "c:d:m:q:r")); + } catch (std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } + + if (opts->arguments().size() == 0) + { + usage(opts->programName()); + return 1; + } + + if (opts->option('c') && opts->option('d')) { + std::cerr << opts->programName() << + ": the '-c' and 'd' options cannot be used simultaneously." << + std::endl; + return 1; + } + + if (!opts->option('c') && !opts->option('d')) { + std::cerr << opts->programName() << + ": one of the '-c' or 'd' options should be used." << + std::endl; + return 1; + } + + tr1::shared_ptr reader; + try { + if (opts->arguments().size() == 1) + reader = tr1::shared_ptr( + openCorpus(opts->arguments().at(0), opts->option('r'))); + else + reader = tr1::shared_ptr( + openCorpora(opts->arguments().begin(), + opts->arguments().end(), opts->option('r'))); + } catch (std::runtime_error &e) { + std::cerr << "Could not open corpus: " << e.what() << std::endl; + return 1; + } + + alpinocorpus::Macros macros; + if (opts->option('m')) { + std::string macrosFn = opts->optionValue('m'); + try { + macros = alpinocorpus::loadMacros(macrosFn); + } catch (std::runtime_error &e) { + std::cerr << e.what() << std::endl; + return 1; + } + } + + std::string query; + if (opts->option('q')) { + query = alpinocorpus::expandMacros(macros, opts->optionValue('q')); + + Either valid = + reader->isValidQuery(CorpusReader::XPATH, false, query); + if (valid.isLeft()) { + std::cerr << "Invalid (or unwanted) query: " << query << std::endl << std::endl; + std::cerr << valid.left() << std::endl; + return 1; + } + } + + if (opts->option('d')) { + try { + std::string treebankOut = opts->optionValue('d').c_str(); + + // XXX - needs a more sophisticated check now, the output treebank + // could also be in the search path of a recursive reader. + for (std::vector::const_iterator iter = + opts->arguments().begin(); iter != opts->arguments().end(); + ++iter) + if (bf::equivalent(treebankOut, *iter)) + throw std::runtime_error("Attempting to write to the source treebank."); + +#if defined(USE_DBXML) + tr1::shared_ptr wr(new DbCorpusWriter(treebankOut, true)); + writeCorpus(reader, wr, query); +#else + throw std::runtime_error("AlpinoCorpus was compiled without DBXML support."); +#endif // defined(USE_DBXML) + + } catch (std::runtime_error const &e) { + std::cerr << opts->programName() << + ": error creating Dact treebank: " << e.what() << std::endl; + return 1; + } + } + else if (opts->option('c')) { + try { + std::string treebankOut = opts->optionValue('c').c_str(); + + // XXX - needs a more sophisticated check now, the output treebank + // could also be in the search path of a recursive reader. + for (std::vector::const_iterator iter = + opts->arguments().begin(); iter != opts->arguments().end(); + ++iter) + if (bf::equivalent(treebankOut, *iter)) + throw std::runtime_error("Attempting to write to the source treebank."); + + tr1::shared_ptr wr(new CompactCorpusWriter(treebankOut)); + writeCorpus(reader, wr, query); + + } catch (std::runtime_error const &e) { + std::cerr << opts->programName() << + ": error creating compact corpus: " << e.what() << std::endl; + return 1; + } + } + + return 0; +} diff -Nru alpino-corpus-1.4.0/util/stats/main.cpp alpino-corpus-2.0.0/util/stats/main.cpp --- alpino-corpus-1.4.0/util/stats/main.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/util/stats/main.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -4,7 +4,8 @@ #include #include -#include +#include + #include #include @@ -16,6 +17,7 @@ #include using alpinocorpus::CorpusReader; +using alpinocorpus::Either; namespace tr1 = std::tr1; @@ -62,8 +64,7 @@ std::cerr << "Usage: " << programName << " [OPTION] query treebanks" << std::endl << std::endl << " -m filename\tLoad macro file" << std::endl << - " -p\t\tRelative item frequencies" << std::endl << - " -r\t\tProcess a directory of corpora recursively" << std::endl << std::endl; + " -p\t\tRelative item frequencies" << std::endl << std::endl; } @@ -72,7 +73,7 @@ boost::scoped_ptr opts; try { opts.reset(new ProgramOptions(argc, const_cast(argv), - "m:pr")); + "m:p")); } catch (std::exception &e) { std::cerr << e.what() << std::endl; return 1; @@ -88,11 +89,11 @@ try { if (opts->arguments().size() == 1) reader = tr1::shared_ptr( - openCorpus(opts->arguments().at(0), opts->option('r'))); + openCorpus(opts->arguments().at(0), true)); else reader = tr1::shared_ptr( openCorpora(opts->arguments().begin() + 1, - opts->arguments().end(), opts->option('r'))); + opts->arguments().end(), true)); } catch (std::runtime_error &e) { std::cerr << "Could not open corpus: " << e.what() << std::endl; return 1; @@ -110,11 +111,14 @@ } std::string query = alpinocorpus::expandMacros(macros, opts->arguments().at(0)); - if (!reader->isValidQuery(CorpusReader::XPATH, false, query)) { - std::cerr << "Invalid (or unwanted) query: " << query << std::endl; - return 1; + Either valid = + reader->isValidQuery(CorpusReader::XPATH, false, query); + if (valid.isLeft()) { + std::cerr << "Invalid (or unwanted) query: " << query << std::endl << std::endl; + std::cerr << valid.left() << std::endl; + return 1; } - + ValueCounts counts(countQuery(reader, query)); printFrequencies(reader, counts, opts->option('p')); } diff -Nru alpino-corpus-1.4.0/util/xpath/main.cpp alpino-corpus-2.0.0/util/xpath/main.cpp --- alpino-corpus-1.4.0/util/xpath/main.cpp 1970-01-01 00:00:00.000000000 +0000 +++ alpino-corpus-2.0.0/util/xpath/main.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -0,0 +1,177 @@ +#include +#include +#include + +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +using alpinocorpus::CorpusReader; +using alpinocorpus::Either; +using alpinocorpus::Entry; +using alpinocorpus::LexItem; + +namespace bf = boost::filesystem; +namespace tr1 = std::tr1; + +void listCorpus(tr1::shared_ptr reader, + std::string const &query, bool bracketed = false) +{ + CorpusReader::EntryIterator i; + + if (query.empty()) + i = reader->entries(); + else + i = reader->query(CorpusReader::XPATH, query); + + NotEqualsPrevious pred; + + tr1::unordered_set seen; + while (i.hasNext()) + { + Entry entry = i.next(*reader); + if (seen.find(entry.name) == seen.end()) { + std::cout << entry.name; + + if (bracketed) { + std::cout << " "; + + std::vector items = reader->sentence(entry.name, query); + + size_t prevDepth = 0; + for (std::vector::const_iterator itemIter = items.begin(); + itemIter != items.end(); ++itemIter) + { + size_t depth = itemIter->matches.size(); + + if (depth != prevDepth) { + if (depth == 0) + std::cout << "\033[0;22m"; + else if (depth == 1) + std::cout << "\033[38;5;99m"; + else if (depth == 2) + std::cout << "\033[38;5;111m"; + else if (depth == 3) + std::cout << "\033[38;5;123m"; + else if (depth == 4) + std::cout << "\033[38;5;121m"; + else + std::cout << "\033[38;5;119m"; + } + + std::cout << itemIter->word; + + std::vector::const_iterator next = itemIter + 1; + if (next != items.end() && next->matches.size() < depth) + std::cout << "\033[0;22m"; + + std::cout << " "; + + prevDepth = depth; + } + + std::cout << "\033[0;22m" << std::endl; + } + + std::cout << std::endl; + seen.insert(entry.name); + } + } +} + +void readEntry(tr1::shared_ptr reader, std::string const &entry) +{ + std::cout << reader->read(entry); +} + +void usage(std::string const &programName) +{ + std::cerr << "Usage: " << programName << " [OPTION] treebank(s)" << + std::endl << std::endl << + " -m filename\tLoad macro file" << std::endl << + " -q query\tFilter the treebank using the given query" << std::endl << + " -s\t\tInclude a bracketed sentence" << std::endl << std::endl; +} + +int main(int argc, char *argv[]) +{ + boost::scoped_ptr opts; + try { + opts.reset(new ProgramOptions(argc, const_cast(argv), + "m:q:s")); + } catch (std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } + + if (opts->arguments().size() == 0) + { + usage(opts->programName()); + return 1; + } + + tr1::shared_ptr reader; + try { + if (opts->arguments().size() == 1) + reader = tr1::shared_ptr( + openCorpus(opts->arguments().at(0), true)); + else + reader = tr1::shared_ptr( + openCorpora(opts->arguments().begin(), + opts->arguments().end(), true)); + } catch (std::runtime_error &e) { + std::cerr << "Could not open corpus: " << e.what() << std::endl; + return 1; + } + + alpinocorpus::Macros macros; + if (opts->option('m')) { + std::string macrosFn = opts->optionValue('m'); + try { + macros = alpinocorpus::loadMacros(macrosFn); + } catch (std::runtime_error &e) { + std::cerr << e.what() << std::endl; + return 1; + } + } + + std::string query; + if (opts->option('q')) { + query = alpinocorpus::expandMacros(macros, opts->optionValue('q')); + + Either valid = + reader->isValidQuery(CorpusReader::XPATH, false, query); + if (valid.isLeft()) { + std::cerr << "Invalid (or unwanted) query: " << query << std::endl << std::endl; + std::cerr << valid.left() << std::endl; + return 1; + } + } + + try { + listCorpus(reader, query, opts->option('s')); + } catch (std::runtime_error const &e) { + std::cerr << opts->programName() << + ": error listing treebank: " << e.what() << std::endl; + return 1; + } + + return 0; +} diff -Nru alpino-corpus-1.4.0/util/xquery/main.cpp alpino-corpus-2.0.0/util/xquery/main.cpp --- alpino-corpus-1.4.0/util/xquery/main.cpp 1970-01-01 00:00:00.000000000 +0000 +++ alpino-corpus-2.0.0/util/xquery/main.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -0,0 +1,140 @@ +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "../src/util/textfile.hh" + +using alpinocorpus::CorpusReader; +using alpinocorpus::Entry; +using alpinocorpus::Either; + +namespace tr1 = std::tr1; + +void listCorpus(tr1::shared_ptr reader, + std::string const &query) +{ + CorpusReader::EntryIterator i; + + i = reader->query(CorpusReader::XQUERY, query); + + while (i.hasNext()) + { + Entry entry = i.next(*reader); + std::cout << entry.contents << std::endl; + } +} + +void readEntry(tr1::shared_ptr reader, std::string const &entry) +{ + std::cout << reader->read(entry); +} + +void usage(std::string const &programName) +{ + std::cerr << "Usage: " << programName << " [OPTION] treebanks" << + std::endl << std::endl << + " -f filename\tRead XQuery program from file" << std::endl << + " -m filename\tLoad macro file" << std::endl << + " -q query\tFilter the treebank using the given query" << std::endl << std::endl; +} + +int main(int argc, char *argv[]) +{ + boost::scoped_ptr opts; + try { + opts.reset(new ProgramOptions(argc, const_cast(argv), + "f:m:q:")); + } catch (std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } + + if (opts->arguments().size() == 0) + { + usage(opts->programName()); + return 1; + } + + if (!opts->option('q') && !opts->option('f')) { + std::cerr << opts->programName() << + ": you should provide a query with -q or -f." << + std::endl; + return 1; + } + + if (opts->option('q') && opts->option('f')) { + std::cerr << opts->programName() << + ": -q and -f are mutually exclusive." << + std::endl; + return 1; + } + + tr1::shared_ptr reader; + try { + if (opts->arguments().size() == 1) + reader = tr1::shared_ptr( + openCorpus(opts->arguments().at(0), true)); + else + reader = tr1::shared_ptr( + openCorpora(opts->arguments().begin(), + opts->arguments().end(), true)); + } catch (std::runtime_error &e) { + std::cerr << "Could not open corpus: " << e.what() << std::endl; + return 1; + } + + alpinocorpus::Macros macros; + if (opts->option('m')) { + std::string macrosFn = opts->optionValue('m'); + try { + macros = alpinocorpus::loadMacros(macrosFn); + } catch (std::runtime_error &e) { + std::cerr << e.what() << std::endl; + return 1; + } + } + + std::string query; + if (opts->option('q')) { + query = alpinocorpus::expandMacros(macros, opts->optionValue('q')); + } else if (opts->option('f')) { + try { + query = alpinocorpus::util::readFile(opts->optionValue('f')); + } catch (std::runtime_error &e) { + std::cerr << e.what() << std::endl; + return 1; + } + } + + Either valid = + reader->isValidQuery(CorpusReader::XPATH, false, query); + if (valid.isLeft()) { + std::cerr << "Invalid (or unwanted) query: " << query << std::endl << std::endl; + std::cerr << valid.left() << std::endl; + return 1; + } + + try { + listCorpus(reader, query); + } catch (std::runtime_error const &e) { + std::cerr << opts->programName() << + ": error listing treebank: " << e.what() << std::endl; + return 1; + } + + return 0; +} diff -Nru alpino-corpus-1.4.0/util/xslt/main.cpp alpino-corpus-2.0.0/util/xslt/main.cpp --- alpino-corpus-1.4.0/util/xslt/main.cpp 2012-09-21 13:18:58.000000000 +0000 +++ alpino-corpus-2.0.0/util/xslt/main.cpp 2012-11-08 12:03:40.000000000 +0000 @@ -2,9 +2,9 @@ #include #include -#include -#include +#include +#include #include extern "C" { @@ -27,6 +27,7 @@ #include using alpinocorpus::CorpusReader; +using alpinocorpus::Either; namespace tr1 = std::tr1; @@ -83,8 +84,7 @@ std::endl << std::endl << " -g entry\tApply the stylesheet to a single entry" << std::endl << " -m filename\tLoad macro file" << std::endl << - " -q query\tFilter the treebank using the given query" << std::endl << - " -r\t\tProcess a directory of corpora recursively" << std::endl << std::endl; + " -q query\tFilter the treebank using the given query" << std::endl << std::endl; } int main (int argc, char *argv[]) @@ -101,7 +101,7 @@ boost::scoped_ptr opts; try { opts.reset(new ProgramOptions(argc, const_cast(argv), - "g:m:q:r")); + "g:m:q:")); } catch (std::exception &e) { std::cerr << e.what() << std::endl; return 1; @@ -125,11 +125,11 @@ try { if (opts->arguments().size() == 2) reader = tr1::shared_ptr( - openCorpus(opts->arguments().at(1), opts->option('r'))); + openCorpus(opts->arguments().at(1), true)); else reader = tr1::shared_ptr( openCorpora(opts->arguments().begin() + 1, - opts->arguments().end(), opts->option('r'))); + opts->arguments().end(), true)); } catch (std::runtime_error &e) { std::cerr << "Could not open corpus: " << e.what() << std::endl; return 1; @@ -150,8 +150,11 @@ if (opts->option('q')) { query = alpinocorpus::expandMacros(macros, opts->optionValue('q')); - if (!reader->isValidQuery(CorpusReader::XPATH, false, query)) { - std::cerr << "Invalid (or unwanted) query: " << query << std::endl; + Either valid = + reader->isValidQuery(CorpusReader::XPATH, false, query); + if (valid.isLeft()) { + std::cerr << "Invalid (or unwanted) query: " << query << std::endl << std::endl; + std::cerr << valid.left() << std::endl; return 1; } }