diff -Nru alpino-corpus-2.4.1/CMakeLists.txt alpino-corpus-2.6.0/CMakeLists.txt --- alpino-corpus-2.4.1/CMakeLists.txt 2017-01-21 09:17:16.000000000 +0000 +++ alpino-corpus-2.6.0/CMakeLists.txt 2017-01-21 09:17:16.000000000 +0000 @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 2.8.4) project(AlpinoCorpus) -set (ALPINOCORPUS_VERSION "2.4.0") +set (ALPINOCORPUS_VERSION "2.6.0") if (NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE) set (CMAKE_BUILD_TYPE Release) @@ -44,6 +44,7 @@ endif() option(USE_REMOTE_CORPUS "Enable remote corpus" OFF) + include(GNUInstallDirs REQUIRED) if(USE_REMOTE_CORPUS) @@ -149,6 +150,7 @@ src/StylesheetIter.hh src/XSLTransformer.hh src/parseMacros.hh + src/util/InfInt.h src/util/NameCompare.hh src/util/base64.hh src/util/bufutil.hh @@ -225,7 +227,7 @@ ) set_target_properties(alpino_corpus PROPERTIES VERSION ${ALPINOCORPUS_VERSION} - SOVERSION 2.4) + SOVERSION 2.6) target_link_libraries(alpino_corpus ${Boost_LIBRARIES}) target_link_libraries(alpino_corpus ${ZLIB_LIBRARIES}) diff -Nru alpino-corpus-2.4.1/debian/changelog alpino-corpus-2.6.0/debian/changelog --- alpino-corpus-2.4.1/debian/changelog 2017-01-21 09:17:16.000000000 +0000 +++ alpino-corpus-2.6.0/debian/changelog 2017-01-21 09:17:16.000000000 +0000 @@ -1,3 +1,9 @@ +alpino-corpus (2.6.0-1xenial2) xenial; urgency=medium + + * New upstream release. + + -- Daniël de Kok Sat, 21 Jan 2017 08:43:28 +0100 + alpino-corpus (2.4.1-1xenial1) xenial; urgency=medium * Update to alpinocorpus 2.4.1. 
diff -Nru alpino-corpus-2.4.1/debian/control alpino-corpus-2.6.0/debian/control --- alpino-corpus-2.4.1/debian/control 2017-01-21 09:17:16.000000000 +0000 +++ alpino-corpus-2.6.0/debian/control 2017-01-21 09:17:16.000000000 +0000 @@ -8,7 +8,7 @@ Vcs-Git: git://github.com/rug-compling/alpinocorpus.git Vcs-Browser: https://github.com/rug-compling/alpinocorpus -Package: libalpino-corpus2.4 +Package: libalpino-corpus2.6 Section: libs Suggests: libalpino-corpus-dev, alpino-corpus-util Architecture: any @@ -22,7 +22,7 @@ Section: libdevel Architecture: any Pre-Depends: ${misc:Pre-Depends} -Depends: libdbxml-dev (>= 2.5.16), zlib1g-dev, libalpino-corpus2.4 (= ${binary:Version}), ${misc:Depends} +Depends: libdbxml-dev (>= 2.5.16), zlib1g-dev, libalpino-corpus2.6 (= ${binary:Version}), ${misc:Depends} Description: Library for processing XML-based treebanks (development headers) AlpinoCorpus is a library for processing XML-based treebanks. It can read XML corpora in various formats and provides querying facilities. 
diff -Nru alpino-corpus-2.4.1/debian/libalpino-corpus2.4.dirs alpino-corpus-2.6.0/debian/libalpino-corpus2.4.dirs --- alpino-corpus-2.4.1/debian/libalpino-corpus2.4.dirs 2017-01-21 09:17:16.000000000 +0000 +++ alpino-corpus-2.6.0/debian/libalpino-corpus2.4.dirs 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -usr/lib diff -Nru alpino-corpus-2.4.1/debian/libalpino-corpus2.4.install alpino-corpus-2.6.0/debian/libalpino-corpus2.4.install --- alpino-corpus-2.4.1/debian/libalpino-corpus2.4.install 2017-01-21 09:17:16.000000000 +0000 +++ alpino-corpus-2.6.0/debian/libalpino-corpus2.4.install 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -usr/lib/x86_64-linux-gnu/libalpino_corpus.so.* diff -Nru alpino-corpus-2.4.1/debian/libalpino-corpus2.4.install.in alpino-corpus-2.6.0/debian/libalpino-corpus2.4.install.in --- alpino-corpus-2.4.1/debian/libalpino-corpus2.4.install.in 2017-01-21 09:17:16.000000000 +0000 +++ alpino-corpus-2.6.0/debian/libalpino-corpus2.4.install.in 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -usr/lib/${DEB_HOST_MULTIARCH}/libalpino_corpus.so.* diff -Nru alpino-corpus-2.4.1/debian/libalpino-corpus2.6.dirs alpino-corpus-2.6.0/debian/libalpino-corpus2.6.dirs --- alpino-corpus-2.4.1/debian/libalpino-corpus2.6.dirs 1970-01-01 00:00:00.000000000 +0000 +++ alpino-corpus-2.6.0/debian/libalpino-corpus2.6.dirs 2017-01-21 09:17:16.000000000 +0000 @@ -0,0 +1 @@ +usr/lib diff -Nru alpino-corpus-2.4.1/debian/libalpino-corpus2.6.install alpino-corpus-2.6.0/debian/libalpino-corpus2.6.install --- alpino-corpus-2.4.1/debian/libalpino-corpus2.6.install 1970-01-01 00:00:00.000000000 +0000 +++ alpino-corpus-2.6.0/debian/libalpino-corpus2.6.install 2017-01-21 09:17:16.000000000 +0000 @@ -0,0 +1 @@ +usr/lib/x86_64-linux-gnu/libalpino_corpus.so.* diff -Nru alpino-corpus-2.4.1/debian/libalpino-corpus2.6.install.in alpino-corpus-2.6.0/debian/libalpino-corpus2.6.install.in --- alpino-corpus-2.4.1/debian/libalpino-corpus2.6.install.in 1970-01-01 00:00:00.000000000 +0000 
+++ alpino-corpus-2.6.0/debian/libalpino-corpus2.6.install.in 2017-01-21 09:17:16.000000000 +0000 @@ -0,0 +1 @@ +usr/lib/${DEB_HOST_MULTIARCH}/libalpino_corpus.so.* diff -Nru alpino-corpus-2.4.1/debian/rules alpino-corpus-2.6.0/debian/rules --- alpino-corpus-2.4.1/debian/rules 2017-01-21 09:17:16.000000000 +0000 +++ alpino-corpus-2.6.0/debian/rules 2017-01-21 09:17:16.000000000 +0000 @@ -13,7 +13,7 @@ include /usr/share/cdbs/1/class/cmake.mk common-install-arch:: - for file in libalpino-corpus2.4.install libalpino-corpus-dev.install ; \ + for file in libalpino-corpus2.6.install libalpino-corpus-dev.install ; \ do \ sed -e"s,\$${DEB_HOST_MULTIARCH},${DEB_HOST_MULTIARCH},g" \ debian/$${file}.in > debian/$$file; \ diff -Nru alpino-corpus-2.4.1/include/AlpinoCorpus/capi.h alpino-corpus-2.6.0/include/AlpinoCorpus/capi.h --- alpino-corpus-2.4.1/include/AlpinoCorpus/capi.h 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/include/AlpinoCorpus/capi.h 2017-01-21 07:33:18.000000000 +0000 @@ -18,6 +18,11 @@ char const *value; } marker_query_t; +typedef enum { + natural_order, + numerical_order +} sort_order_t; + /** * Initialize the library. If you don't use the library anymore, call * alpinocorpus_cleanup(). @@ -60,7 +65,8 @@ * alpinocorpus_iter_destroy to free up resources associated with the * iterator. 
*/ -alpinocorpus_iter alpinocorpus_entry_iter(alpinocorpus_reader corpus); +alpinocorpus_iter alpinocorpus_entry_iter(alpinocorpus_reader corpus, + sort_order_t sort_order); /** @@ -70,7 +76,7 @@ */ alpinocorpus_iter alpinocorpus_query_stylesheet_iter(alpinocorpus_reader corpus, char const *query, char const *stylesheet, marker_query_t *queries, - size_t n_queries); + size_t n_queries, sort_order_t sort_order); /** * Get an iterator over the entries in a corpus, where the contents @@ -82,7 +88,8 @@ char const *stylesheet, char const *markerQuery, char const *markerAttr, - char const *markerValue); + char const *markerValue, + sort_order_t sort_order); /** * Get the contents of an entry. The content string is deallocated when @@ -106,7 +113,8 @@ * alpinocorpus_iter_destroy to free up resources associated with the * iterator. */ -alpinocorpus_iter alpinocorpus_query_iter(alpinocorpus_reader reader, char const *query); +alpinocorpus_iter alpinocorpus_query_iter(alpinocorpus_reader reader, + char const *query, sort_order_t sort_order); /** * Destroy an iterator. 
diff -Nru alpino-corpus-2.4.1/include/AlpinoCorpus/CompactCorpusReader.hh alpino-corpus-2.6.0/include/AlpinoCorpus/CompactCorpusReader.hh --- alpino-corpus-2.4.1/include/AlpinoCorpus/CompactCorpusReader.hh 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/include/AlpinoCorpus/CompactCorpusReader.hh 2017-01-21 07:33:18.000000000 +0000 @@ -23,7 +23,7 @@ virtual ~CompactCorpusReader(); private: - virtual EntryIterator getEntries() const; + virtual EntryIterator getEntries(SortOrder sortOrder) const; virtual std::string getName() const; virtual std::string readEntry(std::string const &filename) const; virtual size_t getSize() const; diff -Nru alpino-corpus-2.4.1/include/AlpinoCorpus/CorpusReader.hh alpino-corpus-2.6.0/include/AlpinoCorpus/CorpusReader.hh --- alpino-corpus-2.4.1/include/AlpinoCorpus/CorpusReader.hh 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/include/AlpinoCorpus/CorpusReader.hh 2017-01-21 07:33:18.000000000 +0000 @@ -18,6 +18,26 @@ namespace alpinocorpus { /** + * The sort order for iterators. + * + * Note that this is currently just a hint to the iterator implementation. + * Most iterators currently only support the order that is natural to the + * underlying corpus. + */ +enum SortOrder { + /** + * Sort using the natural order, typically the sequence in which the + * entries are stored in a treebank. + */ + NaturalOrder, + + /** + * Sort using numeric order. + */ + NumericalOrder +}; + +/** * Abstract base class for corpus readers. * * A corpus is conceptually a mapping of names to XML documents. @@ -28,7 +48,6 @@ public: /** Forward iterator over entry names */ class ALPINO_CORPUS_EXPORT EntryIterator - : public std::iterator { public: EntryIterator(); @@ -69,14 +88,15 @@ std::string name() const; /** Iterator over entry names. */ - EntryIterator entries() const; + EntryIterator entries(SortOrder order = NaturalOrder) const; /** * Iterator over entry names, contents are transformed with * the given stylesheet. 
*/ EntryIterator entriesWithStylesheet(std::string const &stylesheet, - std::list const &markerQueries = std::list()) const; + std::list const &markerQueries = std::list(), + SortOrder sortOrder = NaturalOrder) const; enum QueryDialect { XPATH, XQUERY }; @@ -84,15 +104,17 @@ Either isValidQuery(QueryDialect d, bool variables, std::string const &q) const; /** Execute query. The end of the range is given by end(). */ - EntryIterator query(QueryDialect d, std::string const &q) const; + EntryIterator query(QueryDialect d, std::string const &q, + SortOrder sortOrder = NaturalOrder) const; /** * Execute a query, applying the given stylesheet to each entry. The * end of the range is given by end(). */ EntryIterator queryWithStylesheet(QueryDialect d, std::string const &q, - std::string const &stylesheet, - std::list const &markerQueries) const; + std::string const &stylesheet, + std::list const &markerQueries, + SortOrder sortOrder = NaturalOrder) const; /** * Return content of a single treebank entry. 
Mark elements if a marker @@ -123,7 +145,7 @@ size_t size() const; private: - virtual EntryIterator getEntries() const = 0; + virtual EntryIterator getEntries(SortOrder sortOrder) const = 0; virtual std::string getName() const = 0; virtual std::vector getSentence(std::string const &entry, std::string const &query, std::string const &attribute, @@ -132,11 +154,11 @@ virtual std::string readEntry(std::string const &entry) const = 0; virtual std::string readEntryMarkQueries(std::string const &entry, std::list const &queries) const; - virtual EntryIterator runXPath(std::string const &) const; - virtual EntryIterator runXQuery(std::string const &) const; + virtual EntryIterator runXPath(std::string const &, SortOrder sortOrder) const; + virtual EntryIterator runXQuery(std::string const &, SortOrder sortOrder) const; virtual EntryIterator runQueryWithStylesheet(QueryDialect d, std::string const &q, std::string const &stylesheet, - std::list const &markerQueries) const; + std::list const &markerQueries, SortOrder sortOrder) const; virtual Either validQuery(QueryDialect d, bool variables, std::string const &q) const; // Initialized lazily in type(); diff -Nru alpino-corpus-2.4.1/include/AlpinoCorpus/DbCorpusReader.hh alpino-corpus-2.6.0/include/AlpinoCorpus/DbCorpusReader.hh --- alpino-corpus-2.4.1/include/AlpinoCorpus/DbCorpusReader.hh 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/include/AlpinoCorpus/DbCorpusReader.hh 2017-01-21 07:33:18.000000000 +0000 @@ -23,7 +23,7 @@ private: Either validQuery(QueryDialect d, bool variables, std::string const &query) const; - EntryIterator getEntries() const; + EntryIterator getEntries(SortOrder sortOrder) const; std::string getName() const; std::string readEntry(std::string const &) const; EntryIterator runXPath(std::string const &) const; diff -Nru alpino-corpus-2.4.1/include/AlpinoCorpus/DirectoryCorpusReader.hh alpino-corpus-2.6.0/include/AlpinoCorpus/DirectoryCorpusReader.hh --- 
alpino-corpus-2.4.1/include/AlpinoCorpus/DirectoryCorpusReader.hh 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/include/AlpinoCorpus/DirectoryCorpusReader.hh 2017-01-21 07:33:18.000000000 +0000 @@ -26,7 +26,7 @@ ~DirectoryCorpusReader(); private: - virtual EntryIterator getEntries() const; + virtual EntryIterator getEntries(SortOrder sortOrder) const; virtual std::string getName() const; virtual std::string readEntry(std::string const &entry) const; virtual size_t getSize() const; diff -Nru alpino-corpus-2.4.1/include/AlpinoCorpus/MultiCorpusReader.hh alpino-corpus-2.6.0/include/AlpinoCorpus/MultiCorpusReader.hh --- alpino-corpus-2.4.1/include/AlpinoCorpus/MultiCorpusReader.hh 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/include/AlpinoCorpus/MultiCorpusReader.hh 2017-01-21 07:33:18.000000000 +0000 @@ -17,7 +17,7 @@ void push_back(std::string const &name, std::string const &filename, bool recursive); private: - EntryIterator getEntries() const; + EntryIterator getEntries(SortOrder sortOrder) const; std::string getName() const; size_t getSize() const; std::string readEntry(std::string const &) const; diff -Nru alpino-corpus-2.4.1/include/AlpinoCorpus/RecursiveCorpusReader.hh alpino-corpus-2.6.0/include/AlpinoCorpus/RecursiveCorpusReader.hh --- alpino-corpus-2.4.1/include/AlpinoCorpus/RecursiveCorpusReader.hh 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/include/AlpinoCorpus/RecursiveCorpusReader.hh 2017-01-21 07:33:18.000000000 +0000 @@ -24,7 +24,7 @@ RecursiveCorpusReader(std::string const &directory, bool dactOnly = true); virtual ~RecursiveCorpusReader(); private: - EntryIterator getEntries() const; + EntryIterator getEntries(SortOrder sortOrder) const; std::string getName() const; size_t getSize() const; std::string readEntry(std::string const &) const; diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-create.1 alpino-corpus-2.6.0/man/alpinocorpus-create.1 --- alpino-corpus-2.4.1/man/alpinocorpus-create.1 2016-05-04 
14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-create.1 2017-01-21 07:33:18.000000000 +0000 @@ -1,4 +1,7 @@ +.\" Automatically generated by Pandoc 1.19.1 +.\" .TH "ALPINOCORPUS\-CREATE" "1" "Nov 19, 2012" "" "" +.hy .SH NAME .PP \f[B]alpinocorpus\-create\f[] \-\- Create Alpino treebanks @@ -32,6 +35,11 @@ .RS .RE .TP +.B \f[C]\-n\f[] +Sort entries numerically, if supported by the underlying reader. +.RS +.RE +.TP .B \f[C]\-q\f[] \f[I]QUERY\f[] Only include entries that match \f[I]QUERY\f[] (XPath 2.0). .RS @@ -44,7 +52,7 @@ .RE .SH SEE ALSO .PP -alpinocorpus\-get(1), alpinocorpus\-stats(1), alpinocorpus\-xpath(1), -alpinocorpus\-xquery(1), alpinocorpus\-xslt(1) +alpinocorpus\-create(1), alpinocorpus\-extract(1), +alpinocorpus\-stats(1), alpinocorpus\-xquery(1), alpinocorpus\-xslt(1) .SH AUTHORS Daniel de Kok. diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-create.1.md alpino-corpus-2.6.0/man/alpinocorpus-create.1.md --- alpino-corpus-2.4.1/man/alpinocorpus-create.1.md 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-create.1.md 2017-01-21 07:33:18.000000000 +0000 @@ -22,19 +22,32 @@ The following options are available: `-c` *FILENAME* + : Create a compact corpus. + `-d` *FILENAME* + : Create a Dact corpus. + `-m` *MACROFILE* + : Load macros from *MACROFILE*. + +`-n` + +: Sort entries numerically, if supported by the underlying reader. + `-q` *QUERY* + : Only include entries that match *QUERY* (XPath 2.0). + `-r` + : If *treebank* is a directory, include the contents of any Dact corpus below it, rather than including XML files. 
SEE ALSO ======== -alpinocorpus-get(1), alpinocorpus-stats(1), alpinocorpus-xpath(1), +alpinocorpus-create(1), alpinocorpus-extract(1), alpinocorpus-stats(1), alpinocorpus-xquery(1), alpinocorpus-xslt(1) diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-extract.1 alpino-corpus-2.6.0/man/alpinocorpus-extract.1 --- alpino-corpus-2.4.1/man/alpinocorpus-extract.1 1970-01-01 00:00:00.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-extract.1 2017-01-21 07:33:18.000000000 +0000 @@ -0,0 +1,23 @@ +.\" Automatically generated by Pandoc 1.17.2 +.\" +.TH "ALPINOCORPUS\-EXTRACT" "1" "Jul 6, 2016" "" "" +.hy +.SH NAME +.PP +\f[B]alpinocorpus\-extract\f[] \-\- Extract Alpino treebank entry to a +directory +.SH SYNOPSIS +.PP +\f[B]alpinocorpus\-extract\f[] \f[I]treebank\f[] \f[I]directory\f[] +.SH DESCRIPTION +.PP +The \f[B]alpinocorpus\-extract\f[] utility extracts a \f[I]treebank\f[] +to a \f[I]directory\f[]. +The treebank can either be a Dact treebank or a compact corpus +(\f[C]\&.index\f[] or \f[C]\&.data.dz\f[] file). +.SH SEE ALSO +.PP +alpinocorpus\-create(1), alpinocorpus\-get(1), alpinocorpus\-stats(1), +alpinocorpus\-xpath(1), alpinocorpus\-xquery(1), alpinocorpus\-xslt(1) +.SH AUTHORS +Daniel de Kok. diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-extract.1.md alpino-corpus-2.6.0/man/alpinocorpus-extract.1.md --- alpino-corpus-2.4.1/man/alpinocorpus-extract.1.md 1970-01-01 00:00:00.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-extract.1.md 2017-01-21 07:33:18.000000000 +0000 @@ -0,0 +1,25 @@ +% ALPINOCORPUS-EXTRACT(1) +% Daniel de Kok +% Jul 6, 2016 +NAME +==== + +**alpinocorpus-extract** -- Extract Alpino treebank entry to a directory + +SYNOPSIS +======== + +**alpinocorpus-extract** *treebank* *directory* + +DESCRIPTION +=========== + +The **alpinocorpus-extract** utility extracts a *treebank* to a *directory*. +The treebank can either be a Dact treebank or a compact corpus (`.index` +or `.data.dz` file). 
+ +SEE ALSO +======== + +alpinocorpus-create(1), alpinocorpus-get(1), alpinocorpus-stats(1), +alpinocorpus-xpath(1), alpinocorpus-xquery(1), alpinocorpus-xslt(1) diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-get.1 alpino-corpus-2.6.0/man/alpinocorpus-get.1 --- alpino-corpus-2.4.1/man/alpinocorpus-get.1 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-get.1 2017-01-21 07:33:18.000000000 +0000 @@ -1,4 +1,7 @@ +.\" Automatically generated by Pandoc 1.17.2 +.\" .TH "ALPINOCORPUS\-GET" "1" "Oct 8, 2014" "" "" +.hy .SH NAME .PP \f[B]alpinocorpus\-get\f[] \-\- Print Alpino treebank entry to stdout @@ -16,26 +19,23 @@ .TP .B \f[C]\-a\f[] \f[I]ATTRIBUTE\f[] Mark nodes selected with \[aq]q\[aq] with \f[I]ATTRIBUTE\f[]. +\f[C]\-m\f[] \f[I]MACROFILE\f[] .RS .RE -.TP -.B \f[C]\-m\f[] \f[I]MACROFILE\f[] Load macros from \f[I]MACROFILE\f[]. +\f[C]\-q\f[] \f[I]QUERY\f[] .RS .RE -.TP -.B \f[C]\-q\f[] \f[I]QUERY\f[] Mark nodes in the output using \f[I]QUERY\f[] (XPath 2.0). +\f[C]\-v\f[] \f[I]VALUE\f[] .RS .RE -.TP -.B \f[C]\-v\f[] \f[I]VALUE\f[] Mark nodes selected with \[aq]q\[aq] with \f[I]VALUE\f[]. .RS .RE .SH SEE ALSO .PP -alpinocorpus\-create(1), alpinocorpus\-stats(1), -alpinocorpus\-xquery(1), alpinocorpus\-xslt(1) +alpinocorpus\-create(1), alpinocorpus\-extract(1), alpinocorpus\-stats(1), +alpinocorpus\-xpath(1), alpinocorpus\-xquery(1), alpinocorpus\-xslt(1) .SH AUTHORS Daniel de Kok. diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-get.1.md alpino-corpus-2.6.0/man/alpinocorpus-get.1.md --- alpino-corpus-2.4.1/man/alpinocorpus-get.1.md 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-get.1.md 2017-01-21 07:33:18.000000000 +0000 @@ -20,16 +20,23 @@ The following options are available: `-a` *ATTRIBUTE* + : Mark nodes selected with 'q' with *ATTRIBUTE*. + `-m` *MACROFILE* + : Load macros from *MACROFILE*. + `-q` *QUERY* + : Mark nodes in the output using *QUERY* (XPath 2.0). 
+ `-v` *VALUE* + : Mark nodes selected with 'q' with *VALUE*. SEE ALSO ======== -alpinocorpus-create(1), alpinocorpus-stats(1), alpinocorpus-xquery(1), -alpinocorpus-xslt(1) +alpinocorpus-create(1), alpinocorpus-extract(1), alpinocorpus-stats(1), +alpinocorpus-xpath(1), alpinocorpus-xquery(1), alpinocorpus-xslt(1) diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-stats.1 alpino-corpus-2.6.0/man/alpinocorpus-stats.1 --- alpino-corpus-2.4.1/man/alpinocorpus-stats.1 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-stats.1 2017-01-21 07:33:18.000000000 +0000 @@ -1,4 +1,7 @@ +.\" Automatically generated by Pandoc 1.17.2 +.\" .TH "ALPINOCORPUS\-STATS" "1" "Nov 19, 2012" "" "" +.hy .SH NAME .PP \f[B]alpinocorpus\-stats\f[] \-\- Alpino treebank node statistics @@ -18,10 +21,9 @@ .TP .B \f[C]\-m\f[] \f[I]MACROFILE\f[] Load macros from \f[I]MACROFILE\f[]. +\f[C]\-p\f[] .RS .RE -.TP -.B \f[C]\-p\f[] Print relative frequencies. .RS .RE @@ -39,7 +41,7 @@ \f[I]loop\f[] as its root. .SH SEE ALSO .PP -alpinocorpus\-create(1), alpinocorpus\-get(1), alpinocorpus\-xpath(1), -alpinocorpus\-xquery(1), alpinocorpus\-xslt(1) +alpinocorpus\-create(1), alpinocorpus\-extract(1), alpinocorpus\-get(1), +alpinocorpus\-xpath(1), alpinocorpus\-xquery(1), alpinocorpus\-xslt(1) .SH AUTHORS Daniel de Kok. diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-stats.1.md alpino-corpus-2.6.0/man/alpinocorpus-stats.1.md --- alpino-corpus-2.4.1/man/alpinocorpus-stats.1.md 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-stats.1.md 2017-01-21 07:33:18.000000000 +0000 @@ -22,8 +22,11 @@ The following options are available: `-m` *MACROFILE* + : Load macros from *MACROFILE*. + `-p` + : Print relative frequencies. 
EXAMPLE @@ -39,5 +42,5 @@ SEE ALSO ======== -alpinocorpus-create(1), alpinocorpus-get(1), alpinocorpus-xpath(1), -alpinocorpus-xquery(1), alpinocorpus-xslt(1) +alpinocorpus-create(1), alpinocorpus-extract(1), alpinocorpus-get(1), +alpinocorpus-xpath(1), alpinocorpus-xquery(1), alpinocorpus-xslt(1) diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-xpath.1 alpino-corpus-2.6.0/man/alpinocorpus-xpath.1 --- alpino-corpus-2.4.1/man/alpinocorpus-xpath.1 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-xpath.1 2017-01-21 07:33:18.000000000 +0000 @@ -1,4 +1,7 @@ +.\" Automatically generated by Pandoc 1.17.2 +.\" .TH "ALPINOCORPUS\-XPATH" "1" "Nov 19, 2012" "" "" +.hy .SH NAME .PP \f[B]alpinocorpus\-xpath\f[] \-\- Alpino treebank XPath search @@ -18,22 +21,20 @@ .TP .B \f[C]\-m\f[] \f[I]MACROFILE\f[] Load macros from \f[I]MACROFILE\f[]. +\f[C]\-q\f[] \f[I]QUERY\f[] .RS .RE -.TP -.B \f[C]\-q\f[] \f[I]QUERY\f[] Only show entries that match \f[I]QUERY\f[] (XPath 2.0). +\f[C]\-s\f[] .RS .RE -.TP -.B \f[C]\-s\f[] Print the sentence of each entry, fragments that match the query are colored. .RS .RE .SH SEE ALSO .PP -alpinocorpus\-create(1), alpinocorpus\-get(1), alpinocorpus\-stats(1), -alpinocorpus\-xquery(1), alpinocorpus\-xslt(1) +alpinocorpus\-create(1), alpinocorpus\-extract(1), alpinocorpus\-get(1), +alpinocorpus\-stats(1), alpinocorpus\-xquery(1), alpinocorpus\-xslt(1) .SH AUTHORS Daniel de Kok. diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-xpath.1.md alpino-corpus-2.6.0/man/alpinocorpus-xpath.1.md --- alpino-corpus-2.4.1/man/alpinocorpus-xpath.1.md 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-xpath.1.md 2017-01-21 07:33:18.000000000 +0000 @@ -22,15 +22,20 @@ The following options are available: `-m` *MACROFILE* + : Load macros from *MACROFILE*. + `-q` *QUERY* + : Only show entries that match *QUERY* (XPath 2.0). + `-s` + : Print the sentence of each entry, fragments that match the query are colored. 
SEE ALSO ======== -alpinocorpus-create(1), alpinocorpus-get(1), alpinocorpus-stats(1), -alpinocorpus-xquery(1), alpinocorpus-xslt(1) +alpinocorpus-create(1), alpinocorpus-extract(1), alpinocorpus-get(1), +alpinocorpus-stats(1), alpinocorpus-xquery(1), alpinocorpus-xslt(1) diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-xquery.1 alpino-corpus-2.6.0/man/alpinocorpus-xquery.1 --- alpino-corpus-2.4.1/man/alpinocorpus-xquery.1 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-xquery.1 2017-01-21 07:33:18.000000000 +0000 @@ -1,4 +1,7 @@ +.\" Automatically generated by Pandoc 1.17.2 +.\" .TH "ALPINOCORPUS\-XQUERY" "1" "Nov 19, 2012" "" "" +.hy .SH NAME .PP \f[B]alpinocorpus\-xquery\f[] \-\- Apply an XQuery program to an Alpino @@ -22,21 +25,19 @@ .TP .B \f[C]\-f\f[] \f[I]XQUERY_PROGRAM\f[] Read XQuery program from the file \f[I]XQUERY_PROGRAM\f[]. +\f[C]\-m\f[] \f[I]MACROFILE\f[] .RS .RE -.TP -.B \f[C]\-m\f[] \f[I]MACROFILE\f[] Load macros from \f[I]MACROFILE\f[]. +\f[C]\-q\f[] \f[I]QUERY\f[] .RS .RE -.TP -.B \f[C]\-q\f[] \f[I]QUERY\f[] Only show entries that match \f[I]QUERY\f[] (XPath 2.0). .RS .RE .SH SEE ALSO .PP -alpinocorpus\-create(1), alpinocorpus\-get(1), alpinocorpus\-stats(1), -alpinocorpus\-xpath(1), alpinocorpus\-xslt(1) +alpinocorpus\-create(1), alpinocorpus\-get(1), alpinocorpus\-extract(1), +alpinocorpus\-stats(1), alpinocorpus\-xpath(1), alpinocorpus\-xslt(1) .SH AUTHORS Daniel de Kok. diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-xquery.1.md alpino-corpus-2.6.0/man/alpinocorpus-xquery.1.md --- alpino-corpus-2.4.1/man/alpinocorpus-xquery.1.md 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-xquery.1.md 2017-01-21 07:33:18.000000000 +0000 @@ -25,14 +25,19 @@ The following options are available: `-f` *XQUERY_PROGRAM* + : Read XQuery program from the file *XQUERY_PROGRAM*. + `-m` *MACROFILE* + : Load macros from *MACROFILE*. + `-q` *QUERY* + : Only show entries that match *QUERY* (XPath 2.0). 
SEE ALSO ======== -alpinocorpus-create(1), alpinocorpus-get(1), alpinocorpus-stats(1), -alpinocorpus-xpath(1), alpinocorpus-xslt(1) +alpinocorpus-create(1), alpinocorpus-get(1), alpinocorpus-extract(1), +alpinocorpus-stats(1), alpinocorpus-xpath(1), alpinocorpus-xslt(1) diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-xslt.1 alpino-corpus-2.6.0/man/alpinocorpus-xslt.1 --- alpino-corpus-2.4.1/man/alpinocorpus-xslt.1 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-xslt.1 2017-01-21 07:33:18.000000000 +0000 @@ -1,4 +1,7 @@ +.\" Automatically generated by Pandoc 1.17.2 +.\" .TH "ALPINOCORPUS\-XSLT" "1" "Nov 19, 2012" "" "" +.hy .SH NAME .PP \f[B]alpinocorpus\-xslt\f[] \-\- Apply stylesheet to an Alpino treebank @@ -19,15 +22,13 @@ .B \f[C]\-g\f[] \f[I]ENTRY\f[] Apply the stylesheet to \f[I]ENTRY\f[], rather than each entry in the treebank. +\f[C]\-m\f[] \f[I]MACROFILE\f[] .RS .RE -.TP -.B \f[C]\-m\f[] \f[I]MACROFILE\f[] Load macros from \f[I]MACROFILE\f[]. +\f[C]\-q\f[] \f[I]QUERY\f[] .RS .RE -.TP -.B \f[C]\-q\f[] \f[I]QUERY\f[] Filter the treebank using \f[I]QUERY\f[] (XPath 2.0). Nodes in the XML data that match \f[I]QUERY\f[] get the attribute\-value pair \f[I]active=1\f[]. @@ -35,7 +36,7 @@ .RE .SH SEE ALSO .PP -alpinocorpus\-create(1), alpinocorpus\-get(1), alpinocorpus\-stats(1), -alpinocorpus\-xpath(1), alpinocorpus\-xquery(1) +alpinocorpus\-create(1), alpinocorpus\-extract(1), alpinocorpus\-get(1), +alpinocorpus\-stats(1), alpinocorpus\-xpath(1), alpinocorpus\-xquery(1) .SH AUTHORS Daniel de Kok. diff -Nru alpino-corpus-2.4.1/man/alpinocorpus-xslt.1.md alpino-corpus-2.6.0/man/alpinocorpus-xslt.1.md --- alpino-corpus-2.4.1/man/alpinocorpus-xslt.1.md 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/man/alpinocorpus-xslt.1.md 2017-01-21 07:33:18.000000000 +0000 @@ -22,15 +22,20 @@ The following options are available: `-g` *ENTRY* + : Apply the stylesheet to *ENTRY*, rather than each entry in the treebank. 
+ `-m` *MACROFILE* + : Load macros from *MACROFILE*. + `-q` *QUERY* + : Filter the treebank using *QUERY* (XPath 2.0). Nodes in the XML data that match *QUERY* get the attribute-value pair *active=1*. SEE ALSO ======== -alpinocorpus-create(1), alpinocorpus-get(1), alpinocorpus-stats(1), -alpinocorpus-xpath(1), alpinocorpus-xquery(1) +alpinocorpus-create(1), alpinocorpus-extract(1), alpinocorpus-get(1), +alpinocorpus-stats(1), alpinocorpus-xpath(1), alpinocorpus-xquery(1) diff -Nru alpino-corpus-2.4.1/man/Makefile alpino-corpus-2.6.0/man/Makefile --- alpino-corpus-2.4.1/man/Makefile 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/man/Makefile 2017-01-21 07:33:18.000000000 +0000 @@ -1,5 +1,20 @@ all: alpinocorpus-create.1 alpinocorpus-stats.1 alpinocorpus-xpath.1 \ - alpinocorpus-xquery.1 alpinocorpus-xslt.1 alpinocorpus-get.1 + alpinocorpus-xquery.1 alpinocorpus-xslt.1 alpinocorpus-get.1 \ + alpinocorpus-extract.1 + +clean: + rm -f *.html + +web: alpinocorpus-create.1.html \ + alpinocorpus-stats.1.html \ + alpinocorpus-xpath.1.html \ + alpinocorpus-xquery.1.html \ + alpinocorpus-xslt.1.html \ + alpinocorpus-get.1.html \ + alpinocorpus-extract.1.html %.1: %.1.md pandoc -s -w man -o $@ $< + +%.1.html: %.1.md + pandoc -s -c manpage.css -w html5 -o $@ $< diff -Nru alpino-corpus-2.4.1/man/manpage.css alpino-corpus-2.6.0/man/manpage.css --- alpino-corpus-2.4.1/man/manpage.css 1970-01-01 00:00:00.000000000 +0000 +++ alpino-corpus-2.6.0/man/manpage.css 2017-01-21 07:33:18.000000000 +0000 @@ -0,0 +1,7 @@ +body { + font-family:'Source Sans Pro', verdana, arial, helvetica, sans-serif;font-size:15px +} + +h1 { + color: #5C7A99; +} diff -Nru alpino-corpus-2.4.1/README.md alpino-corpus-2.6.0/README.md --- alpino-corpus-2.4.1/README.md 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/README.md 2017-01-21 07:33:18.000000000 +0000 @@ -54,7 +54,7 @@ - A C++ compiler. - Boost 1.47.0. -- Berkeley DB XML 2.5.16 or later. 
+- Berkeley DB XML 2.5.16 (later versions are not supported, see #131). - libxml2 - libxslt diff -Nru alpino-corpus-2.4.1/src/capi.cpp alpino-corpus-2.6.0/src/capi.cpp --- alpino-corpus-2.4.1/src/capi.cpp 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/capi.cpp 2017-01-21 07:33:18.000000000 +0000 @@ -9,6 +9,8 @@ #include #include +#include + #include #include #include @@ -16,6 +18,17 @@ #include +/* Map the C API sort_order_t onto the C++ alpinocorpus::SortOrder. */ alpinocorpus::SortOrder to_sort_order(sort_order_t sort_order) { + switch (sort_order) { + case natural_order: + return alpinocorpus::NaturalOrder; + case numerical_order: + return alpinocorpus::NumericalOrder; + default: + BOOST_ASSERT_MSG(false, "Unknown sort order"); /* a no-op when assertions are disabled, so still return a value instead of running off the end */ return alpinocorpus::NaturalOrder; + } +} + extern "C" { #include @@ -96,9 +109,11 @@ alpinocorpus::CorpusReader::XPATH, false, query).isRight()); } -alpinocorpus_iter alpinocorpus_entry_iter(alpinocorpus_reader corpus) +alpinocorpus_iter alpinocorpus_entry_iter(alpinocorpus_reader corpus, + sort_order_t sort_order) { - alpinocorpus_iter i = new alpinocorpus_iter_t(corpus->corpusReader->entries()); + alpinocorpus_iter i = new alpinocorpus_iter_t( + corpus->corpusReader->entries(to_sort_order(sort_order))); return i; } @@ -122,7 +137,7 @@ alpinocorpus_iter alpinocorpus_query_stylesheet_iter(alpinocorpus_reader corpus, char const *query, char const *stylesheet, marker_query_t *queries, - size_t n_queries) + size_t n_queries, sort_order_t sort_order) { std::list markerQueries; @@ -140,7 +155,7 @@ try { iter = corpus->corpusReader->queryWithStylesheet( alpinocorpus::CorpusReader::XPATH, query, stylesheet, - markerQueries); + markerQueries, to_sort_order(sort_order)); } catch (std::exception const &) { return NULL; } @@ -153,21 +168,25 @@ char const *stylesheet, char const *markerQuery, char const *markerAttr, - char const *markerValue) + char const *markerValue, + sort_order_t sort_order) { marker_query_t m [1]; m[0].query = markerQuery; m[0].attr = markerAttr; m[0].value = markerValue; - return 
alpinocorpus_query_stylesheet_iter(corpus, query, stylesheet, m, 1); + return alpinocorpus_query_stylesheet_iter(corpus, query, stylesheet, m, 1, + sort_order); } -alpinocorpus_iter alpinocorpus_query_iter(alpinocorpus_reader reader, char const *query) +alpinocorpus_iter alpinocorpus_query_iter(alpinocorpus_reader reader, + char const *query, sort_order_t sort_order) { alpinocorpus::CorpusReader::EntryIterator iter; try { - iter = reader->corpusReader->query(alpinocorpus::CorpusReader::XPATH, query); + iter = reader->corpusReader->query(alpinocorpus::CorpusReader::XPATH, + query, to_sort_order(sort_order)); } catch (std::exception const &) { return NULL; } diff -Nru alpino-corpus-2.4.1/src/CompactCorpusReader.cpp alpino-corpus-2.6.0/src/CompactCorpusReader.cpp --- alpino-corpus-2.4.1/src/CompactCorpusReader.cpp 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/CompactCorpusReader.cpp 2017-01-21 07:33:18.000000000 +0000 @@ -21,9 +21,9 @@ delete d_private; } -CorpusReader::EntryIterator CompactCorpusReader::getEntries() const +CorpusReader::EntryIterator CompactCorpusReader::getEntries(SortOrder sortOrder) const { - return d_private->getEntries(); + return d_private->getEntries(sortOrder); } std::string CompactCorpusReader::getName() const diff -Nru alpino-corpus-2.4.1/src/CompactCorpusReaderPrivate.cpp alpino-corpus-2.6.0/src/CompactCorpusReaderPrivate.cpp --- alpino-corpus-2.4.1/src/CompactCorpusReaderPrivate.cpp 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/CompactCorpusReaderPrivate.cpp 2017-01-21 07:33:18.000000000 +0000 @@ -68,7 +68,7 @@ d_name = canonical; } -CorpusReader::EntryIterator CompactCorpusReaderPrivate::getEntries() const +CorpusReader::EntryIterator CompactCorpusReaderPrivate::getEntries(SortOrder sortOrder) const { ItemVector::const_iterator begin(d_indices.begin()); return EntryIterator(new IndexIter(begin, d_indices.end())); diff -Nru alpino-corpus-2.4.1/src/CompactCorpusReaderPrivate.hh 
alpino-corpus-2.6.0/src/CompactCorpusReaderPrivate.hh --- alpino-corpus-2.4.1/src/CompactCorpusReaderPrivate.hh 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/CompactCorpusReaderPrivate.hh 2017-01-21 07:33:18.000000000 +0000 @@ -63,7 +63,7 @@ CompactCorpusReaderPrivate(std::string const &dataFilename, std::string const &indexFilename); virtual ~CompactCorpusReaderPrivate() {} - virtual EntryIterator getEntries() const; + virtual EntryIterator getEntries(SortOrder sortOrder) const; virtual std::string getName() const; virtual std::string readEntry(std::string const &filename) const; virtual size_t getSize() const; diff -Nru alpino-corpus-2.4.1/src/CorpusReader.cpp alpino-corpus-2.6.0/src/CorpusReader.cpp --- alpino-corpus-2.4.1/src/CorpusReader.cpp 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/CorpusReader.cpp 2017-01-21 07:33:18.000000000 +0000 @@ -188,16 +188,17 @@ } - CorpusReader::EntryIterator CorpusReader::entries() const + CorpusReader::EntryIterator CorpusReader::entries(SortOrder sortOrder) const { - return getEntries(); + return getEntries(sortOrder); } CorpusReader::EntryIterator CorpusReader::entriesWithStylesheet( std::string const &stylesheet, - std::list const &markerQueries) const + std::list const &markerQueries, + SortOrder sortOrder) const { - return EntryIterator(new StylesheetIter(getEntries(), + return EntryIterator(new StylesheetIter(getEntries(sortOrder), stylesheet, markerQueries)); } @@ -541,7 +542,7 @@ } CorpusReader::EntryIterator CorpusReader::query(QueryDialect d, - std::string const &q) const + std::string const &q, SortOrder sortOrder) const { if (d == XPATH) { @@ -549,7 +550,7 @@ boost::split_regex(queries, q, boost::regex("\\+\\|\\+")); assert(queries.size() > 0); - EntryIterator qIter = runXPath(queries[0]); + EntryIterator qIter = runXPath(queries[0], sortOrder); for (std::vector::const_iterator iter = queries.begin() + 1; iter != queries.end(); ++iter) qIter = EntryIterator(new FilterIter(*this, 
qIter, *iter)); @@ -557,7 +558,7 @@ return qIter; } else if (d == XQUERY) - return runXQuery(q); + return runXQuery(q, sortOrder); else throw NotImplemented("unknown query language"); } @@ -565,31 +566,36 @@ CorpusReader::EntryIterator CorpusReader::queryWithStylesheet( QueryDialect d, std::string const &query, std::string const &stylesheet, - std::list const &markerQueries) const + std::list const &markerQueries, + SortOrder sortOrder) const { - return runQueryWithStylesheet(d, query, stylesheet, markerQueries); + return runQueryWithStylesheet(d, query, stylesheet, markerQueries, + sortOrder); } CorpusReader::EntryIterator CorpusReader::runQueryWithStylesheet( QueryDialect d, std::string const &q, std::string const &stylesheet, - std::list const &markerQueries) const + std::list const &markerQueries, + SortOrder sortOrder) const { if (d == XQUERY) throw NotImplemented(typeid(*this).name(), "XQuery functionality"); - return EntryIterator(new StylesheetIter(query(XPATH, q), stylesheet, - markerQueries)); + return EntryIterator(new StylesheetIter(query(XPATH, q, sortOrder), + stylesheet, markerQueries)); } - CorpusReader::EntryIterator CorpusReader::runXPath(std::string const &query) const + CorpusReader::EntryIterator CorpusReader::runXPath(std::string const &query, + SortOrder sortOrder) const { //throw NotImplemented(typeid(*this).name(), "XQuery functionality"); - return EntryIterator(new FilterIter(*this, getEntries(), query)); + return EntryIterator(new FilterIter(*this, getEntries(sortOrder), query)); } - CorpusReader::EntryIterator CorpusReader::runXQuery(std::string const &) const + CorpusReader::EntryIterator CorpusReader::runXQuery(std::string const &, + SortOrder sortOrder) const { throw NotImplemented(typeid(*this).name(), "XQuery functionality"); } diff -Nru alpino-corpus-2.4.1/src/DbCorpusReader.cpp alpino-corpus-2.6.0/src/DbCorpusReader.cpp --- alpino-corpus-2.4.1/src/DbCorpusReader.cpp 2016-05-04 14:08:39.000000000 +0000 +++ 
alpino-corpus-2.6.0/src/DbCorpusReader.cpp 2017-01-21 07:33:18.000000000 +0000 @@ -27,9 +27,9 @@ delete d_private; } -CorpusReader::EntryIterator DbCorpusReader::getEntries() const +CorpusReader::EntryIterator DbCorpusReader::getEntries(SortOrder sortOrder) const { - return d_private->getEntries(); + return d_private->getEntries(sortOrder); } std::string DbCorpusReader::getName() const diff -Nru alpino-corpus-2.4.1/src/DbCorpusReaderPrivate.cpp alpino-corpus-2.6.0/src/DbCorpusReaderPrivate.cpp --- alpino-corpus-2.4.1/src/DbCorpusReaderPrivate.cpp 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/DbCorpusReaderPrivate.cpp 2017-01-21 07:33:18.000000000 +0000 @@ -134,7 +134,7 @@ { } -CorpusReader::EntryIterator DbCorpusReaderPrivate::getEntries() const +CorpusReader::EntryIterator DbCorpusReaderPrivate::getEntries(SortOrder sortOrder) const { return EntryIterator(new DbIter(container)); } diff -Nru alpino-corpus-2.4.1/src/DbCorpusReaderPrivate.hh alpino-corpus-2.6.0/src/DbCorpusReaderPrivate.hh --- alpino-corpus-2.4.1/src/DbCorpusReaderPrivate.hh 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/DbCorpusReaderPrivate.hh 2017-01-21 07:33:18.000000000 +0000 @@ -49,7 +49,7 @@ public: DbCorpusReaderPrivate(std::string const &); virtual ~DbCorpusReaderPrivate(); - EntryIterator getEntries() const; + EntryIterator getEntries(SortOrder sortOrder) const; std::string getName() const; size_t getSize() const { diff -Nru alpino-corpus-2.4.1/src/DirectoryCorpusReader.cpp alpino-corpus-2.6.0/src/DirectoryCorpusReader.cpp --- alpino-corpus-2.4.1/src/DirectoryCorpusReader.cpp 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/DirectoryCorpusReader.cpp 2017-01-21 07:33:18.000000000 +0000 @@ -17,9 +17,9 @@ delete d_private; } -CorpusReader::EntryIterator DirectoryCorpusReader::getEntries() const +CorpusReader::EntryIterator DirectoryCorpusReader::getEntries(SortOrder sortOrder) const { - return d_private->getEntries(); + return 
d_private->getEntries(sortOrder); } std::string DirectoryCorpusReader::getName() const diff -Nru alpino-corpus-2.4.1/src/DirectoryCorpusReaderPrivate.cpp alpino-corpus-2.6.0/src/DirectoryCorpusReaderPrivate.cpp --- alpino-corpus-2.4.1/src/DirectoryCorpusReaderPrivate.cpp 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/DirectoryCorpusReaderPrivate.cpp 2017-01-21 07:33:18.000000000 +0000 @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include @@ -13,10 +15,164 @@ #include #include "DirectoryCorpusReaderPrivate.hh" +#include "util/NameCompare.hh" #include "util/textfile.hh" namespace bf = boost::filesystem; +namespace { + class DirIter : public alpinocorpus::IterImpl + { + boost::filesystem::recursive_directory_iterator iter; + boost::filesystem::path d_directory; + + public: + DirIter(boost::filesystem::path const &path, + boost::filesystem::recursive_directory_iterator i); + alpinocorpus::IterImpl *copy() const; + bool hasNext(); + alpinocorpus::Entry next(alpinocorpus::CorpusReader const &rdr); + private: + bool isValid(); + }; + + DirIter::DirIter( + bf::path const &path, bf::recursive_directory_iterator i) : + d_directory(path), iter(i) + { + } + + alpinocorpus::IterImpl *DirIter::copy() const + { + // No pointer members + return new DirIter(*this); + } + + bool DirIter::isValid() + { + // End is a correct iterator state. + if (iter == bf::recursive_directory_iterator()) + return true; + + return iter->path().extension() == ".xml"; + } + + bool DirIter::hasNext() + { + // Position iterator at the next valid entry + while (!isValid()) { + ++iter; + } + + return iter != bf::recursive_directory_iterator(); + } + + alpinocorpus::Entry DirIter::next(alpinocorpus::CorpusReader const &rdr) + { + // We assume the iterator is valid, since hasNext() should be called + // before next(). 
+ + std::string entryPathStr = iter->path().string(); + entryPathStr.erase(0, d_directory.string().size()); + + if (entryPathStr[0] == '/') + entryPathStr.erase(0, 1); + + bf::path entryPath(entryPathStr); + + // Move the iterator. + ++iter; + + alpinocorpus::Entry entry = {entryPath.string(), ""}; + + return entry; + } + + class SortedDirIter : public alpinocorpus::IterImpl + { + std::vector d_entries; + std::vector::const_iterator d_iter; + + public: + SortedDirIter(boost::filesystem::path const &path, + boost::filesystem::recursive_directory_iterator i); + alpinocorpus::IterImpl *copy() const; + bool hasNext(); + alpinocorpus::Entry next(alpinocorpus::CorpusReader const &rdr); + private: + SortedDirIter(); + SortedDirIter &operator=(SortedDirIter &other); + bool isValid(); + }; + + SortedDirIter::SortedDirIter( + bf::path const &path, bf::recursive_directory_iterator i) + { + for (; i != bf::recursive_directory_iterator(); i++) + { + std::string entryPathStr = i->path().string(); + entryPathStr.erase(0, path.string().size()); + + if (entryPathStr[0] == '/') + entryPathStr.erase(0, 1); + + d_entries.push_back(entryPathStr); + } + + std::sort(d_entries.begin(), d_entries.end(), + alpinocorpus::NameCompare()); + + d_iter = d_entries.begin(); + } + + SortedDirIter::SortedDirIter() { + } + + alpinocorpus::IterImpl *SortedDirIter::copy() const + { + SortedDirIter *dirIter = new SortedDirIter; + dirIter->d_entries = d_entries; + dirIter->d_iter = dirIter->d_entries.begin() + + std::distance(d_entries.begin(), d_iter); + + return dirIter; + } + + bool SortedDirIter::isValid() + { + // End is a correct iterator state. 
+ if (d_iter == d_entries.end()) + return true; + + return boost::algorithm::ends_with(*d_iter, ".xml"); + } + + bool SortedDirIter::hasNext() + { + // Position iterator at the next valid entry + while (!isValid()) { + ++d_iter; + } + + return d_iter != d_entries.end(); + } + + alpinocorpus::Entry SortedDirIter::next(alpinocorpus::CorpusReader const &rdr) + { + // We assume the iterator is valid, since hasNext() should be called before next(). + + bf::path entryPath(*d_iter); + + // Move the iterator. + ++d_iter; + + alpinocorpus::Entry entry = {entryPath.string(), ""}; + + return entry; + } + +} + namespace alpinocorpus { DirectoryCorpusReaderPrivate::DirectoryCorpusReaderPrivate( @@ -36,10 +192,18 @@ DirectoryCorpusReaderPrivate::~DirectoryCorpusReaderPrivate() {} -CorpusReader::EntryIterator DirectoryCorpusReaderPrivate::getEntries() const +CorpusReader::EntryIterator DirectoryCorpusReaderPrivate::getEntries(SortOrder sortOrder) const { - return EntryIterator(new DirIter(d_directory, - bf::recursive_directory_iterator(d_directory, bf::symlink_option::recurse))); + switch (sortOrder) { + case NaturalOrder: + return EntryIterator(new DirIter(d_directory, + bf::recursive_directory_iterator(d_directory, bf::symlink_option::recurse))); + case NumericalOrder: + return EntryIterator(new SortedDirIter(d_directory, + bf::recursive_directory_iterator(d_directory, bf::symlink_option::recurse))); + default: + BOOST_ASSERT_MSG(false, "Unexpected sort order."); + } } std::string DirectoryCorpusReaderPrivate::getName() const @@ -53,7 +217,7 @@ { size_t nEntries = 0; - EntryIterator i = getEntries(); + EntryIterator i = getEntries(NaturalOrder); while (i.hasNext()) { ++nEntries; @@ -66,61 +230,6 @@ return d_nEntries; } - -DirectoryCorpusReaderPrivate::DirIter::DirIter( - bf::path const &path, bf::recursive_directory_iterator i) : - d_directory(path), iter(i) -{ - //if (!isValid()) - // next(); -} - -IterImpl *DirectoryCorpusReaderPrivate::DirIter::copy() const -{ - // No pointer members -
return new DirIter(*this); -} - -bool DirectoryCorpusReaderPrivate::DirIter::isValid() -{ - // End is a correct iterator state. - if (iter == bf::recursive_directory_iterator()) - return true; - - return iter->path().extension() == ".xml"; -} - -bool DirectoryCorpusReaderPrivate::DirIter::hasNext() -{ - // Position iterator at the next valid entry - while (!isValid()) { - ++iter; - } - - return iter != bf::recursive_directory_iterator(); -} - -Entry DirectoryCorpusReaderPrivate::DirIter::next(CorpusReader const &rdr) -{ - // We assume the iterator is valid, since hasNext() should be called - // before next(). - - std::string entryPathStr = iter->path().string(); - entryPathStr.erase(0, d_directory.string().size()); - - if (entryPathStr[0] == '/') - entryPathStr.erase(0, 1); - - bf::path entryPath(entryPathStr); - - // Move the iterator. - ++iter; - - Entry entry = {entryPath.string(), ""}; - - return entry; -} - std::string DirectoryCorpusReaderPrivate::readEntry(std::string const &entry) const { bf::path p(d_directory); diff -Nru alpino-corpus-2.4.1/src/DirectoryCorpusReaderPrivate.hh alpino-corpus-2.6.0/src/DirectoryCorpusReaderPrivate.hh --- alpino-corpus-2.4.1/src/DirectoryCorpusReaderPrivate.hh 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/DirectoryCorpusReaderPrivate.hh 2017-01-21 07:33:18.000000000 +0000 @@ -19,21 +19,6 @@ { typedef std::vector StrVector; - class DirIter : public IterImpl - { - boost::filesystem::recursive_directory_iterator iter; - boost::filesystem::path d_directory; - - public: - DirIter(boost::filesystem::path const &path, - boost::filesystem::recursive_directory_iterator i); - IterImpl *copy() const; - bool hasNext(); - Entry next(CorpusReader const &rdr); - private: - bool isValid(); - }; - public: /** * Open directory dir for reading. 
@@ -41,7 +26,7 @@ DirectoryCorpusReaderPrivate(std::string const &directory); virtual ~DirectoryCorpusReaderPrivate(); - virtual EntryIterator getEntries() const; + virtual EntryIterator getEntries(SortOrder sortOrder) const; virtual std::string getName() const; virtual std::string readEntry(std::string const &entry) const; virtual size_t getSize() const; diff -Nru alpino-corpus-2.4.1/src/MultiCorpusReader.cpp alpino-corpus-2.6.0/src/MultiCorpusReader.cpp --- alpino-corpus-2.4.1/src/MultiCorpusReader.cpp 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/MultiCorpusReader.cpp 2017-01-21 07:33:18.000000000 +0000 @@ -17,9 +17,9 @@ delete d_private; } -CorpusReader::EntryIterator MultiCorpusReader::getEntries() const +CorpusReader::EntryIterator MultiCorpusReader::getEntries(SortOrder sortOrder) const { - return d_private->getEntries(); + return d_private->getEntries(sortOrder); } std::string MultiCorpusReader::getName() const diff -Nru alpino-corpus-2.4.1/src/MultiCorpusReaderPrivate.cpp alpino-corpus-2.6.0/src/MultiCorpusReaderPrivate.cpp --- alpino-corpus-2.4.1/src/MultiCorpusReaderPrivate.cpp 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/MultiCorpusReaderPrivate.cpp 2017-01-21 07:33:18.000000000 +0000 @@ -49,7 +49,7 @@ { } -CorpusReader::EntryIterator MultiCorpusReaderPrivate::getEntries() const +CorpusReader::EntryIterator MultiCorpusReaderPrivate::getEntries(SortOrder sortOrder) const { return EntryIterator(new MultiIter(d_corporaMap)); } diff -Nru alpino-corpus-2.4.1/src/MultiCorpusReaderPrivate.hh alpino-corpus-2.6.0/src/MultiCorpusReaderPrivate.hh --- alpino-corpus-2.4.1/src/MultiCorpusReaderPrivate.hh 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/MultiCorpusReaderPrivate.hh 2017-01-21 07:33:18.000000000 +0000 @@ -78,7 +78,7 @@ MultiCorpusReaderPrivate(); virtual ~MultiCorpusReaderPrivate(); - EntryIterator getEntries() const; + EntryIterator getEntries(SortOrder sortOrder) const; std::string getName() const; 
size_t getSize() const; void push_back(std::string const &name, std::string const &filename, diff -Nru alpino-corpus-2.4.1/src/RecursiveCorpusReader.cpp alpino-corpus-2.6.0/src/RecursiveCorpusReader.cpp --- alpino-corpus-2.4.1/src/RecursiveCorpusReader.cpp 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/RecursiveCorpusReader.cpp 2017-01-21 07:33:18.000000000 +0000 @@ -20,7 +20,7 @@ bool dactOnly); virtual ~RecursiveCorpusReaderPrivate(); - EntryIterator getEntries() const; + EntryIterator getEntries(SortOrder sortOrder) const; std::string getName() const; size_t getSize() const; std::string readEntry(std::string const &) const; @@ -47,9 +47,9 @@ delete d_private; } -CorpusReader::EntryIterator RecursiveCorpusReader::getEntries() const +CorpusReader::EntryIterator RecursiveCorpusReader::getEntries(SortOrder sortOrder) const { - return d_private->getEntries(); + return d_private->getEntries(sortOrder); } std::string RecursiveCorpusReader::getName() const @@ -125,9 +125,9 @@ { } -CorpusReader::EntryIterator RecursiveCorpusReaderPrivate::getEntries() const +CorpusReader::EntryIterator RecursiveCorpusReaderPrivate::getEntries(SortOrder sortOrder) const { - return d_multiReader->entries(); + return d_multiReader->entries(sortOrder); } std::string RecursiveCorpusReaderPrivate::getName() const diff -Nru alpino-corpus-2.4.1/src/util/InfInt.h alpino-corpus-2.6.0/src/util/InfInt.h --- alpino-corpus-2.4.1/src/util/InfInt.h 1970-01-01 00:00:00.000000000 +0000 +++ alpino-corpus-2.6.0/src/util/InfInt.h 2017-01-21 07:33:18.000000000 +0000 @@ -0,0 +1,1397 @@ +/* + * InfInt - Arbitrary-Precision Integer Arithmetic Library + * Copyright (C) 2013 Sercan Tutar + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * + * USAGE: + * It is pretty straight forward to use the library. 
Just create an instance of + * InfInt class and start using it. + * + * Useful methods: + * intSqrt: integer square root operation + * digitAt: returns digit at index + * numberOfDigits: returns number of digits + * size: returns size in bytes + * toString: converts it to a string + * + * There are also conversion methods which allow conversion to primitive types: + * toInt, toLong, toLongLong, toUnsignedInt, toUnsignedLong, toUnsignedLongLong. + * + * You may define INFINT_USE_EXCEPTIONS and library methods will start raising + * InfIntException in case of error instead of writing error messages using + * std::cerr. + * + * See ReadMe.txt for more info. + * + * + * No overflows, happy programmers! + * + */ + +#ifndef INFINT_H_ +#define INFINT_H_ + +#include +#include +#include +#include + +#include +#include + +#ifdef USE_PROFINY +#include "..\profiny\Profiny.h" +#else +#define PROFINY_SCOPE +#endif + +#ifdef _WIN32 +#define LONG_LONG_MIN LLONG_MIN +#define LONG_LONG_MAX LLONG_MAX +#define ULONG_LONG_MIN ULLONG_MIN +#define ULONG_LONG_MAX ULLONG_MAX +#endif + +//#define INFINT_USE_EXCEPTIONS +//#define INFINT_USE_SHORT_BASE + +#ifdef INFINT_USE_EXCEPTIONS +#include +#endif + +//inline bool check_pos(int n) +//{ +// return n >= 0; +//} +//inline bool check_neg(int n) +//{ +// return n <= 0; +//} + +#ifdef INFINT_USE_SHORT_BASE // uses 10^4 (short) as the base +typedef short ELEM_TYPE; +typedef int PRODUCT_TYPE; +static const ELEM_TYPE BASE = 10000; +static const ELEM_TYPE UPPER_BOUND = 9999; +static const ELEM_TYPE DIGIT_COUNT = 4; +static const int powersOfTen[] = { 1, 10, 100, 1000}; +#else // uses 10^9 (int) as the base +typedef int ELEM_TYPE; +typedef long long PRODUCT_TYPE; +static const ELEM_TYPE BASE = 1000000000; +static const ELEM_TYPE UPPER_BOUND = 999999999; +static const ELEM_TYPE DIGIT_COUNT = 9; +static const int powersOfTen[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000 }; +#endif + +#ifdef INFINT_USE_EXCEPTIONS +class 
InfIntException: public std::exception +{ +public: + InfIntException(const std::string& txt) throw (); + ~InfIntException() throw (); + const char* what() const throw (); +private: + std::string txt; +}; + +InfIntException::InfIntException(const std::string& txt) throw () : +std::exception(), txt(txt) +{ +} + +InfIntException::~InfIntException() throw () +{ +} + +const char* InfIntException::what() const throw () +{ + return txt.c_str(); +} +#endif + +class InfInt +{ + friend std::ostream& operator<<(std::ostream &s, const InfInt &n); + friend std::istream& operator>>(std::istream &s, InfInt &val); + +public: + /* constructors */ + InfInt(); + InfInt(const char* c); + InfInt(const std::string& s); + InfInt(int l); + InfInt(long l); + InfInt(long long l); + InfInt(unsigned int l); + InfInt(unsigned long l); + InfInt(unsigned long long l); + InfInt(const InfInt& l); + + /* assignment operators */ + const InfInt& operator=(const char* c); + const InfInt& operator=(const std::string& s); + const InfInt& operator=(int l); + const InfInt& operator=(long l); + const InfInt& operator=(long long l); + const InfInt& operator=(unsigned int l); + const InfInt& operator=(unsigned long l); + const InfInt& operator=(unsigned long long l); + const InfInt& operator=(const InfInt& l); + + /* unary increment/decrement operators */ + const InfInt& operator++(); + const InfInt& operator--(); + InfInt operator++(int); + InfInt operator--(int); + + /* operational assignments */ + const InfInt& operator+=(const InfInt& rhs); + const InfInt& operator-=(const InfInt& rhs); + const InfInt& operator*=(const InfInt& rhs); + const InfInt& operator/=(const InfInt& rhs); // throw + const InfInt& operator%=(const InfInt& rhs); // throw + const InfInt& operator*=(ELEM_TYPE rhs); + + /* operations */ + InfInt operator-() const; + InfInt operator+(const InfInt& rhs) const; + InfInt operator-(const InfInt& rhs) const; + InfInt operator*(const InfInt& rhs) const; + InfInt operator/(const InfInt& rhs) 
const; // throw + InfInt operator%(const InfInt& rhs) const; // throw + InfInt operator*(ELEM_TYPE rhs) const; + + /* relational operations */ + bool operator==(const InfInt& rhs) const; + bool operator!=(const InfInt& rhs) const; + bool operator<(const InfInt& rhs) const; + bool operator<=(const InfInt& rhs) const; + bool operator>(const InfInt& rhs) const; + bool operator>=(const InfInt& rhs) const; + + /* integer square root */ + InfInt intSqrt() const; // throw + + /* digit operations */ + char digitAt(size_t i) const; // throw + size_t numberOfDigits() const; + + /* size in bytes */ + size_t size() const; + + /* string conversion */ + std::string toString() const; + + /* conversion to primitive types */ + int toInt() const; // throw + long toLong() const; // throw + long long toLongLong() const; // throw + unsigned int toUnsignedInt() const; // throw + unsigned long toUnsignedLong() const; // throw + unsigned long long toUnsignedLongLong() const; // throw + +private: + static ELEM_TYPE dInR(const InfInt& R, const InfInt& D); + static void multiplyByDigit(ELEM_TYPE factor, std::vector& val); + + void correct(bool justCheckLeadingZeros = false, bool hasValidSign = false); + void fromString(const std::string& s); + void optimizeSqrtSearchBounds(InfInt& lo, InfInt& hi) const; + void truncateToBase(); + bool equalizeSigns(); + void removeLeadingZeros(); + + std::vector val; // number in base BASE, least-significant element first + bool pos; // true if number is non-negative (zero is stored with pos == true) +}; + +inline InfInt::InfInt() : pos(true) +{ + PROFINY_SCOPE + val.push_back((ELEM_TYPE) 0); +} + +inline InfInt::InfInt(const char* c) +{ + PROFINY_SCOPE + fromString(c); +} + +inline InfInt::InfInt(const std::string& s) +{ + PROFINY_SCOPE + fromString(s); +} + +inline InfInt::InfInt(int l) : pos(l >= 0) +{ + PROFINY_SCOPE + bool subtractOne = false; + if (l == INT_MIN) + { + subtractOne = true; + ++l; + } + + if (!pos) + { + l = -l; + } + do + { + div_t dt = div(l, BASE); + val.push_back((ELEM_TYPE) dt.rem); + l = dt.quot; +
} while (l > 0); + + if (subtractOne) + { + --*this; + } +} + +inline InfInt::InfInt(long l) : pos(l >= 0) +{ + PROFINY_SCOPE + bool subtractOne = false; + if (l == LONG_MIN) + { + subtractOne = true; + ++l; + } + + if (!pos) + { + l = -l; + } + do + { + ldiv_t dt = ldiv(l, BASE); + val.push_back((ELEM_TYPE) dt.rem); + l = dt.quot; + } while (l > 0); + + if (subtractOne) + { + --*this; + } +} + +inline InfInt::InfInt(long long l) : pos(l >= 0) +{ + PROFINY_SCOPE + bool subtractOne = false; + if (l == LONG_LONG_MIN) + { + subtractOne = true; + ++l; + } + + if (!pos) + { + l = -l; + } + do + { +#ifndef _WIN32 + lldiv_t dt = lldiv(l, BASE); + val.push_back((ELEM_TYPE) dt.rem); + l = dt.quot; +#else + val.push_back((ELEM_TYPE) (l % BASE)); + l = l / BASE; +#endif + } while (l > 0); + + if (subtractOne) + { + --*this; + } +} + +inline InfInt::InfInt(unsigned int l) : pos(true) +{ + PROFINY_SCOPE + do + { + val.push_back((ELEM_TYPE) (l % BASE)); + l = l / BASE; + } while (l > 0); +} + +inline InfInt::InfInt(unsigned long l) : pos(true) +{ + PROFINY_SCOPE + do + { + val.push_back((ELEM_TYPE) (l % BASE)); + l = l / BASE; + } while (l > 0); +} + +inline InfInt::InfInt(unsigned long long l) : pos(true) +{ + PROFINY_SCOPE + do + { + val.push_back((ELEM_TYPE) (l % BASE)); + l = l / BASE; + } while (l > 0); +} + +inline InfInt::InfInt(const InfInt& l) : pos(l.pos), val(l.val) +{ + PROFINY_SCOPE +} + +inline const InfInt& InfInt::operator=(const char* c) +{ + PROFINY_SCOPE + fromString(c); + return *this; +} + +inline const InfInt& InfInt::operator=(const std::string& s) +{ + PROFINY_SCOPE + fromString(s); + return *this; +} + +inline const InfInt& InfInt::operator=(int l) +{ + PROFINY_SCOPE + bool subtractOne = false; + if (l == INT_MIN) + { + subtractOne = true; + ++l; + } + + pos = l >= 0; + val.clear(); + if (!pos) + { + l = -l; + } + do + { + div_t dt = div(l, BASE); + val.push_back((ELEM_TYPE) dt.rem); + l = dt.quot; + } while (l > 0); + + return subtractOne ? 
--*this : *this; +} + +inline const InfInt& InfInt::operator=(long l) +{ + PROFINY_SCOPE + bool subtractOne = false; + if (l == LONG_MIN) + { + subtractOne = true; + ++l; + } + + pos = l >= 0; + val.clear(); + if (!pos) + { + l = -l; + } + do + { + ldiv_t dt = ldiv(l, BASE); + val.push_back((ELEM_TYPE) dt.rem); + l = dt.quot; + } while (l > 0); + + return subtractOne ? --*this : *this; +} + +inline const InfInt& InfInt::operator=(long long l) +{ + PROFINY_SCOPE + bool subtractOne = false; + if (l == LONG_LONG_MIN) + { + subtractOne = true; + ++l; + } + + pos = l >= 0; + val.clear(); + if (!pos) + { + l = -l; + } + do + { +#ifndef _WIN32 + lldiv_t dt = lldiv(l, BASE); + val.push_back((ELEM_TYPE) dt.rem); + l = dt.quot; +#else + val.push_back((ELEM_TYPE) (l % BASE)); + l = l / BASE; +#endif + } while (l > 0); + + return subtractOne ? --*this : *this; +} + +inline const InfInt& InfInt::operator=(unsigned int l) +{ + PROFINY_SCOPE + pos = true; + val.clear(); + do + { + val.push_back((ELEM_TYPE) (l % BASE)); + l = l / BASE; + } while (l > 0); + return *this; +} + +inline const InfInt& InfInt::operator=(unsigned long l) +{ + PROFINY_SCOPE + pos = true; + val.clear(); + do + { + val.push_back((ELEM_TYPE) (l % BASE)); + l = l / BASE; + } while (l > 0); + return *this; +} + +inline const InfInt& InfInt::operator=(unsigned long long l) +{ + PROFINY_SCOPE + pos = true; + val.clear(); + do + { + val.push_back((ELEM_TYPE) (l % BASE)); + l = l / BASE; + } while (l > 0); + return *this; +} + +const InfInt& InfInt::operator=(const InfInt& l) +{ + PROFINY_SCOPE + pos = l.pos; + val = l.val; + return *this; +} + +inline const InfInt& InfInt::operator++() +{ + PROFINY_SCOPE + val[0] += (pos ? 1 : -1); + this->correct(false, true); + return *this; +} + +inline const InfInt& InfInt::operator--() +{ + PROFINY_SCOPE + val[0] -= (pos ? 
1 : -1); + this->correct(false, true); + return *this; +} + +inline InfInt InfInt::operator++(int) +{ + PROFINY_SCOPE + InfInt result = *this; + val[0] += (pos ? 1 : -1); + this->correct(false, true); + return result; +} + +inline InfInt InfInt::operator--(int) +{ + PROFINY_SCOPE + InfInt result = *this; + val[0] -= (pos ? 1 : -1); + this->correct(false, true); + return result; +} + +inline const InfInt& InfInt::operator+=(const InfInt& rhs) +{ + PROFINY_SCOPE + if (rhs.val.size() > val.size()) + { + val.resize(rhs.val.size(), 0); + } + for (size_t i = 0; i < val.size(); ++i) + { + val[i] = (pos ? val[i] : -val[i]) + (i < rhs.val.size() ? (rhs.pos ? rhs.val[i] : -rhs.val[i]) : 0); + } + correct(); + return *this; +} + +inline const InfInt& InfInt::operator-=(const InfInt& rhs) +{ + PROFINY_SCOPE + if (rhs.val.size() > val.size()) + { + val.resize(rhs.val.size(), 0); + } + for (size_t i = 0; i < val.size(); ++i) + { + val[i] = (pos ? val[i] : -val[i]) - (i < rhs.val.size() ? (rhs.pos ? rhs.val[i] : -rhs.val[i]) : 0); + } + correct(); + return *this; +} + +inline const InfInt& InfInt::operator*=(const InfInt& rhs) +{ + PROFINY_SCOPE + // TODO: optimize (do not use operator*) + *this = *this * rhs; + return *this; +} + +inline const InfInt& InfInt::operator/=(const InfInt& rhs) +{ + PROFINY_SCOPE + if (rhs == 0) + { +#ifdef INFINT_USE_EXCEPTIONS + throw InfIntException("division by zero"); +#else + std::cerr << "Division by zero!" << std::endl; + return *this; +#endif + } + InfInt R, D = (rhs.pos ? rhs : -rhs), N = (pos ? *this : -*this); + bool oldpos = pos; + val.clear(); + val.resize(N.val.size(), 0); + for (int i = (int) N.val.size() - 1; i >= 0; --i) + { + R.val.insert(R.val.begin(), (ELEM_TYPE) 0); + R.val[0] = N.val[i]; + R.correct(true); + ELEM_TYPE cnt = dInR(R, D); + R -= D * cnt; + val[i] += cnt; + } + correct(); + pos = (val.size() == 1 && val[0] == 0) ? 
true : (oldpos == rhs.pos); + return *this; +} + +inline const InfInt& InfInt::operator%=(const InfInt& rhs) +{ + PROFINY_SCOPE + if (rhs == 0) + { +#ifdef INFINT_USE_EXCEPTIONS + throw InfIntException("division by zero"); +#else + std::cerr << "Division by zero!" << std::endl; + return *this; +#endif + } + InfInt D = (rhs.pos ? rhs : -rhs), N = (pos ? *this : -*this); + bool oldpos = pos; + val.clear(); + for (int i = (int) N.val.size() - 1; i >= 0; --i) + { + val.insert(val.begin(), (ELEM_TYPE) 0); + val[0] = N.val[i]; + correct(true); + *this -= D * dInR(*this, D); + } + correct(); + pos = (val.size() == 1 && val[0] == 0) ? true : oldpos; + return *this; +} + +inline const InfInt& InfInt::operator*=(ELEM_TYPE rhs) +{ + PROFINY_SCOPE + ELEM_TYPE factor = rhs < 0 ? -rhs : rhs; + bool oldpos = pos; + multiplyByDigit(factor, val); + correct(); + pos = (val.size() == 1 && val[0] == 0) ? true : (oldpos == (rhs >= 0)); + return *this; +} + +inline InfInt InfInt::operator-() const +{ + PROFINY_SCOPE + InfInt result = *this; + result.pos = !pos; + return result; +} + +inline InfInt InfInt::operator+(const InfInt& rhs) const +{ + PROFINY_SCOPE + InfInt result; + result.val.resize(val.size() > rhs.val.size() ? val.size() : rhs.val.size(), 0); + for (size_t i = 0; i < val.size() || i < rhs.val.size(); ++i) + { + result.val[i] = (i < val.size() ? (pos ? val[i] : -val[i]) : 0) + (i < rhs.val.size() ? (rhs.pos ? rhs.val[i] : -rhs.val[i]) : 0); + } + result.correct(); + return result; +} + +inline InfInt InfInt::operator-(const InfInt& rhs) const +{ + PROFINY_SCOPE + InfInt result; + result.val.resize(val.size() > rhs.val.size() ? val.size() : rhs.val.size(), 0); + for (size_t i = 0; i < val.size() || i < rhs.val.size(); ++i) + { + result.val[i] = (i < val.size() ? (pos ? val[i] : -val[i]) : 0) - (i < rhs.val.size() ? (rhs.pos ? 
rhs.val[i] : -rhs.val[i]) : 0); + } + result.correct(); + return result; +} + +inline InfInt InfInt::operator*(const InfInt& rhs) const +{ + PROFINY_SCOPE + InfInt result; + result.val.resize(val.size() + rhs.val.size(), 0); + PRODUCT_TYPE carry = 0; + size_t digit = 0; + for (;; ++digit) + { + PROFINY_SCOPE + //result.val[digit] = (ELEM_TYPE) (carry % BASE); + //carry /= BASE; + + PRODUCT_TYPE oldcarry = carry; + carry /= BASE; + result.val[digit] = (ELEM_TYPE) (oldcarry - carry * BASE); + + bool found = false; + for (size_t i = digit < rhs.val.size() ? 0 : digit - rhs.val.size() + 1; i < val.size() && i <= digit; ++i) + { + PROFINY_SCOPE + PRODUCT_TYPE pval = result.val[digit] + val[i] * (PRODUCT_TYPE) rhs.val[digit - i]; + if (pval >= BASE || pval <= -BASE) + { + PROFINY_SCOPE + //carry += pval / BASE; + //pval %= BASE; + + PRODUCT_TYPE quot = pval / BASE; + carry += quot; + pval -= quot * BASE; + } + result.val[digit] = (ELEM_TYPE) pval; + found = true; + } + if (!found) + { + PROFINY_SCOPE + break; + } + } + for (; carry > 0; ++digit) + { + PROFINY_SCOPE + result.val[digit] = (ELEM_TYPE) (carry % BASE); + carry /= BASE; + } + result.correct(); + result.pos = (result.val.size() == 1 && result.val[0] == 0) ? true : (pos == rhs.pos); + return result; +} + +inline InfInt InfInt::operator/(const InfInt& rhs) const +{ + PROFINY_SCOPE + if (rhs == 0) + { +#ifdef INFINT_USE_EXCEPTIONS + throw InfIntException("division by zero"); +#else + std::cerr << "Division by zero!" << std::endl; + return 0; +#endif + } + InfInt Q, R, D = (rhs.pos ? rhs : -rhs), N = (pos ? *this : -*this); + Q.val.resize(N.val.size(), 0); + for (int i = (int) N.val.size() - 1; i >= 0; --i) + { + PROFINY_SCOPE + R.val.insert(R.val.begin(), (ELEM_TYPE) 0); + R.val[0] = N.val[i]; + R.correct(true); + ELEM_TYPE cnt = dInR(R, D); + R -= D * cnt; + Q.val[i] += cnt; + } + Q.correct(); + Q.pos = (Q.val.size() == 1 && Q.val[0] == 0) ? 
true : (pos == rhs.pos); + return Q; +} + +inline InfInt InfInt::operator%(const InfInt& rhs) const +{ + PROFINY_SCOPE + if (rhs == 0) + { +#ifdef INFINT_USE_EXCEPTIONS + throw InfIntException("division by zero"); +#else + std::cerr << "Division by zero!" << std::endl; + return 0; +#endif + } + InfInt R, D = (rhs.pos ? rhs : -rhs), N = (pos ? *this : -*this); + for (int i = (int) N.val.size() - 1; i >= 0; --i) + { + R.val.insert(R.val.begin(), (ELEM_TYPE) 0); + R.val[0] = N.val[i]; + R.correct(true); + R -= D * dInR(R, D); + } + R.correct(); + R.pos = (R.val.size() == 1 && R.val[0] == 0) ? true : pos; + return R; +} + +inline InfInt InfInt::operator*(ELEM_TYPE rhs) const +{ + PROFINY_SCOPE + InfInt result = *this; + ELEM_TYPE factor = rhs < 0 ? -rhs : rhs; + multiplyByDigit(factor, result.val); + result.correct(); + result.pos = (result.val.size() == 1 && result.val[0] == 0) ? true : (pos == (rhs >= 0)); + return result; +} + +inline bool InfInt::operator==(const InfInt& rhs) const +{ + PROFINY_SCOPE + if (pos != rhs.pos || val.size() != rhs.val.size()) + { + return false; + } + for (int i = (int) val.size() - 1; i >= 0; --i) + { + if (val[i] != rhs.val[i]) + { + return false; + } + } + return true; +} + +inline bool InfInt::operator!=(const InfInt& rhs) const +{ + PROFINY_SCOPE + if (pos != rhs.pos || val.size() != rhs.val.size()) + { + return true; + } + for (int i = (int) val.size() - 1; i >= 0; --i) + { + if (val[i] != rhs.val[i]) + { + return true; + } + } + return false; +} + +inline bool InfInt::operator<(const InfInt& rhs) const +{ + PROFINY_SCOPE + if (pos && !rhs.pos) + { + return false; + } + if (!pos && rhs.pos) + { + return true; + } + if (val.size() > rhs.val.size()) + { + return pos ? false : true; + } + if (val.size() < rhs.val.size()) + { + return pos ? true : false; + } + for (int i = (int) val.size() - 1; i >= 0; --i) + { + if (val[i] < rhs.val[i]) + { + return pos ? true : false; + } + if (val[i] > rhs.val[i]) + { + return pos ? 
false : true; + } + } + return false; +} + +inline bool InfInt::operator<=(const InfInt& rhs) const +{ + PROFINY_SCOPE + if (pos && !rhs.pos) + { + return false; + } + if (!pos && rhs.pos) + { + return true; + } + if (val.size() > rhs.val.size()) + { + return pos ? false : true; + } + if (val.size() < rhs.val.size()) + { + return pos ? true : false; + } + for (int i = (int) val.size() - 1; i >= 0; --i) + { + if (val[i] < rhs.val[i]) + { + return pos ? true : false; + } + if (val[i] > rhs.val[i]) + { + return pos ? false : true; + } + } + return true; +} + +inline bool InfInt::operator>(const InfInt& rhs) const +{ + PROFINY_SCOPE + if (pos && !rhs.pos) + { + return true; + } + if (!pos && rhs.pos) + { + return false; + } + if (val.size() > rhs.val.size()) + { + return pos ? true : false; + } + if (val.size() < rhs.val.size()) + { + return pos ? false : true; + } + for (int i = (int) val.size() - 1; i >= 0; --i) + { + if (val[i] < rhs.val[i]) + { + return pos ? false : true; + } + if (val[i] > rhs.val[i]) + { + return pos ? true : false; + } + } + return false; +} + +inline bool InfInt::operator>=(const InfInt& rhs) const +{ + PROFINY_SCOPE + if (pos && !rhs.pos) + { + return true; + } + if (!pos && rhs.pos) + { + return false; + } + if (val.size() > rhs.val.size()) + { + return pos ? true : false; + } + if (val.size() < rhs.val.size()) + { + return pos ? false : true; + } + for (int i = (int) val.size() - 1; i >= 0; --i) + { + if (val[i] < rhs.val[i]) + { + return pos ? false : true; + } + if (val[i] > rhs.val[i]) + { + return pos ? 
true : false; + } + } + return true; +} + +inline void InfInt::optimizeSqrtSearchBounds(InfInt& lo, InfInt& hi) const +{ + PROFINY_SCOPE + InfInt hdn = 1; + for (int i = (int) this->numberOfDigits() / 2; i >= 2; --i) + { + hdn *= 10; + } + if (lo < hdn) + { + lo = hdn; + } + hdn *= 100; + if (hi > hdn) + { + hi = hdn; + } +} + +inline InfInt InfInt::intSqrt() const +{ + PROFINY_SCOPE + if (*this <= 0) + { +#ifdef INFINT_USE_EXCEPTIONS + throw InfIntException("intSqrt called for non-positive integer"); +#else + std::cerr << "intSqrt called for non-positive integer: " << *this << std::endl; + return 0; +#endif + } + InfInt hi = *this / 2 + 1, lo = 0, mid, mid2; + optimizeSqrtSearchBounds(lo, hi); + do + { + mid = (hi + lo) / 2; // 8 factor + mid2 = mid * mid; // 1 factor + if (mid2 == *this) + { + lo = mid; + break; + } + else if (mid2 < *this) + { + lo = mid; + } + else + { + hi = mid; + } + } while (lo < hi - 1 && mid2 != *this); + return lo; +} + +inline char InfInt::digitAt(size_t i) const +{ + PROFINY_SCOPE + if (numberOfDigits() <= i) + { +#ifdef INFINT_USE_EXCEPTIONS + throw InfIntException("invalid digit index"); +#else + std::cerr << "Invalid digit index: " << i << std::endl; + return -1; +#endif + } + return (val[i / DIGIT_COUNT] / powersOfTen[i % DIGIT_COUNT]) % 10; +} + +inline size_t InfInt::numberOfDigits() const +{ + PROFINY_SCOPE + return (val.size() - 1) * DIGIT_COUNT +#ifdef INFINT_USE_SHORT_BASE + + (val.back() > 999 ? 4 : (val.back() > 99 ? 3 : (val.back() > 9 ? 2 : 1))); +#else + + (val.back() > 99999999 ? 9 : (val.back() > 9999999 ? 8 : (val.back() > 999999 ? 7 : (val.back() > 99999 ? 6 : + (val.back() > 9999 ? 5 : (val.back() > 999 ? 4 : (val.back() > 99 ? 3 : (val.back() > 9 ? 
2 : 1)))))))); +#endif +} + +inline std::string InfInt::toString() const +{ + PROFINY_SCOPE + std::ostringstream oss; + oss << *this; + return oss.str(); +} + +inline size_t InfInt::size() const +{ + PROFINY_SCOPE + return val.size() * sizeof(ELEM_TYPE) + sizeof(bool); +} + +inline int InfInt::toInt() const +{ + PROFINY_SCOPE + if (*this > INT_MAX || *this < INT_MIN) +#ifdef INFINT_USE_EXCEPTIONS + throw InfIntException("out of bounds"); +#else + std::cerr << "Out of INT bounds: " << *this << std::endl; +#endif + int result = 0; + for (int i = (int) val.size() - 1; i >= 0; --i) + { + result = result * BASE + val[i]; + } + return pos ? result : -result; +} + +inline long InfInt::toLong() const +{ + PROFINY_SCOPE + if (*this > LONG_MAX || *this < LONG_MIN) +#ifdef INFINT_USE_EXCEPTIONS + throw InfIntException("out of bounds"); +#else + std::cerr << "Out of LONG bounds: " << *this << std::endl; +#endif + long result = 0; + for (int i = (int) val.size() - 1; i >= 0; --i) + { + result = result * BASE + val[i]; + } + return pos ? result : -result; +} + +inline long long InfInt::toLongLong() const +{ + PROFINY_SCOPE + if (*this > LONG_LONG_MAX || *this < LONG_LONG_MIN) +#ifdef INFINT_USE_EXCEPTIONS + throw InfIntException("out of bounds"); +#else + std::cerr << "Out of LLONG bounds: " << *this << std::endl; +#endif + long long result = 0; + for (int i = (int) val.size() - 1; i >= 0; --i) + { + result = result * BASE + val[i]; + } + return pos ? 
result : -result; +} + +inline unsigned int InfInt::toUnsignedInt() const +{ + PROFINY_SCOPE + if (!pos || *this > UINT_MAX) +#ifdef INFINT_USE_EXCEPTIONS + throw InfIntException("out of bounds"); +#else + std::cerr << "Out of UINT bounds: " << *this << std::endl; +#endif + unsigned int result = 0; + for (int i = (int) val.size() - 1; i >= 0; --i) + { + result = result * BASE + val[i]; + } + return result; +} + +inline unsigned long InfInt::toUnsignedLong() const +{ + PROFINY_SCOPE + if (!pos || *this > ULONG_MAX) +#ifdef INFINT_USE_EXCEPTIONS + throw InfIntException("out of bounds"); +#else + std::cerr << "Out of ULONG bounds: " << *this << std::endl; +#endif + unsigned long result = 0; + for (int i = (int) val.size() - 1; i >= 0; --i) + { + result = result * BASE + val[i]; + } + return result; +} + +inline unsigned long long InfInt::toUnsignedLongLong() const +{ + PROFINY_SCOPE + if (!pos || *this > ULONG_LONG_MAX) +#ifdef INFINT_USE_EXCEPTIONS + throw InfIntException("out of bounds"); +#else + std::cerr << "Out of ULLONG bounds: " << *this << std::endl; +#endif + unsigned long long result = 0; + for (int i = (int) val.size() - 1; i >= 0; --i) + { + result = result * BASE + val[i]; + } + return result; +} + +inline void InfInt::truncateToBase() +{ + PROFINY_SCOPE + for (size_t i = 0; i < val.size(); ++i) // truncate each + { + if (val[i] >= BASE || val[i] <= -BASE) + { + PROFINY_SCOPE + div_t dt = div(val[i], BASE); + val[i] = dt.rem; + if (i + 1 >= val.size()) + { + PROFINY_SCOPE + val.push_back(dt.quot); + } + else + { + PROFINY_SCOPE + val[i + 1] += dt.quot; + } + } + } +} + +inline bool InfInt::equalizeSigns() +{ + PROFINY_SCOPE + bool isPositive = true; + int i = (int) ((val.size())) - 1; + for (; i >= 0; --i) + { + if (val[i] != 0) + { + isPositive = val[i--] > 0; + break; + } + } + + if (isPositive) + { + for (; i >= 0; --i) + { + if (val[i] < 0) + { + int k = 0, index = i + 1; + for (; (size_t)(index) < val.size() && val[index] == 0; ++k, ++index) + ; // 
count adjacent zeros on left + //if ((size_t)(index) < val.size() && val[index] > 0) + { // number on the left is positive + val[index] -= 1; + val[i] += BASE; + for (; k > 0; --k) + { + val[i + k] = UPPER_BOUND; + } + } + } + } + } + else + { + for (; i >= 0; --i) + { + if (val[i] > 0) + { + int k = 0, index = i + 1; + for (; (size_t)(index) < val.size() && val[index] == 0; ++k, ++index) + ; // count adjacent zeros on right + //if ((size_t)(index) < val.size() && val[index] < 0) + { // number on the left is negative + val[index] += 1; + val[i] -= BASE; + for (; k > 0; --k) + { + val[i + k] = -UPPER_BOUND; + } + } + } + } + } + + return isPositive; +} + +inline void InfInt::removeLeadingZeros() +{ + PROFINY_SCOPE + for (int i = (int) (val.size()) - 1; i > 0; --i) // remove leading 0's + { + if (val[i] != 0) + { + return; + } + else + { + val.erase(val.begin() + i); + } + } +} + +inline void InfInt::correct(bool justCheckLeadingZeros, bool hasValidSign) +{ + PROFINY_SCOPE + if (!justCheckLeadingZeros) + { + truncateToBase(); + + if (equalizeSigns()) + { + pos = ((val.size() == 1 && val[0] == 0) || !hasValidSign) ? true : pos; + } + else + { + pos = hasValidSign ? 
!pos : false; + for (size_t i = 0; i < val.size(); ++i) + { + val[i] = abs(val[i]); + } + } + } + + removeLeadingZeros(); +} + +inline void InfInt::fromString(const std::string& s) +{ + PROFINY_SCOPE + pos = true; + val.clear(); + // TODO use resize + val.reserve(s.size() / DIGIT_COUNT + 1); + int i = (int) s.size() - DIGIT_COUNT; + for (; i >= 0; i -= DIGIT_COUNT) + { + val.push_back(atoi(s.substr(i, DIGIT_COUNT).c_str())); + } + if (i > -DIGIT_COUNT) + { + std::string ss = s.substr(0, i + DIGIT_COUNT); + if (ss.size() == 1 && ss[0] == '-') + { + pos = false; + } + else + { + val.push_back(atoi(ss.c_str())); + } + } + if (val.back() < 0) + { + val.back() = -val.back(); + pos = false; + } + correct(true); +} + +inline ELEM_TYPE InfInt::dInR(const InfInt& R, const InfInt& D) +{ + PROFINY_SCOPE + ELEM_TYPE min = 0, max = UPPER_BOUND; + while (max - min > 0) + { + ELEM_TYPE avg = max + min; + div_t dt = div(avg, 2); + avg = dt.rem ? (dt.quot + 1) : dt.quot; + //ELEM_TYPE havg = avg / 2; + //avg = (avg - havg * 2) ? 
(havg + 1) : havg; + InfInt prod = D * avg; + if (R == prod) + { + return avg; + } + else if (R > prod) + { + min = avg; + } + else + { + max = avg - 1; + } + } + return min; +} + +inline void InfInt::multiplyByDigit(ELEM_TYPE factor, std::vector& val) +{ + PROFINY_SCOPE + ELEM_TYPE carry = 0; + for (size_t i = 0; i < val.size(); ++i) + { + PRODUCT_TYPE pval = val[i] * (PRODUCT_TYPE) factor + carry; + if (pval >= BASE || pval <= -BASE) + { + //carry = (ELEM_TYPE) (pval / BASE); + //pval %= BASE; + + carry = (ELEM_TYPE) (pval / BASE); + pval -= carry * BASE; + } + else + { + carry = 0; + } + val[i] = (ELEM_TYPE) pval; + } + if (carry > 0) + { + val.push_back(carry); + } +} + +/**************************************************************/ +/******************** NON-MEMBER OPERATORS ********************/ +/**************************************************************/ + +inline std::istream& operator>>(std::istream &s, InfInt &n) +{ + PROFINY_SCOPE + std::string str; + s >> str; + n.fromString(str); + return s; +} + +inline std::ostream& operator<<(std::ostream &s, const InfInt &n) +{ + PROFINY_SCOPE + if (!n.pos) + { + s << '-'; + } + bool first = true; + for (int i = (int) n.val.size() - 1; i >= 0; --i) + { + if (first) + { + s << n.val[i]; + first = false; + } + else + { + s << std::setfill('0') << std::setw(DIGIT_COUNT) << n.val[i]; + } + } + return s; +} + +#endif diff -Nru alpino-corpus-2.4.1/src/util/NameCompare.cpp alpino-corpus-2.6.0/src/util/NameCompare.cpp --- alpino-corpus-2.4.1/src/util/NameCompare.cpp 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/src/util/NameCompare.cpp 2017-01-21 07:33:18.000000000 +0000 @@ -5,6 +5,7 @@ #include +#include "InfInt.h" #include "NameCompare.hh" #include "parseString.hh" @@ -57,8 +58,8 @@ // Both digits? Sort on digits! 
if (std::isdigit(i1[i][0]) && std::isdigit(i2[i][0])) { - long long int d1 = util::parseString(i1[i]); - long long int d2 = util::parseString(i2[i]); + InfInt d1 = util::parseString(i1[i]); + InfInt d2 = util::parseString(i2[i]); if (d1 != d2) return d1 < d2; diff -Nru alpino-corpus-2.4.1/util/create/main.cpp alpino-corpus-2.6.0/util/create/main.cpp --- alpino-corpus-2.4.1/util/create/main.cpp 2016-05-04 14:08:39.000000000 +0000 +++ alpino-corpus-2.6.0/util/create/main.cpp 2017-01-21 07:33:18.000000000 +0000 @@ -4,7 +4,6 @@ #include #include - #include #include @@ -40,6 +39,9 @@ using alpinocorpus::Either; using alpinocorpus::Entry; using alpinocorpus::LexItem; +using alpinocorpus::NaturalOrder; +using alpinocorpus::NumericalOrder; +using alpinocorpus::SortOrder; #if defined(USE_DBXML) using alpinocorpus::DbCorpusWriter; @@ -57,19 +59,21 @@ " -d filename\tCreate a Dact dbxml archive" << std::endl << #endif " -m filename\tLoad macro file" << std::endl << + " -n\t\tUse numerical sorting (when available)" << std::endl << " -q query\tFilter the treebank using the given query" << std::endl << " -r\t\tProcess a directory of corpora recursively" << std::endl << std::endl; } void writeCorpus(boost::shared_ptr reader, boost::shared_ptr writer, - std::string const &query) + std::string const &query, + SortOrder sortOrder) { CorpusReader::EntryIterator i; if (query.empty()) - i = reader->entries(); + i = reader->entries(sortOrder); else - i = reader->query(CorpusReader::XPATH, query); + i = reader->query(CorpusReader::XPATH, query, sortOrder); // We need to be *really* sure when writing a corpus that an entry was not written // before. So, we'll use a set, rather than a basic filter. 
@@ -90,7 +94,7 @@ boost::scoped_ptr opts; try { opts.reset(new ProgramOptions(argc, const_cast(argv), - "c:d:m:q:r")); + "c:d:m:nq:r")); } catch (std::exception &e) { std::cerr << e.what() << std::endl; return 1; @@ -115,6 +119,11 @@ std::endl; return 1; } + + SortOrder sortOrder = NaturalOrder; + if (opts->option('n')) { + sortOrder = NumericalOrder; + } boost::shared_ptr reader; try { @@ -168,7 +177,7 @@ #if defined(USE_DBXML) boost::shared_ptr wr(new DbCorpusWriter(treebankOut, true)); - writeCorpus(reader, wr, query); + writeCorpus(reader, wr, query, sortOrder); #else throw std::runtime_error("AlpinoCorpus was compiled without DBXML support."); #endif // defined(USE_DBXML) @@ -194,7 +203,7 @@ throw std::runtime_error("Attempting to write to the source treebank."); boost::shared_ptr wr(new CompactCorpusWriter(treebankOut)); - writeCorpus(reader, wr, query); + writeCorpus(reader, wr, query, sortOrder); } catch (std::runtime_error const &e) { std::cerr << opts->programName() <<