diff -Nru ucto-0.9.6/aclocal.m4 ucto-0.14/aclocal.m4 --- ucto-0.9.6/aclocal.m4 2017-01-23 12:52:11.000000000 +0000 +++ ucto-0.14/aclocal.m4 2018-11-15 15:08:20.000000000 +0000 @@ -1,6 +1,6 @@ -# generated automatically by aclocal 1.15 -*- Autoconf -*- +# generated automatically by aclocal 1.15.1 -*- Autoconf -*- -# Copyright (C) 1996-2014 Free Software Foundation, Inc. +# Copyright (C) 1996-2017 Free Software Foundation, Inc. # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -20,7 +20,7 @@ If you have problems, you may need to regenerate the build system entirely. To do so, use the procedure documented by the package, typically 'autoreconf'.])]) -# Copyright (C) 2002-2014 Free Software Foundation, Inc. +# Copyright (C) 2002-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -35,7 +35,7 @@ [am__api_version='1.15' dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to dnl require some minimum version. Point them to the right macro. -m4_if([$1], [1.15], [], +m4_if([$1], [1.15.1], [], [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl ]) @@ -51,14 +51,14 @@ # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced. # This function is AC_REQUIREd by AM_INIT_AUTOMAKE. AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION], -[AM_AUTOMAKE_VERSION([1.15])dnl +[AM_AUTOMAKE_VERSION([1.15.1])dnl m4_ifndef([AC_AUTOCONF_VERSION], [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) # AM_AUX_DIR_EXPAND -*- Autoconf -*- -# Copyright (C) 2001-2014 Free Software Foundation, Inc. +# Copyright (C) 2001-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -110,7 +110,7 @@ # AM_CONDITIONAL -*- Autoconf -*- -# Copyright (C) 1997-2014 Free Software Foundation, Inc. +# Copyright (C) 1997-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -141,7 +141,7 @@ Usually this means the macro was only invoked conditionally.]]) fi])]) -# Copyright (C) 1999-2014 Free Software Foundation, Inc. +# Copyright (C) 1999-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -332,7 +332,7 @@ # Generate code to set up dependency tracking. -*- Autoconf -*- -# Copyright (C) 1999-2014 Free Software Foundation, Inc. +# Copyright (C) 1999-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -408,7 +408,7 @@ # Do all the work for Automake. -*- Autoconf -*- -# Copyright (C) 1996-2014 Free Software Foundation, Inc. +# Copyright (C) 1996-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -605,7 +605,7 @@ done echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count]) -# Copyright (C) 2001-2014 Free Software Foundation, Inc. +# Copyright (C) 2001-2017 Free Software Foundation, Inc. 
# # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -626,7 +626,7 @@ fi AC_SUBST([install_sh])]) -# Copyright (C) 2003-2014 Free Software Foundation, Inc. +# Copyright (C) 2003-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -647,7 +647,7 @@ # Check to see how 'make' treats includes. -*- Autoconf -*- -# Copyright (C) 2001-2014 Free Software Foundation, Inc. +# Copyright (C) 2001-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -697,7 +697,7 @@ # Fake the existence of programs that GNU maintainers use. -*- Autoconf -*- -# Copyright (C) 1997-2014 Free Software Foundation, Inc. +# Copyright (C) 1997-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -736,7 +736,7 @@ # Helper functions for option handling. -*- Autoconf -*- -# Copyright (C) 2001-2014 Free Software Foundation, Inc. +# Copyright (C) 2001-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -765,7 +765,7 @@ AC_DEFUN([_AM_IF_OPTION], [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])]) -# Copyright (C) 1999-2014 Free Software Foundation, Inc. +# Copyright (C) 1999-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -812,7 +812,7 @@ # For backward compatibility. AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])]) -# Copyright (C) 2001-2014 Free Software Foundation, Inc. +# Copyright (C) 2001-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -831,7 +831,7 @@ # Check to make sure that the build environment is sane. -*- Autoconf -*- -# Copyright (C) 1996-2014 Free Software Foundation, Inc. +# Copyright (C) 1996-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -912,7 +912,7 @@ rm -f conftest.file ]) -# Copyright (C) 2009-2014 Free Software Foundation, Inc. +# Copyright (C) 2009-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -972,7 +972,7 @@ _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl ]) -# Copyright (C) 2001-2014 Free Software Foundation, Inc. +# Copyright (C) 2001-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -1000,7 +1000,7 @@ INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" AC_SUBST([INSTALL_STRIP_PROGRAM])]) -# Copyright (C) 2006-2014 Free Software Foundation, Inc. +# Copyright (C) 2006-2017 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -1019,7 +1019,7 @@ # Check how to create a tarball. -*- Autoconf -*- -# Copyright (C) 2004-2014 Free Software Foundation, Inc. +# Copyright (C) 2004-2017 Free Software Foundation, Inc. 
# # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -1150,7 +1150,9 @@ AC_SUBST([am__untar]) ]) # _AM_PROG_TAR -m4_include([m4/ax_icu_check.m4]) +m4_include([m4/ac_check_old_textcat.m4]) +m4_include([m4/ac_osx_pkg.m4]) +m4_include([m4/ac_search_lm.m4]) m4_include([m4/ax_lib_readline.m4]) m4_include([m4/libtool.m4]) m4_include([m4/ltoptions.m4]) diff -Nru ucto-0.9.6/AUTHORS ucto-0.14/AUTHORS --- ucto-0.9.6/AUTHORS 2016-02-11 13:04:11.000000000 +0000 +++ ucto-0.14/AUTHORS 2018-11-13 16:30:47.000000000 +0000 @@ -1,3 +1,4 @@ Maarten van Gompel Ko van der Sloot Folgert Karsdorp +Iris Hendrickx diff -Nru ucto-0.9.6/bootstrap.sh ucto-0.14/bootstrap.sh --- ucto-0.9.6/bootstrap.sh 2017-01-12 13:39:38.000000000 +0000 +++ ucto-0.14/bootstrap.sh 2018-11-13 16:30:47.000000000 +0000 @@ -1,6 +1,3 @@ -# $Id$ -# $URL$ - # bootstrap - script to bootstrap the distribution rolling engine # usage: diff -Nru ucto-0.9.6/ChangeLog ucto-0.14/ChangeLog --- ucto-0.9.6/ChangeLog 2017-01-23 13:25:39.000000000 +0000 +++ ucto-0.14/ChangeLog 2018-11-29 13:42:53.000000000 +0000 @@ -1,3 +1,1151 @@ +2018-11-29 Ko van der Sloot + + * NEWS, codemeta.json: updated NEWS for release + +2018-11-29 Ko van der Sloot + + * .travis.yml: back to travis:trusty, until we understand what fails + with boost and clang. + +2018-11-28 Ko van der Sloot + + * .travis.yml, src/tokenize.cxx: small fix. Also build on + travis:xenial now + +2018-11-16 Maarten van Gompel + + * README.md: updated link to ucto documentation in README + +2018-11-16 Iris Hendrickx + + * docs/source/index.rst: minor spelling edits to manual + +2018-11-16 Iris Hendrickx + + * : commit 89cc9fcfc63f398bd192e74c6b35ee6262f1ba8d Author: Iris + Hendrickx Date: Fri Nov 16 10:31:56 2018 +0100 + +2018-11-16 Ko van der Sloot + + * : commit a41d7965a9200d54c2677728b62cae4d2c5a2f7a Author: Ko van + der Sloot Date: Fri Nov 16 09:21:33 2018 + +0100 + +2018-11-15 Ko van der Sloot + + * include/ucto/setting.h, include/ucto/tokenize.h, src/setting.cxx, + src/tokenize.cxx: cleaning up icu namespace mess a bit. + +2018-11-13 Ko van der Sloot + + * docs/source/index.rst: Update index.rst + +2018-11-13 Ko van der Sloot + + * docs/source/index.rst: Update index.rst + +2018-11-13 Ko van der Sloot + + * docs/source/index.rst: Update index.rst + +2018-11-13 Ko van der Sloot + + * src/ucto.cxx: updated usage(). Removed 'S' from allowed options. 
+ +2018-11-13 Ko van der Sloot + + * docs/ucto.1: add missing info to the man page + +2018-11-13 Ko van der Sloot + + * docs/ucto.1: updated man page + +2018-11-13 Ko van der Sloot + + * src/ucto.cxx: improved usage() layout + +2018-11-01 Ko van der Sloot + + * : commit 4cc63b838c2f33bf29dce0657e41eccf1bd739d3 Author: Ko van + der Sloot Date: Thu Nov 1 15:00:03 2018 + +0100 + +2018-11-01 Maarten van Gompel + + * docs/source/index.rst: [documentation] refer to new frog site + instead of ILK site + +2018-11-01 Maarten van Gompel + + * docs/source/index.rst: processed ucto documentation: some fixes, + spelling corrections, elaborations, and RST style improvements + +2018-10-25 Ko van der Sloot + + * : commit d988612bbfaf657fe72b9d357c41cdd94265ab7f Merge: a4f70cb + 6c83d1d Author: Iris Hendrickx Date: Thu Oct 25 + 14:58:16 2018 +0200 + +2018-10-25 Iris Hendrickx + + * docs/source/index.rst: adding license and citation info to ucto + rest documentation + +2018-10-25 Ko van der Sloot + + * m4/Makefile.am: cleanup + +2018-10-24 Ko van der Sloot + + * : commit 2c77f9ca825f345ce5ba719feb18076a97cb58f1 Author: Ko van + der Sloot Date: Wed Oct 24 12:31:24 2018 + +0200 + +2018-10-24 Iris Hendrickx + + * docs/source/index.rst: ucto documentation -minor changes + +2018-10-24 Iris Hendrickx + + * docs/source/index.rst: ucto documentation -maior update on + index.rst file + +2018-10-22 Ko van der Sloot + + * src/ucto.cxx: added test and adapted usage. --uselanguages is ONLY + for FoLiA inpuy + +2018-10-20 Ko van der Sloot + + * .travis.yml: typo in .travis.yml + +2018-10-18 Ko van der Sloot + + * src/setting.cxx, src/tokenize.cxx: fixed problem: printing an + unitialized filter would crash.... + +2018-10-18 Ko van der Sloot + + * : commit b2c258ca29029aa4d836bf46e7f142edb83c4a4c Author: Ko van + der Sloot Date: Thu Oct 18 16:44:29 2018 + +0200 + +2018-10-11 Maarten van Gompel + + * docs/source/conf.py: removed theme option + +2018-10-11 Maarten van Gompel + + * .gitignore: updated gitignore + +2018-10-11 Maarten van Gompel + + * docs/Makefile.sphinx: makefile fix + +2018-10-10 Maarten van Gompel + + * docs/source/conf.py: changed documentation theme + +2018-10-05 Iris Hendrickx + + * : commit c144928a22da93eb5247526ff651225137b6f3da Author: Iris + Hendrickx Date: Fri Oct 5 16:50:42 2018 +0200 + +2018-10-05 Ko van der Sloot + + * : commit b9eae191b5f0cbe79b699448a8afc883400d5964 Author: Ko van + der Sloot Date: Fri Oct 5 16:40:43 2018 + +0200 + +2018-10-04 Maarten van Gompel + + * docs/source/index.rst: rst fix + +2018-10-04 Maarten van Gompel + + * docs/source/index.rst: copied the README contents as initial + documentation #2 + +2018-10-04 Maarten van Gompel + + * : commit a97100477c4bf76719ebdba345f60a38e617ca29 Author: Maarten + van Gompel Date: Thu Oct 4 17:10:53 2018 + +0200 + +2018-08-30 Ko van der Sloot + + * src/tokenize.cxx: make sure the correct textclass is added to + nodes + +2018-08-06 Ko van der Sloot + + * src/tokenize.cxx: closing in on the icu:: namespaces + +2018-08-06 Ko van der Sloot + + * include/ucto/tokenize.h: add more icu:: namepsaces + +2018-08-06 Ko van der Sloot + + * include/ucto/setting.h: add icu:: namespace + +2018-07-31 Ko van der Sloot + + * include/ucto/tokenize.h: clarified comment + +2018-05-28 Ko van der Sloot + + * src/Makefile.am, src/tokenize.cxx: small refactoring + +2018-05-17 Ko van der Sloot + + * configure.ac: bump version after release + +2018-05-17 Ko van der Sloot + + * NEWS, configure.ac: created a bug fix release + +2018-05-17 Ko van der Sloot + + * 
config/Makefile.am: no longer install default rule. uctodata is + required now + +2018-05-17 Ko van der Sloot + + * configure.ac: bumped version after release + +2018-05-17 Ko van der Sloot + + * NEWS: NEWS about bug fix release 0.13.1 + +2018-05-17 Ko van der Sloot + + * configure.ac: reverted last edit. + +2018-05-17 Ko van der Sloot + + * configure.ac: removed debug lines from configure.ac + +2018-05-17 Ko van der Sloot + + * configure.ac, src/setting.cxx: configure now tries to locate the + uctodata files. We use that location. Should work on Mac too :{ + +2018-05-16 Ko van der Sloot + + * configure.ac: bump version after release + +2018-05-16 Ko van der Sloot + + * NEWS: typos! + +2018-05-16 Ko van der Sloot + + * NEWS: get ready for release 0.13 + +2018-05-02 Maarten van Gompel + + * codemeta.json, configure.ac: metadata update + +2018-05-02 Ko van der Sloot + + * src/tokenize.cxx: removed the experimental bracketing code. is + solved in the configuration + +2018-05-01 Ko van der Sloot + + * tests/issue051.nl.tok.V, tests/issue051.nl.txt: modified example + with ABBREVIATION inside brackets + +2018-04-30 Ko van der Sloot + + * src/tokenize.cxx: added experimental code for bracket handling. + (disabled for now) + +2018-04-30 Ko van der Sloot + + * tests/issue051.nl.tok.V, tests/issue051.nl.txt: for now, let the + test pass. Fixing might be hard + +2018-04-30 Ko van der Sloot + + * tests/issue051.nl.tok.V, tests/issue051.nl.txt: added a (yet + FAILING!) test for issue051 + +2018-04-30 Ko van der Sloot + + * tests/testoption-split, tests/testoption-split.ok: modified test + +2018-04-25 Ko van der Sloot + + * src/tokenize.cxx: spitOnly wasn't initialized... + +2018-04-25 Ko van der Sloot + + * src/tokenize.cxx: improved --split option + +2018-04-25 Ko van der Sloot + + * tests/testall, tests/testoption-split, tests/testoption-split.ok: + added a test for the --split option. It proves that quotes are very + difficult! + +2018-04-24 Ko van der Sloot + + * include/ucto/tokenize.h, src/Makefile.am, src/tokenize.cxx, + src/ucto.cxx: added a --split option + +2018-04-24 Ko van der Sloot + + * tests/testoption-P, tests/testoption-P.ok: extended test + +2018-04-24 Ko van der Sloot + + * tests/testall, tests/testoption-P, tests/testoption-P.ok: added a + test for the -P option (disable paragraph detection) + +2018-04-24 Ko van der Sloot + + * include/ucto/tokenize.h: removed the -S option (disable + sentence-detection) because is was only half implemented.... + +2018-04-24 Ko van der Sloot + + * src/tokenize.cxx, src/ucto.cxx: removed the -S option (disable + sentence-detection) because is was only half implemented.... + +2018-04-23 Maarten van Gompel + + * codemeta.json: forgot to actually commit codemeta.json + +2018-04-19 Ko van der Sloot + + * tests/issue048.nl.tok.V, tests/issue048.nl.txt: added another + test. 
(may run forever or not) + +2018-04-18 Maarten van Gompel + + * : commit 2bff822b2353530e12aa6a74472bbe981c9293bf Author: Maarten + van Gompel Date: Wed Apr 18 20:39:05 2018 + +0200 + +2018-04-09 Ko van der Sloot + + * tests/testall, tests/tokens.txt: updated test + +2018-04-09 Ko van der Sloot + + * tests/testtokens, tests/testtokens.ok, tests/tokens, + tests/tokens.txt: added a test for the new --add-tokens option + +2018-04-09 Ko van der Sloot + + * include/ucto/setting.h, include/ucto/tokenize.h: add an extra + --add-tokens option to add special tokens to settings of the default + language + +2018-04-09 Ko van der Sloot + + * src/setting.cxx, src/tokenize.cxx, src/ucto.cxx: add an extra + --add-tokens option to add special tokens to settings of the default + language + +2018-04-04 Ko van der Sloot + + * src/setting.cxx, src/tokenize.cxx: add icu:: namespaces + +2018-04-03 Ko van der Sloot + + * tests/cell.xml: oesp, lost a file! + +2018-04-03 Ko van der Sloot + + * src/setting.cxx, src/tokenize.cxx, tests/cell.xml: some + refactoring concerning namespace usage. + +2018-04-03 Ko van der Sloot + + * tests/issue046.nl.tok.V, tests/issue046.nl.txt: expanded test + +2018-04-03 Ko van der Sloot + + * tests/issue046.nl.tok.V, tests/issue046.nl.txt, + tests/ocr.nl.tok.V, tests/test.nl.tok.V: added new test. adapted + some others to newest uctodata + +2018-03-22 Ko van der Sloot + + * .travis.yml: updates .travis.yml + +2018-03-06 Ko van der Sloot + + * configure.ac: and try to fix + +2018-03-06 Ko van der Sloot + + * configure.ac: Try to brake build on travis + +2018-03-06 Ko van der Sloot + + * configure.ac, m4/ac_check_old_textcat.m4, m4/ac_search_lm.m4: + attempt to clean up configure a bit + +2018-03-06 Ko van der Sloot + + * configure.ac, m4/ac_search_lm.m4: next attempt to fix configure + +2018-03-06 Ko van der Sloot + + * : commit 380e35eb977d8d9ea4f1ea68f412ea1935e3af74 Author: Ko van + der Sloot Date: Tue Mar 6 11:40:25 2018 + +0100 + +2018-02-28 Ko van der Sloot + + * configure.ac: let configure fail when no libtextcat is found. + +2018-02-22 Ko van der Sloot + + * src/setting.cxx: added experimental code to enable an alternative + search path for the ucto data files. 
+ +2018-02-19 Ko van der Sloot + + * configure.ac: bump version after release + +2018-02-19 Ko van der Sloot + + * include/ucto/Makefile.am, include/ucto/unicode.h, + src/Makefile.am, src/unicode.cxx: removed old and unsused unicode + stuff + +2018-02-19 Ko van der Sloot + + * NEWS, README.md: Updated README abit + +2018-02-16 Ko van der Sloot + + * include/ucto/tokenize.h, src/setting.cxx, src/tokenize.cxx, + src/ucto.cxx: cleanup old ucto includes + +2018-02-16 Ko van der Sloot + + * include/ucto/Makefile.am, include/ucto/setting.h, + include/ucto/tokenize.h, src/Makefile.am, src/ucto.cxx: use + UniFilter from ticcutils now + +2018-02-15 Ko van der Sloot + + * src/tokenize.cxx: added include + +2018-02-15 Ko van der Sloot + + * include/ucto/unicode.h, src/unicode.cxx: removed unused includes + +2018-02-12 Ko van der Sloot + + * configure.ac, include/ucto/setting.h, include/ucto/unicode.h, + src/setting.cxx, src/unicode.cxx: moved unicode code to ticcutils (v + 0.18 or above) + +2018-01-30 Ko van der Sloot + + * tests/test.py: don't use 'diff -Z' mac OSX doesn't know it + +2018-01-30 Ko van der Sloot + + * config/textcat_alt.cfg, config/{textcat_normal.cfg => + textcat_alt.cfg.in}, config/textcat_normal.cfg.in: oesp, wrong files + in last commit + +2018-01-30 Ko van der Sloot + + * config/Makefile.am, configure.ac, include/ucto/my_textcat.h: + updated configuration + +2018-01-30 Ko van der Sloot + + * config/textcat_alt.cfg, config/textcat_normal.cfg, + m4/ac_check_old_textcat.m4, m4/ac_search_lm.m4: new files + +2018-01-29 Ko van der Sloot + + * configure.ac, include/ucto/my_textcat.h: improved configuration + for libtextcat (notably for MacOSX) + +2018-01-25 Ko van der Sloot + + * .travis.yml: attempt to fix runtime dependency + +2018-01-25 Ko van der Sloot + + * .travis.yml: move cppcheck to better location and DON'T fail, + please + +2018-01-25 Ko van der Sloot + + * .travis.yml: enable OSX test again + +2018-01-11 Ko van der Sloot + + * include/ucto/tokenize.h, include/ucto/unicode.h, src/unicode.cxx: + use the TiccUtils UnicodeNormalizer now + +2018-01-05 Ko van der Sloot + + * .travis.yml: it works vor Clang! + +2018-01-05 Ko van der Sloot + + * .travis.yml: why doesn't is work? Or does it? + +2018-01-04 Ko van der Sloot + + * .travis.yml: that didn;t work well... + +2018-01-04 Ko van der Sloot + + * .travis.yml: next clang attempt + +2018-01-04 Ko van der Sloot + + * .travis.yml: disable osx for now. focussing in CLANG + +2018-01-04 Ko van der Sloot + + * .travis.yml, configure.ac, src/Makefile.am, src/{textcat.cxx => + my_textcat.cxx}: getting desperate + +2018-01-04 Ko van der Sloot + + * .travis.yml, configure.ac: and the next round + +2018-01-04 Ko van der Sloot + + * configure.ac: and next round of errors + +2018-01-04 Ko van der Sloot + + * configure.ac: next OSX attempt + +2018-01-04 Ko van der Sloot + + * configure.ac: attempt to fix libtextcat stuff on OSX + +2018-01-04 Ko van der Sloot + + * src/ucto.cxx: the same oesp in another file + +2018-01-04 Ko van der Sloot + + * src/tokenize.cxx: oesp. 
include files has a different name now + +2018-01-04 Ko van der Sloot + + * configure.ac, include/ucto/Makefile.am, include/ucto/{textcat.h + => my_textcat.h}, src/textcat.cxx, src/tokenize.cxx: attempt to + simplify lib(ext)textcat stuff using PKG_CONFIG + +2018-01-04 Ko van der Sloot + + * .travis.yml: added missing libtextcat dependency for OSX + +2018-01-04 Ko van der Sloot + + * .travis.yml: ok, lets try OSX build + +2018-01-04 Ko van der Sloot + + * .travis.yml: brainfarth corrected + +2018-01-04 Ko van der Sloot + + * m4/ac_osx_pkg.m4: forgotten a file + +2018-01-04 Ko van der Sloot + + * .travis.yml: attempt to fix .travis.yml + +2018-01-04 Ko van der Sloot + + * .travis.yml, configure.ac: test new .travis config. No OSX yet + +2018-01-02 Ko van der Sloot + + * src/ucto.cxx: another 2017 ==> 2018 transition + +2018-01-02 Ko van der Sloot + + * include/ucto/setting.h, include/ucto/textcat.h, + include/ucto/tokenize.h, include/ucto/unicode.h, src/setting.cxx, + src/textcat.cxx, src/tokenize.cxx, src/ucto.cxx, src/unicode.cxx: + 2018 + +2018-01-02 Ko van der Sloot + + * .gitignore, Makefile.am, README: also package README.md + +2017-12-04 Ko van der Sloot + + * configure.ac: bumped version after release + +2017-12-04 Ko van der Sloot + + * NEWS: NEWS about release 0.11 + +2017-12-02 Ko van der Sloot + + * src/tokenize.cxx: fixed problem with text in cell (code get + hackier every time....) + +2017-12-02 Ko van der Sloot + + * tests/cell.xml, tests/testfoliain, tests/testfoliain.ok: added a + new test + +2017-11-07 Ko van der Sloot + + * configure.ac: bumped version after release + +2017-11-07 Ko van der Sloot + + * NEWS: new NEWS prior to yar (yet another release) + +2017-11-06 Ko van der Sloot + + * : commit 49f6aa8d553e988c92654a828f96be925d4cf52f Author: Ko van + der Sloot Date: Mon Nov 6 17:24:11 2017 + +0100 + +2017-11-06 Ko van der Sloot + + * NEWS, configure.ac, src/Makefile.am: Minor fix: bumped the .so + version to 3.0.0 + +2017-11-02 Ko van der Sloot + + * configure.ac, include/ucto/tokenize.h, src/tokenize.cxx: some + refactoring + +2017-10-24 Ko van der Sloot + + * tests/test.de.tok.V: German test result is changed (improved) + after fix in NOSPACE handling + +2017-10-24 Ko van der Sloot + + * src/tokenize.cxx: fixed a problem with NOSPACE inside recursive + rules + +2017-10-23 Ko van der Sloot + + * tests/smileys.nl.tok.V: added a test + +2017-10-23 Ko van der Sloot + + * src/tokenize.cxx, tests/smileys.nl.txt: added detection of + UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS + +2017-10-23 Ko van der Sloot + + * src/tokenize.cxx: when adding 'extra' text, skip
nodes. They + don't take + +2017-10-23 Ko van der Sloot + + * configure.ac: bumped version after release + +2017-10-23 Ko van der Sloot + + * NEWS: NEWS + +2017-10-23 Ko van der Sloot + + * tests/testutt, tests/testutt.ok, tests/utt2.xml: added anothrer + utterance test. + +2017-10-22 Maarten van Gompel + + * src/tokenize.cxx: Attempted fix for utterance/sentence problem #37 + +2017-10-22 Maarten van Gompel + + * src/tokenize.cxx: another related comment + +2017-10-22 Maarten van Gompel + + * src/tokenize.cxx: just added a comment/suggestion on detection + structure elements + +2017-10-19 Ko van der Sloot + + * NEWS: small folia ==> FoLiA edit + +2017-10-18 Ko van der Sloot + + * configure.ac: bumped version after release + +2017-10-18 Ko van der Sloot + + * NEWS: some typos in NEWS + +2017-10-18 Ko van der Sloot + + * NEWS: Updated NEWS with old news from 23-01-2017 + +2017-10-17 Ko van der Sloot + + * NEWS: some news + +2017-10-11 Ko van der Sloot + + * src/tokenize.cxx, tests/testfoliain.ok: fixed + textredundancy="full". Now it adds text upto the highest level. + +2017-10-11 Ko van der Sloot + + * tests/testfoliain, tests/testfoliain.ok, tests/textproblem.xml: + added and modified tests, after change in FoLiA parser + +2017-10-11 Ko van der Sloot + + * include/ucto/tokenize.h, src/tokenize.cxx: added a + setTextRedundancy member + +2017-10-10 Ko van der Sloot + + * tests/partest2_folia.nl.xml, tests/partest_folia.nl.xml, + tests/testfolia.ok, tests/testfolia2.ok, tests/testfoliain.ok, + tests/testlang.ok, tests/testutt.ok: adapted tests to changed + textredundancy level + +2017-10-10 Ko van der Sloot + + * src/tokenize.cxx, src/ucto.cxx: changed textredundancy default to + 'minimal' + +2017-10-10 Ko van der Sloot + + * tests/testfoliain.ok: adapted test to changed
handling + +2017-10-10 Ko van der Sloot + + * src/tokenize.cxx: for now, disable the
 handling. It is too + complicated. + +2017-10-02 Ko van der Sloot + + * tests/testfolia2, tests/testfolia2.ok, tests/testfoliain.ok: fixed + tests + +2017-10-02 Ko van der Sloot + + * include/ucto/tokenize.h, src/tokenize.cxx, src/ucto.cxx, + tests/testfolia, tests/testfoliain, tests/testfoliain.ok: + implemented --textredundancy option (replaces --noredundanttext) + +2017-10-02 Ko van der Sloot + + * include/ucto/tokenize.h, src/tokenize.cxx: removed an unused + function. Give a warning when attempting to set language on metadata + of non-native type + +2017-10-02 Ko van der Sloot + + * configure.ac: re-instated --with-icu in configure.ac + +2017-09-28 Ko van der Sloot + + * src/tokenize.cxx: added safeguards around set_metadata + +2017-09-27 Ko van der Sloot + + * src/tokenize.cxx: the default is doRedundantText == true + +2017-09-27 Ko van der Sloot + + * tests/testfoliain: adapted test to check automagically detecting + folia + +2017-09-27 Ko van der Sloot + + * src/ucto.cxx: automatically switch to -F or -X when input or + outputfile have '.xml' extension(s) + +2017-09-27 Ko van der Sloot + + * tests/testfolia2, tests/testfolia2.ok: modified test to also test + -T option + +2017-09-26 Maarten van Gompel + + * src/ucto.cxx: added CLST, Nijmegen to --version + +2017-09-26 Maarten van Gompel + + * src/ucto.cxx: Added shortcut option for --noredundanttext (-T) and + changed help text a bit #31 + +2017-09-26 Ko van der Sloot + + * tests/testfolia.ok: add updated file, missing from previous commit + +2017-09-26 Ko van der Sloot + + * include/ucto/tokenize.h, src/tokenize.cxx, src/ucto.cxx, + tests/testfolia, tests/testfoliain, tests/testfoliain.ok: + implemented an --noredundanttext option. and added tests + +2017-09-12 Ko van der Sloot + + * configure.ac: be sure to use recent libfolia + +2017-09-12 Ko van der Sloot + + * src/tokenize.cxx, tests/testfoliain.ok: set textclass on when + outputclass != inputclass + +2017-09-11 Ko van der Sloot + + * configure.ac: use C++! + +2017-08-30 Ko van der Sloot + + * ucto.pc.in: removed icu requirement + +2017-08-30 Ko van der Sloot + + * : commit 5ee40601de62c8612f4660a7748151fee7ea9929 Author: Ko van + der Sloot Date: Wed Aug 30 16:24:06 2017 + +0200 + +2017-08-30 Maarten van Gompel + + * docs/ucto_manual.tex: typo fix (and automatic trailing space + stuff) + +2017-08-21 Ko van der Sloot + + * tests/folia9a.xml, tests/folia9b.xml, tests/testfoliain, + tests/testfoliain.ok: added test documents with embedded tabs, + newlines and multiple spaces. + +2017-08-18 Ko van der Sloot + + * tests/folia8.xml: new file + +2017-08-18 Ko van der Sloot + + * configure.ac, tests/testfoliain, tests/testfoliain.ok: added a + test wikt xml comment inside a + +2017-08-17 Maarten van Gompel + + * src/tokenize.cxx, src/ucto.cxx: language fix + +2017-08-15 Ko van der Sloot + + * src/tokenize.cxx: added some more debug lines + +2017-08-14 Ko van der Sloot + + * src/tokenize.cxx: try to generate id's based on the parents ID or + there parents ID. 
+ +2017-07-27 Ko van der Sloot + + * .travis.yml: add libtar-dev too + +2017-07-25 Ko van der Sloot + + * : commit 00c3b9e94e36331b756f67110c0fc940ff83075d Author: Ko van + der Sloot Date: Tue Jul 25 10:45:38 2017 + +0200 + +2017-07-20 Maarten van Gompel + + * tests/testall: use python2 explicitly + +2017-07-20 Maarten van Gompel + + * tests/test.py: use python 2 explicitly + +2017-07-20 Ko van der Sloot + + * src/tokenize.cxx, tests/testutt.ok: fixed utterance handling + (quite hacky) + +2017-07-20 Ko van der Sloot + + * tests/testall, tests/testutt, tests/utt.xml: added a (yet failing) + test + +2017-07-18 Ko van der Sloot + + * .travis.yml: attempt to fix clang test on travis + +2017-07-18 Ko van der Sloot + + * src/tokenize.cxx: disable filtering in XML files in more cases + +2017-06-28 Ko van der Sloot + + * .travis.yml: attempt to fix build + +2017-06-28 Ko van der Sloot + + * tests/testfoliain.ok: adaped test, now newline handling is fixed + +2017-06-28 Ko van der Sloot + + * include/ucto/tokenize.h, src/tokenize.cxx: added code to handle + embedded newlines in FoLiA documents. + +2017-06-26 Ko van der Sloot + + * src/tokenize.cxx: adapted to changed libfolis + +2017-06-01 Maarten van Gompel + + * : commit 2037878fff5e9bb47911c1a0c54b9c79291754fc Author: Maarten + van Gompel Date: Thu Jun 1 21:30:05 2017 + +0200 + +2017-05-22 Ko van der Sloot + + * src/setting.cxx, src/tokenize.cxx, src/ucto.cxx, + tests/testfiles2.ok, tests/testfoliain.ok, tests/testlang.ok, + tests/testoption2.ok, tests/testslash.ok: sorted out logging and + such a bit. + +2017-05-22 Ko van der Sloot + + * tests/testfoliain.ok, tests/testlang.ok, tests/testslash.ok: + adaptes tests + +2017-05-22 Ko van der Sloot + + * src/ucto.cxx: No longer SILENTLY set --filter=NO for FoLiA with + equal input ans output class + +2017-05-22 Ko van der Sloot + + * src/ucto.cxx, tests/testnormalisation: added an --filter option. + superseeds -f (that could only switch filtering OFF) + +2017-05-17 Ko van der Sloot + + * tests/folia1.xml, tests/testfoliain, tests/testfoliain.ok: + enhanced and extended folia testing + +2017-05-17 Ko van der Sloot + + * src/tokenize.cxx, src/ucto.cxx, tests/testfoliain.ok: Disable + filtering of characters on FoLiA input with same inputclass and + outputclass + +2017-05-10 Ko van der Sloot + + * tests/filter.xml, tests/testfoliain.ok, tests/testtext, + tests/testtext.ok: added a test, and adapted to changes results + +2017-05-10 Ko van der Sloot + + * src/tokenize.cxx: now we adapt text on and
to the lower + layers + +2017-05-10 Ko van der Sloot + + * configure.ac: simplified configuration + +2017-05-10 Ko van der Sloot + + * .travis.yml: added IRC notification + +2017-05-10 Ko van der Sloot + + * tests/testlang.ok: adepted test after fix in libfolia + +2017-05-10 Ko van der Sloot + + * docs/ucto.1, src/ucto.cxx: update manpage. Fixed typo. + +2017-05-09 Ko van der Sloot + + * Makefile.am, configure.ac, ucto.pc.in: more configuration cleanup. + +2017-05-08 Ko van der Sloot + + * bootstrap.sh, configure.ac: modernized build system + +2017-05-03 Ko van der Sloot + + * src/tokenize.cxx: still a leak was left. plugging... + +2017-05-03 Ko van der Sloot + + * src/setting.cxx, src/tokenize.cxx: fixed a memory leak + +2017-04-20 Ko van der Sloot + + * src/tokenize.cxx: added some comment + +2017-04-10 Ko van der Sloot + + * src/tokenize.cxx: better debug output + +2017-04-10 Ko van der Sloot + + * tests/folia7.xml, tests/testfolia, tests/testfoliain, + tests/testfoliain.ok: added a test + +2017-04-04 Ko van der Sloot + + * .travis.yml: revert back to default g++ + +2017-03-30 Ko van der Sloot + + * src/tokenize.cxx: numb edits + +2017-03-28 Ko van der Sloot + + * include/ucto/tokenize.h, src/tokenize.cxx, src/ucto.cxx, + tests/folia-lang-2.xml, tests/testlang: started implementing + language detection in FoLiA input too. Not done, nothing broke (yet) + +2017-03-27 Ko van der Sloot + + * src/tokenize.cxx: fixed a problem with log token detection + +2017-03-14 Maarten van Gompel + + * : Merge pull request #17 from sanmai-NL/speed_up_CI_build Limit network transfers, add `ccache` + +2017-03-01 Ko van der Sloot + + * src/tokenize.cxx: Oops. A function got lost... :{ + +2017-02-27 Ko van der Sloot + + * src/ucto.cxx: removed redundant mentioning of configfile. (is + empty > 90% of time) + +2017-02-27 Ko van der Sloot + + * include/ucto/tokenize.h, src/tokenize.cxx: in case of problems in + tokenizeLine(), we display the offending line numner OR the FoLiA + element ID. + +2017-02-26 Ko van der Sloot + + * src/tokenize.cxx: for extreme long 'words' display a part of the + offensive intput. Also typo corrected. + +2017-02-21 Ko van der Sloot + + * src/setting.cxx, src/ucto.cxx: give better information when + language is missing or wrong + +2017-02-20 Ko van der Sloot + + * src/ucto.cxx: updated usage() + +2017-02-20 Ko van der Sloot + + * docs/ucto.1: updated ucto man page + +2017-02-20 Ko van der Sloot + + * .travis.yml: another final attempt :{ + +2017-02-20 Ko van der Sloot + + * .travis.yml: final attempt + +2017-02-20 Ko van der Sloot + + * .travis.yml: getting closer? + +2017-02-20 Ko van der Sloot + + * .travis.yml: wow wat lastig + +2017-02-20 Ko van der Sloot + + * .travis.yml: next try + +2017-02-20 Ko van der Sloot + + * .travis.yml: another attempt + +2017-02-20 Ko van der Sloot + + * .travis.yml: attempt to fix + +2017-02-20 Ko van der Sloot + + * .travis.yml: modernized Travis config + +2017-02-20 Ko van der Sloot + + * .travis.yml: added dependency for travis + +2017-02-20 Ko van der Sloot + + * src/ucto.cxx: Warn about use of unsupported languages. Don't use + 'generic' by default. 
+ +2017-02-20 Ko van der Sloot + + * src/ucto.cxx: check specified languages against the installed ones + +2017-02-20 Ko van der Sloot + + * include/ucto/setting.h, src/setting.cxx, src/ucto.cxx: use a set + to store resulte, not a vector + +2017-02-20 Ko van der Sloot + + * include/ucto/setting.h, src/setting.cxx, src/ucto.cxx: added a + function to search for installed languages + +2017-02-20 Ko van der Sloot + + * src/tokenize.cxx: typo corrected + +2017-02-20 Ko van der Sloot + + * src/tokenize.cxx: choke on words from 2500 characters ore more + +2017-02-08 Ko van der Sloot + + * include/ucto/tokenize.h, src/tokenize.cxx: some more repait + considering outputclass + +2017-02-08 Ko van der Sloot + + * src/tokenize.cxx, src/ucto.cxx: when using the --textclass option. + make sure --inputclass and --outputclass are not used. + +2017-02-07 Ko van der Sloot + + * include/ucto/tokenize.h, src/Makefile.am, src/tokenize.cxx: + attempt to speed up some stuff + +2017-02-02 Ko van der Sloot + + * src/Makefile.am, src/tokenize.cxx: minor changes + +2017-01-24 Ko van der Sloot + + * include/ucto/textcat.h, src/Makefile.am, src/setting.cxx, + src/textcat.cxx, src/tokenize.cxx, src/ucto.cxx, src/unicode.cxx: + some refactoring to satisfy static checkers + +2017-01-23 Ko van der Sloot + + * configure.ac: bumped version after release + 2017-01-23 Maarten van Gompel * configure.ac: rely on uctodata 0.4 @@ -38,6 +1186,10 @@ * config/Makefile.am, src/Makefile.am: unstall and look for datafiles in $PREFIX/share/ucto +2017-01-18 Sander Maijers + + * .travis.yml: Speed up CI builds + 2017-01-18 Ko van der Sloot * tests/test.nl.tok.V, tests/test.nl.txt: added more DATE testcases diff -Nru ucto-0.9.6/codemeta.json ucto-0.14/codemeta.json --- ucto-0.9.6/codemeta.json 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/codemeta.json 2018-11-29 13:31:42.000000000 +0000 @@ -0,0 +1,114 @@ +{ + "@context": [ + "https://doi.org/10.5063/schema/codemeta-2.0", + "http://schema.org", + { + "entryPoints": { "@reverse": "schema:actionApplication" }, + "interfaceType": { "@id": "codemeta:interfaceType" } + } + ], + "@type": "SoftwareSourceCode", + "identifier": "ucto", + "name": "ucto", + "version": "0.14", + "description": "Ucto tokenizes text files: it separates words from punctuation, and splits sentences. This is one of the first tasks for almost any Natural Language Processing application. 
Ucto offers several other basic preprocessing steps such as changing case that you can all use to make your text suited for further processing such as indexing, part-of-speech tagging, or machine translation.", + "license": "https://spdx.org/licenses/GPL-3.0", + "url": "https://languagemachines.github.io/ucto", + "producer": { + "@id": "https://www.ru.nl/clst", + "@type": "Organization", + "name": "Centre for Language and Speech Technology", + "url": "https://www.ru.nl/clst", + "parentOrganization": { + "@id": "https://www.ru.nl/cls", + "@type": "Organization", + "name": "Centre for Language Studies", + "url": "https://www.ru.nl/cls", + "parentOrganization": { + "@id": "https://www.ru.nl", + "name": "Radboud University", + "@type": "Organization", + "url": "https://www.ru.nl", + "location": { + "@type": "Place", + "name": "Nijmegen" + } + } + + } + }, + "author": [ + { + "@id": "https://orcid.org/0000-0002-1046-0006", + "@type": "Person", + "givenName": "Maarten", + "familyName": "van Gompel", + "email": "proycon@anaproy.nl", + "affiliation": { "@id": "https://www.ru.nl/clst" } + }, + { + "@type": "Person", + "givenName": "Ko", + "familyName": "van der Sloot", + "email": "ko.vandersloot@let.ru.nl", + "affiliation": { "@id": "https://www.ru.nl/clst" } + } + ], + "sourceOrganization": { "@id": "https://www.ru.nl/clst" }, + "programmingLanguage": { + "@type": "ComputerLanguage", + "identifier": "c++", + "name": "C++" + }, + "operatingSystem": "POSIX", + "codeRepository": "https://github.com/LanguageMachines/ucto", + "softwareRequirements": [ + { + "@type": "SoftwareApplication", + "identifier": "icu", + "name": "icu" + }, + { + "@type": "SoftwareApplication", + "identifier": "libxml2", + "name": "libxml2" + }, + { + "@type": "SoftwareApplication", + "identifier": "ticcutils", + "name": "ticcutils" + }, + { + "@type": "SoftwareApplication", + "identifier": "libfolia", + "name": "libfolia" + } + ], + "funder": [ + { + "@type": "Organization", + "name": "CLARIN-NL" + }, + { + "@type": "Organization", + "name": "CLARIAH", + "url": "https://www.clariah.nl" + } + ], + "readme": "https://github.com/LanguageMachines/ucto/blob/master/README.md", + "issueTracker": "https://github.com/LanguageMachines/ucto/issues", + "contIntegration": "https://travis-ci.org/LanguageMachines/ucto", + "releaseNotes": "https://github.com/LanguageMachines/ucto/releases", + "developmentStatus": "active", + "keywords": [ "nlp", "natural language processing", "tokenization", "tokenizer" ], + "dateCreated": "2011-03-27", + "entryPoints": [ + { + "@type": "EntryPoint", + "name": "ucto", + "urlTemplate": "file:///ucto", + "description": "Ucto tokeniser, full command line interface", + "interfaceType": "CLI" + } + ] +} diff -Nru ucto-0.9.6/config/e-mail.rule ucto-0.14/config/e-mail.rule --- ucto-0.9.6/config/e-mail.rule 2016-02-11 13:04:11.000000000 +0000 +++ ucto-0.14/config/e-mail.rule 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -E-MAIL=^([\p{L}\p{N}\._%+\-]+@[\p{L}\p{N}\.\-]+\.\p{L}{2,4})\P{L}?$ diff -Nru ucto-0.9.6/config/Makefile.am ucto-0.14/config/Makefile.am --- ucto-0.9.6/config/Makefile.am 2017-01-23 12:03:47.000000000 +0000 +++ ucto-0.14/config/Makefile.am 2018-11-13 16:30:47.000000000 +0000 @@ -1,6 +1,13 @@ -config_DATA = tokconfig-generic url.rule e-mail.rule smiley.rule \ - standard-quotes.quote standard-eos.eos textcat.cfg +config_DATA = textcat_normal.cfg textcat_alt.cfg configdir = $(datadir)/$(PACKAGE) EXTRA_DIST = $(config_DATA) + +install-data-hook: + rm -f $(configdir)/textcat.cfg +if OLD_LM + $(LN_S) 
$(configdir)/textcat_alt.cfg $(configdir)/textcat.cfg +else + $(LN_S) $(configdir)/textcat_normal.cfg $(configdir)/textcat.cfg +endif diff -Nru ucto-0.9.6/config/Makefile.in ucto-0.14/config/Makefile.in --- ucto-0.9.6/config/Makefile.in 2017-01-23 12:52:24.000000000 +0000 +++ ucto-0.14/config/Makefile.in 2018-11-20 09:07:54.000000000 +0000 @@ -1,7 +1,7 @@ -# Makefile.in generated by automake 1.15 from Makefile.am. +# Makefile.in generated by automake 1.15.1 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2014 Free Software Foundation, Inc. +# Copyright (C) 1994-2017 Free Software Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -90,7 +90,9 @@ host_triplet = @host@ subdir = config ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_old_textcat.m4 \ + $(top_srcdir)/m4/ac_osx_pkg.m4 \ + $(top_srcdir)/m4/ac_search_lm.m4 \ $(top_srcdir)/m4/ax_lib_readline.m4 \ $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ @@ -101,7 +103,7 @@ DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) mkinstalldirs = $(install_sh) -d CONFIG_HEADER = $(top_builddir)/config.h -CONFIG_CLEAN_FILES = +CONFIG_CLEAN_FILES = textcat_normal.cfg textcat_alt.cfg CONFIG_CLEAN_VPATH_FILES = AM_V_P = $(am__v_P_@AM_V@) am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) @@ -152,7 +154,8 @@ am__installdirs = "$(DESTDIR)$(configdir)" DATA = $(config_DATA) am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) -am__DIST_COMMON = $(srcdir)/Makefile.in +am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/textcat_alt.cfg.in \ + $(srcdir)/textcat_normal.cfg.in DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ACLOCAL = @ACLOCAL@ AMTAR = @AMTAR@ @@ -185,13 +188,7 @@ FGREP = @FGREP@ GREP = @GREP@ ICU_CFLAGS = @ICU_CFLAGS@ -ICU_CONFIG = @ICU_CONFIG@ -ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@ -ICU_CXXFLAGS = @ICU_CXXFLAGS@ -ICU_IOLIBS = @ICU_IOLIBS@ -ICU_LIBPATH = @ICU_LIBPATH@ ICU_LIBS = @ICU_LIBS@ -ICU_VERSION = @ICU_VERSION@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ @@ -209,6 +206,7 @@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ +MODULE_PREFIX = @MODULE_PREFIX@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ @@ -232,6 +230,8 @@ SHELL = @SHELL@ STRIP = @STRIP@ SYSCONFDIR = @SYSCONFDIR@ +TEXTCAT_CFLAGS = @TEXTCAT_CFLAGS@ +TEXTCAT_LIBS = @TEXTCAT_LIBS@ VERSION = @VERSION@ XML2_CFLAGS = @XML2_CFLAGS@ XML2_LIBS = @XML2_LIBS@ @@ -282,6 +282,7 @@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ +runstatedir = @runstatedir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ @@ -294,9 +295,7 @@ top_srcdir = @top_srcdir@ uctodata_CFLAGS = @uctodata_CFLAGS@ uctodata_LIBS = @uctodata_LIBS@ -config_DATA = tokconfig-generic url.rule e-mail.rule smiley.rule \ - standard-quotes.quote standard-eos.eos textcat.cfg - +config_DATA = textcat_normal.cfg textcat_alt.cfg configdir = $(datadir)/$(PACKAGE) EXTRA_DIST = $(config_DATA) all: all-am @@ -331,6 +330,10 @@ $(ACLOCAL_M4): $(am__aclocal_m4_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(am__aclocal_m4_deps): +textcat_normal.cfg: $(top_builddir)/config.status $(srcdir)/textcat_normal.cfg.in + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ +textcat_alt.cfg: 
$(top_builddir)/config.status $(srcdir)/textcat_alt.cfg.in + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ mostlyclean-libtool: -rm -f *.lo @@ -453,7 +456,8 @@ info-am: install-data-am: install-configDATA - + @$(NORMAL_INSTALL) + $(MAKE) $(AM_MAKEFLAGS) install-data-hook install-dvi: install-dvi-am install-dvi-am: @@ -498,17 +502,17 @@ uninstall-am: uninstall-configDATA -.MAKE: install-am install-strip +.MAKE: install-am install-data-am install-strip .PHONY: all all-am check check-am clean clean-generic clean-libtool \ cscopelist-am ctags-am distclean distclean-generic \ distclean-libtool distdir dvi dvi-am html html-am info info-am \ install install-am install-configDATA install-data \ - install-data-am install-dvi install-dvi-am install-exec \ - install-exec-am install-html install-html-am install-info \ - install-info-am install-man install-pdf install-pdf-am \ - install-ps install-ps-am install-strip installcheck \ - installcheck-am installdirs maintainer-clean \ + install-data-am install-data-hook install-dvi install-dvi-am \ + install-exec install-exec-am install-html install-html-am \ + install-info install-info-am install-man install-pdf \ + install-pdf-am install-ps install-ps-am install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ maintainer-clean-generic mostlyclean mostlyclean-generic \ mostlyclean-libtool pdf pdf-am ps ps-am tags-am uninstall \ uninstall-am uninstall-configDATA @@ -516,6 +520,11 @@ .PRECIOUS: Makefile +install-data-hook: + rm -f $(configdir)/textcat.cfg +@OLD_LM_TRUE@ $(LN_S) $(configdir)/textcat_alt.cfg $(configdir)/textcat.cfg +@OLD_LM_FALSE@ $(LN_S) $(configdir)/textcat_normal.cfg $(configdir)/textcat.cfg + # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. .NOEXPORT: diff -Nru ucto-0.9.6/config/smiley.rule ucto-0.14/config/smiley.rule --- ucto-0.9.6/config/smiley.rule 2016-02-11 13:04:11.000000000 +0000 +++ ucto-0.14/config/smiley.rule 1970-01-01 00:00:00.000000000 +0000 @@ -1,2 +0,0 @@ -SMILEY=^(?:[oO>}\])]?)(?:[\:;8][',]?[-\^]?(?:[sSdDpPcCoO#@*$|?]|\)\)*|\{|\[|\(\(*)=?)$ -REVERSE-SMILEY=^(?:\(|\)|\}*)(?:[sScCoO#@*$|?]?|\{|\[|\(\(?)=?(?:[',]?[-\^]?[\:;8])(?:[oO<}\[)]?)$ diff -Nru ucto-0.9.6/config/standard-eos.eos ucto-0.14/config/standard-eos.eos --- ucto-0.9.6/config/standard-eos.eos 2016-02-11 13:04:11.000000000 +0000 +++ ucto-0.14/config/standard-eos.eos 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -# Character: . -# Name: FULL STOP -# Code: 46 (0x2E) -\u002E - -# Character: ! -# Name: EXCLAMATION MARK -# Code: 33 (0x21) -\u0021 - -# Character: ? -# Name: QUESTION MARK -# Code: 63 (0x3f) -\u003F - diff -Nru ucto-0.9.6/config/standard-quotes.quote ucto-0.14/config/standard-quotes.quote --- ucto-0.9.6/config/standard-quotes.quote 2016-02-11 13:04:11.000000000 +0000 +++ ucto-0.14/config/standard-quotes.quote 1970-01-01 00:00:00.000000000 +0000 @@ -1,8 +0,0 @@ -# all quotes must be entered as pairs of open en close quotes -# separated by a space -# When more opening quotes match a single closing quote (or visa versa) -# they must be aggregated in one string! 
-# the ambiguous quotes " and ' are handled automaticly - -‘ ’ -“„‟ ” diff -Nru ucto-0.9.6/config/textcat_alt.cfg ucto-0.14/config/textcat_alt.cfg --- ucto-0.9.6/config/textcat_alt.cfg 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/config/textcat_alt.cfg 2018-11-29 09:08:05.000000000 +0000 @@ -0,0 +1,22 @@ +# +# A config file for the language models we support for texcat +# provided with Gertjan van Noords language guesser +# (http://odur.let.rug.nl/~vannoord/TextCat/) +# +# Notes: +# - we only support a small range of languages. +# - Putting the most probable languages at the top of the list +# improves performance, because this will raise the threshold for +# likely candidates more quickly. +# +# +/usr/share/libexttextcat/dutch.lm nld +/usr/share/libexttextcat/english.lm eng +/usr/share/libexttextcat/german.lm deu +/usr/share/libexttextcat/french.lm fra +/usr/share/libexttextcat/italian.lm ita +/usr/share/libexttextcat/spanish.lm spa +/usr/share/libexttextcat/russian-iso8859_5.lm rus +/usr/share/libexttextcat/portuguese.lm por +/usr/share/libexttextcat/swedish.lm swe +/usr/share/libexttextcat/turkish.lm tur diff -Nru ucto-0.9.6/config/textcat_alt.cfg.in ucto-0.14/config/textcat_alt.cfg.in --- ucto-0.9.6/config/textcat_alt.cfg.in 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/config/textcat_alt.cfg.in 2018-11-13 16:30:47.000000000 +0000 @@ -0,0 +1,22 @@ +# +# A config file for the language models we support for texcat +# provided with Gertjan van Noords language guesser +# (http://odur.let.rug.nl/~vannoord/TextCat/) +# +# Notes: +# - we only support a small range of languages. +# - Putting the most probable languages at the top of the list +# improves performance, because this will raise the threshold for +# likely candidates more quickly. +# +# +@MODULE_PREFIX@/dutch.lm nld +@MODULE_PREFIX@/english.lm eng +@MODULE_PREFIX@/german.lm deu +@MODULE_PREFIX@/french.lm fra +@MODULE_PREFIX@/italian.lm ita +@MODULE_PREFIX@/spanish.lm spa +@MODULE_PREFIX@/russian-iso8859_5.lm rus +@MODULE_PREFIX@/portuguese.lm por +@MODULE_PREFIX@/swedish.lm swe +@MODULE_PREFIX@/turkish.lm tur diff -Nru ucto-0.9.6/config/textcat.cfg ucto-0.14/config/textcat.cfg --- ucto-0.9.6/config/textcat.cfg 2017-01-12 13:39:38.000000000 +0000 +++ ucto-0.14/config/textcat.cfg 1970-01-01 00:00:00.000000000 +0000 @@ -1,22 +0,0 @@ -# -# A config file for the language models we support for texcat -# provided with Gertjan van Noords language guesser -# (http://odur.let.rug.nl/~vannoord/TextCat/) -# -# Notes: -# - we only support a small range of languages. -# - Putting the most probable languages at the top of the list -# improves performance, because this will raise the threshold for -# likely candidates more quickly. 
-# -# -/usr/share/libexttextcat/nl.lm nld -/usr/share/libexttextcat/en.lm eng -/usr/share/libexttextcat/de.lm deu -/usr/share/libexttextcat/fr.lm fra -/usr/share/libexttextcat/it.lm ita -/usr/share/libexttextcat/es.lm spa -/usr/share/libexttextcat/ru.lm rus -/usr/share/libexttextcat/pt.lm por -/usr/share/libexttextcat/sv.lm swe -/usr/share/libexttextcat/tr.lm tur diff -Nru ucto-0.9.6/config/textcat_normal.cfg ucto-0.14/config/textcat_normal.cfg --- ucto-0.9.6/config/textcat_normal.cfg 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/config/textcat_normal.cfg 2018-11-29 09:08:05.000000000 +0000 @@ -0,0 +1,22 @@ +# +# A config file for the language models we support for texcat +# provided with Gertjan van Noords language guesser +# (http://odur.let.rug.nl/~vannoord/TextCat/) +# +# Notes: +# - we only support a small range of languages. +# - Putting the most probable languages at the top of the list +# improves performance, because this will raise the threshold for +# likely candidates more quickly. +# +# +/usr/share/libexttextcat/nl.lm nld +/usr/share/libexttextcat/en.lm eng +/usr/share/libexttextcat/de.lm deu +/usr/share/libexttextcat/fr.lm fra +/usr/share/libexttextcat/it.lm ita +/usr/share/libexttextcat/es.lm spa +/usr/share/libexttextcat/ru.lm rus +/usr/share/libexttextcat/pt.lm por +/usr/share/libexttextcat/sv.lm swe +/usr/share/libexttextcat/tr.lm tur diff -Nru ucto-0.9.6/config/textcat_normal.cfg.in ucto-0.14/config/textcat_normal.cfg.in --- ucto-0.9.6/config/textcat_normal.cfg.in 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/config/textcat_normal.cfg.in 2018-11-13 16:30:47.000000000 +0000 @@ -0,0 +1,22 @@ +# +# A config file for the language models we support for texcat +# provided with Gertjan van Noords language guesser +# (http://odur.let.rug.nl/~vannoord/TextCat/) +# +# Notes: +# - we only support a small range of languages. +# - Putting the most probable languages at the top of the list +# improves performance, because this will raise the threshold for +# likely candidates more quickly. 
+# +# +@MODULE_PREFIX@/nl.lm nld +@MODULE_PREFIX@/en.lm eng +@MODULE_PREFIX@/de.lm deu +@MODULE_PREFIX@/fr.lm fra +@MODULE_PREFIX@/it.lm ita +@MODULE_PREFIX@/es.lm spa +@MODULE_PREFIX@/ru.lm rus +@MODULE_PREFIX@/pt.lm por +@MODULE_PREFIX@/sv.lm swe +@MODULE_PREFIX@/tr.lm tur diff -Nru ucto-0.9.6/config/tokconfig-generic ucto-0.14/config/tokconfig-generic --- ucto-0.9.6/config/tokconfig-generic 2017-01-16 11:13:43.000000000 +0000 +++ ucto-0.14/config/tokconfig-generic 1970-01-01 00:00:00.000000000 +0000 @@ -1,107 +0,0 @@ -version=0.2 -[RULE-ORDER] -URL URL-WWW URL-DOMAIN -E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND -ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE -NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN - - -[META-RULES] - -[RULES] -%include url -%include e-mail -%include smiley - -#Ex: (dis)information -WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)* - -#Ex: understand(s) -WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe}) - -#Keep dash/underscore connected parts (even if they are in parenthesis) -WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+ - -#Abbreviations with multiple periods -ABBREVIATION=^(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)(?:\Z|[,:;]) - -#retain initials -INITIAL=^(?:\p{Lt}|\p{Lu})\.$ - -#Homogeneous punctuation (ellipsis etc) -PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,} - -#Date -DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/-]\p{N}{2,4} -DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2} - -FRACNUMBER=\p{N}+(?:/\p{N}+)+ - -NUMBER-YEAR=(['`’‘´]\p{N}{2})(?:\P{N}|\z) - -#Times -TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N})?(?i:a\.?m\.?|p\.?m\.?)? - -#retain digits, including those starting with initial period (.22), and negative numbers -NUMBER=-?(?:[\.,]?\p{N}+)+ - -CURRENCY=\p{Sc} - -WORD=[\p{L}\p{Mn}]+ - -PUNCTUATION=\p{P} - -UNKNOWN=. - -[PREFIXES] - -[SUFFIXES] - -[ORDINALS] - -[TOKENS] - -[UNITS] -km -m -cm -mm -g -kg -C -l -s -sec -min -gb -mb -kb - - -[CURRENCY] -USD -GBP -CAD -NZD -AUD -SGD -HKD -EUR - -[ABBREVIATIONS] - - -[FILTER] -fl fl -ff ff -ffi ffi -ffl ffl -# also filter soft hyphen -\u00AD - - -[EOSMARKERS] -%include standard-eos - -[QUOTES] -%include standard-quotes diff -Nru ucto-0.9.6/config/url.rule ucto-0.14/config/url.rule --- ucto-0.9.6/config/url.rule 2016-02-11 13:04:11.000000000 +0000 +++ ucto-0.14/config/url.rule 1970-01-01 00:00:00.000000000 +0000 @@ -1,3 +0,0 @@ -URL=(?i:https?|ftps?|nfs|sshfs|gopher|smb)://[\p{L}\p{N}]+(?:[[:punct:]=]+[\p{L}\p{N}]+)+/? -URL-WWW=www\.[\p{L}\p{N}]+(?:[[:punct:]]+[\p{L}\p{N}]+)*/? -URL-DOMAIN=^[\p{L}\p{N}]+(?:\.[\p{L}\p{N}]+)*\.(?:com|org|net|edu|mil|int|nl|be|fr|de|uk|es|it|pt|dk|se|no|fi|ch|at|hr|bg|ro|br|ru|cn|in|id|eu|ly|to|tk|za|ko|jp) diff -Nru ucto-0.9.6/config.guess ucto-0.14/config.guess --- ucto-0.9.6/config.guess 2016-09-09 05:06:00.000000000 +0000 +++ ucto-0.14/config.guess 2018-02-24 16:00:57.000000000 +0000 @@ -1,8 +1,8 @@ #! /bin/sh # Attempt to guess a canonical system name. -# Copyright 1992-2014 Free Software Foundation, Inc. +# Copyright 1992-2018 Free Software Foundation, Inc. -timestamp='2014-11-04' +timestamp='2018-02-24' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -15,7 +15,7 @@ # General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, see . 
+# along with this program; if not, see . # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a @@ -27,7 +27,7 @@ # Originally written by Per Bothner; maintained since 2000 by Ben Elliston. # # You can get the latest version of this script from: -# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD +# https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess # # Please send patches to . @@ -39,7 +39,7 @@ Output the configuration name of the system \`$me' is run on. -Operation modes: +Options: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit @@ -50,7 +50,7 @@ GNU config.guess ($timestamp) Originally written by Per Bothner. -Copyright 1992-2014 Free Software Foundation, Inc. +Copyright 1992-2018 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -107,9 +107,9 @@ dummy=$tmp/dummy ; tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; case $CC_FOR_BUILD,$HOST_CC,$CC in - ,,) echo "int x;" > $dummy.c ; + ,,) echo "int x;" > "$dummy.c" ; for c in cc gcc c89 c99 ; do - if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then + if ($c -c -o "$dummy.o" "$dummy.c") >/dev/null 2>&1 ; then CC_FOR_BUILD="$c"; break ; fi ; done ; @@ -132,14 +132,14 @@ UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown -case "${UNAME_SYSTEM}" in +case "$UNAME_SYSTEM" in Linux|GNU|GNU/*) # If the system lacks a compiler, then just pick glibc. # We could probably try harder. LIBC=gnu - eval $set_cc_for_build - cat <<-EOF > $dummy.c + eval "$set_cc_for_build" + cat <<-EOF > "$dummy.c" #include #if defined(__UCLIBC__) LIBC=uclibc @@ -149,13 +149,20 @@ LIBC=gnu #endif EOF - eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'` + eval "`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`" + + # If ldd exists, use it to detect musl libc. + if command -v ldd >/dev/null && \ + ldd --version 2>&1 | grep -q ^musl + then + LIBC=musl + fi ;; esac # Note: order is significant - the case branches are not exclusive. -case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in +case "$UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION" in *:NetBSD:*:*) # NetBSD (nbsd) targets should (where applicable) match one or # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*, @@ -168,21 +175,31 @@ # Note: NetBSD doesn't particularly care about the vendor # portion of the name. We always set it to "unknown". 
sysctl="sysctl -n hw.machine_arch" - UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ - /usr/sbin/$sysctl 2>/dev/null || echo unknown)` - case "${UNAME_MACHINE_ARCH}" in + UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \ + "/sbin/$sysctl" 2>/dev/null || \ + "/usr/sbin/$sysctl" 2>/dev/null || \ + echo unknown)` + case "$UNAME_MACHINE_ARCH" in armeb) machine=armeb-unknown ;; arm*) machine=arm-unknown ;; sh3el) machine=shl-unknown ;; sh3eb) machine=sh-unknown ;; sh5el) machine=sh5le-unknown ;; - *) machine=${UNAME_MACHINE_ARCH}-unknown ;; + earmv*) + arch=`echo "$UNAME_MACHINE_ARCH" | sed -e 's,^e\(armv[0-9]\).*$,\1,'` + endian=`echo "$UNAME_MACHINE_ARCH" | sed -ne 's,^.*\(eb\)$,\1,p'` + machine="${arch}${endian}"-unknown + ;; + *) machine="$UNAME_MACHINE_ARCH"-unknown ;; esac # The Operating System including object format, if it has switched - # to ELF recently, or will in the future. - case "${UNAME_MACHINE_ARCH}" in + # to ELF recently (or will in the future) and ABI. + case "$UNAME_MACHINE_ARCH" in + earm*) + os=netbsdelf + ;; arm*|i386|m68k|ns32k|sh3*|sparc|vax) - eval $set_cc_for_build + eval "$set_cc_for_build" if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ELF__ then @@ -197,44 +214,67 @@ os=netbsd ;; esac + # Determine ABI tags. + case "$UNAME_MACHINE_ARCH" in + earm*) + expr='s/^earmv[0-9]/-eabi/;s/eb$//' + abi=`echo "$UNAME_MACHINE_ARCH" | sed -e "$expr"` + ;; + esac # The OS release # Debian GNU/NetBSD machines have a different userland, and # thus, need a distinct triplet. However, they do not need # kernel version information, so it can be replaced with a # suitable tag, in the style of linux-gnu. - case "${UNAME_VERSION}" in + case "$UNAME_VERSION" in Debian*) release='-gnu' ;; *) - release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` + release=`echo "$UNAME_RELEASE" | sed -e 's/[-_].*//' | cut -d. -f1,2` ;; esac # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: # contains redundant information, the shorter form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. 
- echo "${machine}-${os}${release}" + echo "$machine-${os}${release}${abi}" exit ;; *:Bitrig:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'` - echo ${UNAME_MACHINE_ARCH}-unknown-bitrig${UNAME_RELEASE} + echo "$UNAME_MACHINE_ARCH"-unknown-bitrig"$UNAME_RELEASE" exit ;; *:OpenBSD:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` - echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} + echo "$UNAME_MACHINE_ARCH"-unknown-openbsd"$UNAME_RELEASE" + exit ;; + *:LibertyBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'` + echo "$UNAME_MACHINE_ARCH"-unknown-libertybsd"$UNAME_RELEASE" + exit ;; + *:MidnightBSD:*:*) + echo "$UNAME_MACHINE"-unknown-midnightbsd"$UNAME_RELEASE" exit ;; *:ekkoBSD:*:*) - echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} + echo "$UNAME_MACHINE"-unknown-ekkobsd"$UNAME_RELEASE" exit ;; *:SolidBSD:*:*) - echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} + echo "$UNAME_MACHINE"-unknown-solidbsd"$UNAME_RELEASE" exit ;; macppc:MirBSD:*:*) - echo powerpc-unknown-mirbsd${UNAME_RELEASE} + echo powerpc-unknown-mirbsd"$UNAME_RELEASE" exit ;; *:MirBSD:*:*) - echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} + echo "$UNAME_MACHINE"-unknown-mirbsd"$UNAME_RELEASE" + exit ;; + *:Sortix:*:*) + echo "$UNAME_MACHINE"-unknown-sortix exit ;; + *:Redox:*:*) + echo "$UNAME_MACHINE"-unknown-redox + exit ;; + mips:OSF1:*.*) + echo mips-dec-osf1 + exit ;; alpha:OSF1:*:*) case $UNAME_RELEASE in *4.0) @@ -251,63 +291,54 @@ ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` case "$ALPHA_CPU_TYPE" in "EV4 (21064)") - UNAME_MACHINE="alpha" ;; + UNAME_MACHINE=alpha ;; "EV4.5 (21064)") - UNAME_MACHINE="alpha" ;; + UNAME_MACHINE=alpha ;; "LCA4 (21066/21068)") - UNAME_MACHINE="alpha" ;; + UNAME_MACHINE=alpha ;; "EV5 (21164)") - UNAME_MACHINE="alphaev5" ;; + UNAME_MACHINE=alphaev5 ;; "EV5.6 (21164A)") - UNAME_MACHINE="alphaev56" ;; + UNAME_MACHINE=alphaev56 ;; "EV5.6 (21164PC)") - UNAME_MACHINE="alphapca56" ;; + UNAME_MACHINE=alphapca56 ;; "EV5.7 (21164PC)") - UNAME_MACHINE="alphapca57" ;; + UNAME_MACHINE=alphapca57 ;; "EV6 (21264)") - UNAME_MACHINE="alphaev6" ;; + UNAME_MACHINE=alphaev6 ;; "EV6.7 (21264A)") - UNAME_MACHINE="alphaev67" ;; + UNAME_MACHINE=alphaev67 ;; "EV6.8CB (21264C)") - UNAME_MACHINE="alphaev68" ;; + UNAME_MACHINE=alphaev68 ;; "EV6.8AL (21264B)") - UNAME_MACHINE="alphaev68" ;; + UNAME_MACHINE=alphaev68 ;; "EV6.8CX (21264D)") - UNAME_MACHINE="alphaev68" ;; + UNAME_MACHINE=alphaev68 ;; "EV6.9A (21264/EV69A)") - UNAME_MACHINE="alphaev69" ;; + UNAME_MACHINE=alphaev69 ;; "EV7 (21364)") - UNAME_MACHINE="alphaev7" ;; + UNAME_MACHINE=alphaev7 ;; "EV7.9 (21364A)") - UNAME_MACHINE="alphaev79" ;; + UNAME_MACHINE=alphaev79 ;; esac # A Pn.n version is a patched version. # A Vn.n version is a released version. # A Tn.n version is a released field test version. # A Xn.n version is an unreleased experimental baselevel. # 1.2 uses "1.2" for uname -r. - echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + echo "$UNAME_MACHINE"-dec-osf"`echo "$UNAME_RELEASE" | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`" # Reset EXIT trap before exiting to avoid spurious non-zero exit code. exitcode=$? trap '' 0 exit $exitcode ;; - Alpha\ *:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? 
- # Should we change UNAME_MACHINE based on the output of uname instead - # of the specific Alpha model? - echo alpha-pc-interix - exit ;; - 21064:Windows_NT:50:3) - echo alpha-dec-winnt3.5 - exit ;; Amiga*:UNIX_System_V:4.0:*) echo m68k-unknown-sysv4 exit ;; *:[Aa]miga[Oo][Ss]:*:*) - echo ${UNAME_MACHINE}-unknown-amigaos + echo "$UNAME_MACHINE"-unknown-amigaos exit ;; *:[Mm]orph[Oo][Ss]:*:*) - echo ${UNAME_MACHINE}-unknown-morphos + echo "$UNAME_MACHINE"-unknown-morphos exit ;; *:OS/390:*:*) echo i370-ibm-openedition @@ -319,7 +350,7 @@ echo powerpc-ibm-os400 exit ;; arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) - echo arm-acorn-riscix${UNAME_RELEASE} + echo arm-acorn-riscix"$UNAME_RELEASE" exit ;; arm*:riscos:*:*|arm*:RISCOS:*:*) echo arm-unknown-riscos @@ -346,38 +377,38 @@ sparc) echo sparc-icl-nx7; exit ;; esac ;; s390x:SunOS:*:*) - echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + echo "$UNAME_MACHINE"-ibm-solaris2"`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`" exit ;; sun4H:SunOS:5.*:*) - echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + echo sparc-hal-solaris2"`echo "$UNAME_RELEASE"|sed -e 's/[^.]*//'`" exit ;; sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) - echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + echo sparc-sun-solaris2"`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`" exit ;; i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) - echo i386-pc-auroraux${UNAME_RELEASE} + echo i386-pc-auroraux"$UNAME_RELEASE" exit ;; i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) - eval $set_cc_for_build - SUN_ARCH="i386" + eval "$set_cc_for_build" + SUN_ARCH=i386 # If there is a compiler, see if it is configured for 64-bit objects. # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. # This test works for both compilers. - if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then + if [ "$CC_FOR_BUILD" != no_compiler_found ]; then if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ grep IS_64BIT_ARCH >/dev/null then - SUN_ARCH="x86_64" + SUN_ARCH=x86_64 fi fi - echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + echo "$SUN_ARCH"-pc-solaris2"`echo "$UNAME_RELEASE"|sed -e 's/[^.]*//'`" exit ;; sun4*:SunOS:6*:*) # According to config.sub, this is the proper way to canonicalize # SunOS6. Hard to guess exactly what SunOS6 will be like, but # it's likely to be more like Solaris than SunOS4. - echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + echo sparc-sun-solaris3"`echo "$UNAME_RELEASE"|sed -e 's/[^.]*//'`" exit ;; sun4*:SunOS:*:*) case "`/usr/bin/arch -k`" in @@ -386,25 +417,25 @@ ;; esac # Japanese Language versions have a version number like `4.1.3-JL'. 
- echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` + echo sparc-sun-sunos"`echo "$UNAME_RELEASE"|sed -e 's/-/_/'`" exit ;; sun3*:SunOS:*:*) - echo m68k-sun-sunos${UNAME_RELEASE} + echo m68k-sun-sunos"$UNAME_RELEASE" exit ;; sun*:*:4.2BSD:*) UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` - test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 + test "x$UNAME_RELEASE" = x && UNAME_RELEASE=3 case "`/bin/arch`" in sun3) - echo m68k-sun-sunos${UNAME_RELEASE} + echo m68k-sun-sunos"$UNAME_RELEASE" ;; sun4) - echo sparc-sun-sunos${UNAME_RELEASE} + echo sparc-sun-sunos"$UNAME_RELEASE" ;; esac exit ;; aushp:SunOS:*:*) - echo sparc-auspex-sunos${UNAME_RELEASE} + echo sparc-auspex-sunos"$UNAME_RELEASE" exit ;; # The situation for MiNT is a little confusing. The machine name # can be virtually everything (everything which is not @@ -415,44 +446,44 @@ # MiNT. But MiNT is downward compatible to TOS, so this should # be no problem. atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} + echo m68k-atari-mint"$UNAME_RELEASE" exit ;; atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} + echo m68k-atari-mint"$UNAME_RELEASE" exit ;; *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} + echo m68k-atari-mint"$UNAME_RELEASE" exit ;; milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) - echo m68k-milan-mint${UNAME_RELEASE} + echo m68k-milan-mint"$UNAME_RELEASE" exit ;; hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) - echo m68k-hades-mint${UNAME_RELEASE} + echo m68k-hades-mint"$UNAME_RELEASE" exit ;; *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) - echo m68k-unknown-mint${UNAME_RELEASE} + echo m68k-unknown-mint"$UNAME_RELEASE" exit ;; m68k:machten:*:*) - echo m68k-apple-machten${UNAME_RELEASE} + echo m68k-apple-machten"$UNAME_RELEASE" exit ;; powerpc:machten:*:*) - echo powerpc-apple-machten${UNAME_RELEASE} + echo powerpc-apple-machten"$UNAME_RELEASE" exit ;; RISC*:Mach:*:*) echo mips-dec-mach_bsd4.3 exit ;; RISC*:ULTRIX:*:*) - echo mips-dec-ultrix${UNAME_RELEASE} + echo mips-dec-ultrix"$UNAME_RELEASE" exit ;; VAX*:ULTRIX*:*:*) - echo vax-dec-ultrix${UNAME_RELEASE} + echo vax-dec-ultrix"$UNAME_RELEASE" exit ;; 2020:CLIX:*:* | 2430:CLIX:*:*) - echo clipper-intergraph-clix${UNAME_RELEASE} + echo clipper-intergraph-clix"$UNAME_RELEASE" exit ;; mips:*:*:UMIPS | mips:*:*:RISCos) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c + eval "$set_cc_for_build" + sed 's/^ //' << EOF > "$dummy.c" #ifdef __cplusplus #include /* for printf() prototype */ int main (int argc, char *argv[]) { @@ -461,23 +492,23 @@ #endif #if defined (host_mips) && defined (MIPSEB) #if defined (SYSTYPE_SYSV) - printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); + printf ("mips-mips-riscos%ssysv\\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_SVR4) - printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); + printf ("mips-mips-riscos%ssvr4\\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) - printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); + printf ("mips-mips-riscos%sbsd\\n", argv[1]); exit (0); #endif #endif exit (-1); } EOF - $CC_FOR_BUILD -o $dummy $dummy.c && - dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && - SYSTEM_NAME=`$dummy $dummyarg` && + $CC_FOR_BUILD -o "$dummy" "$dummy.c" && + dummyarg=`echo "$UNAME_RELEASE" | sed -n 's/\([0-9]*\).*/\1/p'` && + SYSTEM_NAME=`"$dummy" 
"$dummyarg"` && { echo "$SYSTEM_NAME"; exit; } - echo mips-mips-riscos${UNAME_RELEASE} + echo mips-mips-riscos"$UNAME_RELEASE" exit ;; Motorola:PowerMAX_OS:*:*) echo powerpc-motorola-powermax @@ -503,17 +534,17 @@ AViiON:dgux:*:*) # DG/UX returns AViiON for all architectures UNAME_PROCESSOR=`/usr/bin/uname -p` - if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] + if [ "$UNAME_PROCESSOR" = mc88100 ] || [ "$UNAME_PROCESSOR" = mc88110 ] then - if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ - [ ${TARGET_BINARY_INTERFACE}x = x ] + if [ "$TARGET_BINARY_INTERFACE"x = m88kdguxelfx ] || \ + [ "$TARGET_BINARY_INTERFACE"x = x ] then - echo m88k-dg-dgux${UNAME_RELEASE} + echo m88k-dg-dgux"$UNAME_RELEASE" else - echo m88k-dg-dguxbcs${UNAME_RELEASE} + echo m88k-dg-dguxbcs"$UNAME_RELEASE" fi else - echo i586-dg-dgux${UNAME_RELEASE} + echo i586-dg-dgux"$UNAME_RELEASE" fi exit ;; M88*:DolphinOS:*:*) # DolphinOS (SVR3) @@ -530,7 +561,7 @@ echo m68k-tektronix-bsd exit ;; *:IRIX*:*:*) - echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` + echo mips-sgi-irix"`echo "$UNAME_RELEASE"|sed -e 's/-/_/g'`" exit ;; ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id @@ -542,14 +573,14 @@ if [ -x /usr/bin/oslevel ] ; then IBM_REV=`/usr/bin/oslevel` else - IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + IBM_REV="$UNAME_VERSION.$UNAME_RELEASE" fi - echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} + echo "$UNAME_MACHINE"-ibm-aix"$IBM_REV" exit ;; *:AIX:2:3) if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c + eval "$set_cc_for_build" + sed 's/^ //' << EOF > "$dummy.c" #include main() @@ -560,7 +591,7 @@ exit(0); } EOF - if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` + if $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"` then echo "$SYSTEM_NAME" else @@ -574,7 +605,7 @@ exit ;; *:AIX:*:[4567]) IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` - if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then + if /usr/sbin/lsattr -El "$IBM_CPU_ID" | grep ' POWER' >/dev/null 2>&1; then IBM_ARCH=rs6000 else IBM_ARCH=powerpc @@ -583,18 +614,18 @@ IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc | awk -F: '{ print $3 }' | sed s/[0-9]*$/0/` else - IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + IBM_REV="$UNAME_VERSION.$UNAME_RELEASE" fi - echo ${IBM_ARCH}-ibm-aix${IBM_REV} + echo "$IBM_ARCH"-ibm-aix"$IBM_REV" exit ;; *:AIX:*:*) echo rs6000-ibm-aix exit ;; - ibmrt:4.4BSD:*|romp-ibm:BSD:*) + ibmrt:4.4BSD:*|romp-ibm:4.4BSD:*) echo romp-ibm-bsd4.4 exit ;; ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and - echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to + echo romp-ibm-bsd"$UNAME_RELEASE" # 4.3 with uname added to exit ;; # report: romp-ibm BSD 4.3 *:BOSX:*:*) echo rs6000-bull-bosx @@ -609,28 +640,28 @@ echo m68k-hp-bsd4.4 exit ;; 9000/[34678]??:HP-UX:*:*) - HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` - case "${UNAME_MACHINE}" in - 9000/31? ) HP_ARCH=m68000 ;; - 9000/[34]?? ) HP_ARCH=m68k ;; + HPUX_REV=`echo "$UNAME_RELEASE"|sed -e 's/[^.]*.[0B]*//'` + case "$UNAME_MACHINE" in + 9000/31?) HP_ARCH=m68000 ;; + 9000/[34]??) 
HP_ARCH=m68k ;; 9000/[678][0-9][0-9]) if [ -x /usr/bin/getconf ]; then sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` - case "${sc_cpu_version}" in - 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 - 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 + case "$sc_cpu_version" in + 523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0 + 528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1 532) # CPU_PA_RISC2_0 - case "${sc_kernel_bits}" in - 32) HP_ARCH="hppa2.0n" ;; - 64) HP_ARCH="hppa2.0w" ;; - '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 + case "$sc_kernel_bits" in + 32) HP_ARCH=hppa2.0n ;; + 64) HP_ARCH=hppa2.0w ;; + '') HP_ARCH=hppa2.0 ;; # HP-UX 10.20 esac ;; esac fi - if [ "${HP_ARCH}" = "" ]; then - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c + if [ "$HP_ARCH" = "" ]; then + eval "$set_cc_for_build" + sed 's/^ //' << EOF > "$dummy.c" #define _HPUX_SOURCE #include @@ -663,13 +694,13 @@ exit (0); } EOF - (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` + (CCOPTS="" $CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null) && HP_ARCH=`"$dummy"` test -z "$HP_ARCH" && HP_ARCH=hppa fi ;; esac - if [ ${HP_ARCH} = "hppa2.0w" ] + if [ "$HP_ARCH" = hppa2.0w ] then - eval $set_cc_for_build + eval "$set_cc_for_build" # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler @@ -680,23 +711,23 @@ # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess # => hppa64-hp-hpux11.23 - if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | + if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | grep -q __LP64__ then - HP_ARCH="hppa2.0w" + HP_ARCH=hppa2.0w else - HP_ARCH="hppa64" + HP_ARCH=hppa64 fi fi - echo ${HP_ARCH}-hp-hpux${HPUX_REV} + echo "$HP_ARCH"-hp-hpux"$HPUX_REV" exit ;; ia64:HP-UX:*:*) - HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` - echo ia64-hp-hpux${HPUX_REV} + HPUX_REV=`echo "$UNAME_RELEASE"|sed -e 's/[^.]*.[0B]*//'` + echo ia64-hp-hpux"$HPUX_REV" exit ;; 3050*:HI-UX:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c + eval "$set_cc_for_build" + sed 's/^ //' << EOF > "$dummy.c" #include int main () @@ -721,11 +752,11 @@ exit (0); } EOF - $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && + $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"` && { echo "$SYSTEM_NAME"; exit; } echo unknown-hitachi-hiuxwe2 exit ;; - 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:*) echo hppa1.1-hp-bsd exit ;; 9000/8??:4.3bsd:*:*) @@ -734,7 +765,7 @@ *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) echo hppa1.0-hp-mpeix exit ;; - hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:*) echo hppa1.1-hp-osf exit ;; hp8??:OSF1:*:*) @@ -742,9 +773,9 @@ exit ;; i*86:OSF1:*:*) if [ -x /usr/sbin/sysversion ] ; then - echo ${UNAME_MACHINE}-unknown-osf1mk + echo "$UNAME_MACHINE"-unknown-osf1mk else - echo ${UNAME_MACHINE}-unknown-osf1 + echo "$UNAME_MACHINE"-unknown-osf1 fi exit ;; parisc*:Lites*:*:*) @@ -769,127 +800,109 @@ echo c4-convex-bsd exit ;; CRAY*Y-MP:*:*:*) - echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + echo ymp-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*[A-Z]90:*:*:*) - echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ + echo "$UNAME_MACHINE"-cray-unicos"$UNAME_RELEASE" \ | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ -e 's/\.[^.]*$/.X/' exit ;; CRAY*TS:*:*:*) - echo 
t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + echo t90-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*T3E:*:*:*) - echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + echo alphaev5-cray-unicosmk"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*SV1:*:*:*) - echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + echo sv1-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' exit ;; *:UNICOS/mp:*:*) - echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + echo craynv-cray-unicosmp"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' exit ;; F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) - FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz` + FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` + FUJITSU_REL=`echo "$UNAME_RELEASE" | sed -e 's/ /_/'` echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; 5000:UNIX_System_V:4.*:*) - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` + FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` + FUJITSU_REL=`echo "$UNAME_RELEASE" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'` echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) - echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} + echo "$UNAME_MACHINE"-pc-bsdi"$UNAME_RELEASE" exit ;; sparc*:BSD/OS:*:*) - echo sparc-unknown-bsdi${UNAME_RELEASE} + echo sparc-unknown-bsdi"$UNAME_RELEASE" exit ;; *:BSD/OS:*:*) - echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} + echo "$UNAME_MACHINE"-unknown-bsdi"$UNAME_RELEASE" exit ;; *:FreeBSD:*:*) UNAME_PROCESSOR=`/usr/bin/uname -p` - case ${UNAME_PROCESSOR} in + case "$UNAME_PROCESSOR" in amd64) - echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - *) - echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + UNAME_PROCESSOR=x86_64 ;; + i386) + UNAME_PROCESSOR=i586 ;; esac + echo "$UNAME_PROCESSOR"-unknown-freebsd"`echo "$UNAME_RELEASE"|sed -e 's/[-(].*//'`" exit ;; i*:CYGWIN*:*) - echo ${UNAME_MACHINE}-pc-cygwin + echo "$UNAME_MACHINE"-pc-cygwin exit ;; *:MINGW64*:*) - echo ${UNAME_MACHINE}-pc-mingw64 + echo "$UNAME_MACHINE"-pc-mingw64 exit ;; *:MINGW*:*) - echo ${UNAME_MACHINE}-pc-mingw32 + echo "$UNAME_MACHINE"-pc-mingw32 exit ;; *:MSYS*:*) - echo ${UNAME_MACHINE}-pc-msys - exit ;; - i*:windows32*:*) - # uname -m includes "-pc" on this system. 
- echo ${UNAME_MACHINE}-mingw32 + echo "$UNAME_MACHINE"-pc-msys exit ;; i*:PW*:*) - echo ${UNAME_MACHINE}-pc-pw32 + echo "$UNAME_MACHINE"-pc-pw32 exit ;; *:Interix*:*) - case ${UNAME_MACHINE} in + case "$UNAME_MACHINE" in x86) - echo i586-pc-interix${UNAME_RELEASE} + echo i586-pc-interix"$UNAME_RELEASE" exit ;; authenticamd | genuineintel | EM64T) - echo x86_64-unknown-interix${UNAME_RELEASE} + echo x86_64-unknown-interix"$UNAME_RELEASE" exit ;; IA64) - echo ia64-unknown-interix${UNAME_RELEASE} + echo ia64-unknown-interix"$UNAME_RELEASE" exit ;; esac ;; - [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) - echo i${UNAME_MACHINE}-pc-mks - exit ;; - 8664:Windows_NT:*) - echo x86_64-pc-mks - exit ;; - i*:Windows_NT*:* | Pentium*:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? - # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we - # UNAME_MACHINE based on the output of uname instead of i386? - echo i586-pc-interix - exit ;; i*:UWIN*:*) - echo ${UNAME_MACHINE}-pc-uwin + echo "$UNAME_MACHINE"-pc-uwin exit ;; amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) echo x86_64-unknown-cygwin exit ;; - p*:CYGWIN*:*) - echo powerpcle-unknown-cygwin - exit ;; prep*:SunOS:5.*:*) - echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + echo powerpcle-unknown-solaris2"`echo "$UNAME_RELEASE"|sed -e 's/[^.]*//'`" exit ;; *:GNU:*:*) # the GNU system - echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + echo "`echo "$UNAME_MACHINE"|sed -e 's,[-/].*$,,'`-unknown-$LIBC`echo "$UNAME_RELEASE"|sed -e 's,/.*$,,'`" exit ;; *:GNU/*:*:*) # other systems with GNU libc and userland - echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC} + echo "$UNAME_MACHINE-unknown-`echo "$UNAME_SYSTEM" | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"``echo "$UNAME_RELEASE"|sed -e 's/[-(].*//'`-$LIBC" exit ;; i*86:Minix:*:*) - echo ${UNAME_MACHINE}-pc-minix + echo "$UNAME_MACHINE"-pc-minix exit ;; aarch64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; aarch64_be:Linux:*:*) UNAME_MACHINE=aarch64_be - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; alpha:Linux:*:*) case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in @@ -902,58 +915,64 @@ EV68*) UNAME_MACHINE=alphaev68 ;; esac objdump --private-headers /bin/sh | grep -q ld.so.1 - if test "$?" = 0 ; then LIBC="gnulibc1" ; fi - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + if test "$?" 
= 0 ; then LIBC=gnulibc1 ; fi + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; arc:Linux:*:* | arceb:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; arm*:Linux:*:*) - eval $set_cc_for_build + eval "$set_cc_for_build" if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_EABI__ then - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" else if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_PCS_VFP then - echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"eabi else - echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"eabihf fi fi exit ;; avr32*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; cris:Linux:*:*) - echo ${UNAME_MACHINE}-axis-linux-${LIBC} + echo "$UNAME_MACHINE"-axis-linux-"$LIBC" exit ;; crisv32:Linux:*:*) - echo ${UNAME_MACHINE}-axis-linux-${LIBC} + echo "$UNAME_MACHINE"-axis-linux-"$LIBC" + exit ;; + e2k:Linux:*:*) + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; frv:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; hexagon:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; i*86:Linux:*:*) - echo ${UNAME_MACHINE}-pc-linux-${LIBC} + echo "$UNAME_MACHINE"-pc-linux-"$LIBC" exit ;; ia64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" + exit ;; + k1om:Linux:*:*) + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; m32r*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; m68*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; mips:Linux:*:* | mips64:Linux:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c + eval "$set_cc_for_build" + sed 's/^ //' << EOF > "$dummy.c" #undef CPU #undef ${UNAME_MACHINE} #undef ${UNAME_MACHINE}el @@ -967,64 +986,74 @@ #endif #endif EOF - eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; } + eval "`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^CPU'`" + test "x$CPU" != x && { echo "$CPU-unknown-linux-$LIBC"; exit; } ;; + mips64el:Linux:*:*) + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" + exit ;; openrisc*:Linux:*:*) - echo or1k-unknown-linux-${LIBC} + echo or1k-unknown-linux-"$LIBC" exit ;; or32:Linux:*:* | or1k*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; padre:Linux:*:*) - echo sparc-unknown-linux-${LIBC} + echo sparc-unknown-linux-"$LIBC" exit ;; parisc64:Linux:*:* | hppa64:Linux:*:*) - echo hppa64-unknown-linux-${LIBC} + echo hppa64-unknown-linux-"$LIBC" exit ;; parisc:Linux:*:* | hppa:Linux:*:*) # Look for CPU level case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in - PA7*) echo hppa1.1-unknown-linux-${LIBC} ;; - PA8*) echo hppa2.0-unknown-linux-${LIBC} ;; - *) echo hppa-unknown-linux-${LIBC} ;; + PA7*) echo hppa1.1-unknown-linux-"$LIBC" ;; + PA8*) echo hppa2.0-unknown-linux-"$LIBC" ;; + *) echo hppa-unknown-linux-"$LIBC" ;; esac exit ;; ppc64:Linux:*:*) - echo powerpc64-unknown-linux-${LIBC} + echo powerpc64-unknown-linux-"$LIBC" exit ;; ppc:Linux:*:*) - echo 
powerpc-unknown-linux-${LIBC} + echo powerpc-unknown-linux-"$LIBC" exit ;; ppc64le:Linux:*:*) - echo powerpc64le-unknown-linux-${LIBC} + echo powerpc64le-unknown-linux-"$LIBC" exit ;; ppcle:Linux:*:*) - echo powerpcle-unknown-linux-${LIBC} + echo powerpcle-unknown-linux-"$LIBC" + exit ;; + riscv32:Linux:*:* | riscv64:Linux:*:*) + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; s390:Linux:*:* | s390x:Linux:*:*) - echo ${UNAME_MACHINE}-ibm-linux-${LIBC} + echo "$UNAME_MACHINE"-ibm-linux-"$LIBC" exit ;; sh64*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; sh*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; sparc:Linux:*:* | sparc64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; tile*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; vax:Linux:*:*) - echo ${UNAME_MACHINE}-dec-linux-${LIBC} + echo "$UNAME_MACHINE"-dec-linux-"$LIBC" exit ;; x86_64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + if objdump -f /bin/sh | grep -q elf32-x86-64; then + echo "$UNAME_MACHINE"-pc-linux-"$LIBC"x32 + else + echo "$UNAME_MACHINE"-pc-linux-"$LIBC" + fi exit ;; xtensa*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; i*86:DYNIX/ptx:4*:*) # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. @@ -1038,34 +1067,34 @@ # I am not positive that other SVR4 systems won't match this, # I just have to hope. -- rms. # Use sysv4.2uw... so that sysv4* matches it. - echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} + echo "$UNAME_MACHINE"-pc-sysv4.2uw"$UNAME_VERSION" exit ;; i*86:OS/2:*:*) # If we were able to find `uname', then EMX Unix compatibility # is probably installed. 
- echo ${UNAME_MACHINE}-pc-os2-emx + echo "$UNAME_MACHINE"-pc-os2-emx exit ;; i*86:XTS-300:*:STOP) - echo ${UNAME_MACHINE}-unknown-stop + echo "$UNAME_MACHINE"-unknown-stop exit ;; i*86:atheos:*:*) - echo ${UNAME_MACHINE}-unknown-atheos + echo "$UNAME_MACHINE"-unknown-atheos exit ;; i*86:syllable:*:*) - echo ${UNAME_MACHINE}-pc-syllable + echo "$UNAME_MACHINE"-pc-syllable exit ;; i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*) - echo i386-unknown-lynxos${UNAME_RELEASE} + echo i386-unknown-lynxos"$UNAME_RELEASE" exit ;; i*86:*DOS:*:*) - echo ${UNAME_MACHINE}-pc-msdosdjgpp + echo "$UNAME_MACHINE"-pc-msdosdjgpp exit ;; - i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) - UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` + i*86:*:4.*:*) + UNAME_REL=`echo "$UNAME_RELEASE" | sed 's/\/MP$//'` if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then - echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} + echo "$UNAME_MACHINE"-univel-sysv"$UNAME_REL" else - echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} + echo "$UNAME_MACHINE"-pc-sysv"$UNAME_REL" fi exit ;; i*86:*:5:[678]*) @@ -1075,12 +1104,12 @@ *Pentium) UNAME_MACHINE=i586 ;; *Pent*|*Celeron) UNAME_MACHINE=i686 ;; esac - echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} + echo "$UNAME_MACHINE-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}{$UNAME_VERSION}" exit ;; i*86:*:3.2:*) if test -f /usr/options/cb.name; then UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 @@ -1090,9 +1119,9 @@ && UNAME_MACHINE=i686 (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ && UNAME_MACHINE=i686 - echo ${UNAME_MACHINE}-pc-sco$UNAME_REL + echo "$UNAME_MACHINE"-pc-sco"$UNAME_REL" else - echo ${UNAME_MACHINE}-pc-sysv32 + echo "$UNAME_MACHINE"-pc-sysv32 fi exit ;; pc:*:*:*) @@ -1100,7 +1129,7 @@ # uname -m prints for DJGPP always 'pc', but it prints nothing about # the processor, so we play safe by assuming i586. # Note: whatever this is, it MUST be the same as what config.sub - # prints for the "djgpp" host, or else GDB configury will decide that + # prints for the "djgpp" host, or else GDB configure will decide that # this is a cross-build. echo i586-pc-msdosdjgpp exit ;; @@ -1112,9 +1141,9 @@ exit ;; i860:*:4.*:*) # i860-SVR4 if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then - echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 + echo i860-stardent-sysv"$UNAME_RELEASE" # Stardent Vistra i860-SVR4 else # Add other i860-SVR4 vendors below as they are discovered. 
- echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 + echo i860-unknown-sysv"$UNAME_RELEASE" # Unknown i860-SVR4 fi exit ;; mini*:CTIX:SYS*5:*) @@ -1134,9 +1163,9 @@ test -r /etc/.relid \ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + && { echo i486-ncr-sysv4.3"$OS_REL"; exit; } /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ - && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;; 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ && { echo i486-ncr-sysv4; exit; } ;; @@ -1145,28 +1174,28 @@ test -r /etc/.relid \ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + && { echo i486-ncr-sysv4.3"$OS_REL"; exit; } /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ - && { echo i586-ncr-sysv4.3${OS_REL}; exit; } + && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \ - && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;; m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) - echo m68k-unknown-lynxos${UNAME_RELEASE} + echo m68k-unknown-lynxos"$UNAME_RELEASE" exit ;; mc68030:UNIX_System_V:4.*:*) echo m68k-atari-sysv4 exit ;; TSUNAMI:LynxOS:2.*:*) - echo sparc-unknown-lynxos${UNAME_RELEASE} + echo sparc-unknown-lynxos"$UNAME_RELEASE" exit ;; rs6000:LynxOS:2.*:*) - echo rs6000-unknown-lynxos${UNAME_RELEASE} + echo rs6000-unknown-lynxos"$UNAME_RELEASE" exit ;; PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) - echo powerpc-unknown-lynxos${UNAME_RELEASE} + echo powerpc-unknown-lynxos"$UNAME_RELEASE" exit ;; SM[BE]S:UNIX_SV:*:*) - echo mips-dde-sysv${UNAME_RELEASE} + echo mips-dde-sysv"$UNAME_RELEASE" exit ;; RM*:ReliantUNIX-*:*:*) echo mips-sni-sysv4 @@ -1177,7 +1206,7 @@ *:SINIX-*:*:*) if uname -p 2>/dev/null >/dev/null ; then UNAME_MACHINE=`(uname -p) 2>/dev/null` - echo ${UNAME_MACHINE}-sni-sysv4 + echo "$UNAME_MACHINE"-sni-sysv4 else echo ns32k-sni-sysv fi @@ -1197,23 +1226,23 @@ exit ;; i*86:VOS:*:*) # From Paul.Green@stratus.com. - echo ${UNAME_MACHINE}-stratus-vos + echo "$UNAME_MACHINE"-stratus-vos exit ;; *:VOS:*:*) # From Paul.Green@stratus.com. echo hppa1.1-stratus-vos exit ;; mc68*:A/UX:*:*) - echo m68k-apple-aux${UNAME_RELEASE} + echo m68k-apple-aux"$UNAME_RELEASE" exit ;; news*:NEWS-OS:6*:*) echo mips-sony-newsos6 exit ;; R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) if [ -d /usr/nec ]; then - echo mips-nec-sysv${UNAME_RELEASE} + echo mips-nec-sysv"$UNAME_RELEASE" else - echo mips-unknown-sysv${UNAME_RELEASE} + echo mips-unknown-sysv"$UNAME_RELEASE" fi exit ;; BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. 
@@ -1232,46 +1261,56 @@ echo x86_64-unknown-haiku exit ;; SX-4:SUPER-UX:*:*) - echo sx4-nec-superux${UNAME_RELEASE} + echo sx4-nec-superux"$UNAME_RELEASE" exit ;; SX-5:SUPER-UX:*:*) - echo sx5-nec-superux${UNAME_RELEASE} + echo sx5-nec-superux"$UNAME_RELEASE" exit ;; SX-6:SUPER-UX:*:*) - echo sx6-nec-superux${UNAME_RELEASE} + echo sx6-nec-superux"$UNAME_RELEASE" exit ;; SX-7:SUPER-UX:*:*) - echo sx7-nec-superux${UNAME_RELEASE} + echo sx7-nec-superux"$UNAME_RELEASE" exit ;; SX-8:SUPER-UX:*:*) - echo sx8-nec-superux${UNAME_RELEASE} + echo sx8-nec-superux"$UNAME_RELEASE" exit ;; SX-8R:SUPER-UX:*:*) - echo sx8r-nec-superux${UNAME_RELEASE} + echo sx8r-nec-superux"$UNAME_RELEASE" + exit ;; + SX-ACE:SUPER-UX:*:*) + echo sxace-nec-superux"$UNAME_RELEASE" exit ;; Power*:Rhapsody:*:*) - echo powerpc-apple-rhapsody${UNAME_RELEASE} + echo powerpc-apple-rhapsody"$UNAME_RELEASE" exit ;; *:Rhapsody:*:*) - echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} + echo "$UNAME_MACHINE"-apple-rhapsody"$UNAME_RELEASE" exit ;; *:Darwin:*:*) UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown - eval $set_cc_for_build + eval "$set_cc_for_build" if test "$UNAME_PROCESSOR" = unknown ; then UNAME_PROCESSOR=powerpc fi - if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then - if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then + if test "`echo "$UNAME_RELEASE" | sed -e 's/\..*//'`" -le 10 ; then + if [ "$CC_FOR_BUILD" != no_compiler_found ]; then if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_64BIT_ARCH >/dev/null + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null then case $UNAME_PROCESSOR in i386) UNAME_PROCESSOR=x86_64 ;; powerpc) UNAME_PROCESSOR=powerpc64 ;; esac fi + # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc + if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_PPC >/dev/null + then + UNAME_PROCESSOR=powerpc + fi fi elif test "$UNAME_PROCESSOR" = i386 ; then # Avoid executing cc on OS X 10.9, as it ships with a stub @@ -1282,27 +1321,33 @@ # that Apple uses in portable devices. 
UNAME_PROCESSOR=x86_64 fi - echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} + echo "$UNAME_PROCESSOR"-apple-darwin"$UNAME_RELEASE" exit ;; *:procnto*:*:* | *:QNX:[0123456789]*:*) UNAME_PROCESSOR=`uname -p` - if test "$UNAME_PROCESSOR" = "x86"; then + if test "$UNAME_PROCESSOR" = x86; then UNAME_PROCESSOR=i386 UNAME_MACHINE=pc fi - echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} + echo "$UNAME_PROCESSOR"-"$UNAME_MACHINE"-nto-qnx"$UNAME_RELEASE" exit ;; *:QNX:*:4*) echo i386-pc-qnx exit ;; - NEO-?:NONSTOP_KERNEL:*:*) - echo neo-tandem-nsk${UNAME_RELEASE} + NEO-*:NONSTOP_KERNEL:*:*) + echo neo-tandem-nsk"$UNAME_RELEASE" exit ;; NSE-*:NONSTOP_KERNEL:*:*) - echo nse-tandem-nsk${UNAME_RELEASE} + echo nse-tandem-nsk"$UNAME_RELEASE" exit ;; - NSR-?:NONSTOP_KERNEL:*:*) - echo nsr-tandem-nsk${UNAME_RELEASE} + NSR-*:NONSTOP_KERNEL:*:*) + echo nsr-tandem-nsk"$UNAME_RELEASE" + exit ;; + NSV-*:NONSTOP_KERNEL:*:*) + echo nsv-tandem-nsk"$UNAME_RELEASE" + exit ;; + NSX-*:NONSTOP_KERNEL:*:*) + echo nsx-tandem-nsk"$UNAME_RELEASE" exit ;; *:NonStop-UX:*:*) echo mips-compaq-nonstopux @@ -1311,18 +1356,18 @@ echo bs2000-siemens-sysv exit ;; DS/*:UNIX_System_V:*:*) - echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} + echo "$UNAME_MACHINE"-"$UNAME_SYSTEM"-"$UNAME_RELEASE" exit ;; *:Plan9:*:*) # "uname -m" is not consistent, so use $cputype instead. 386 # is converted to i386 for consistency with other x86 # operating systems. - if test "$cputype" = "386"; then + if test "$cputype" = 386; then UNAME_MACHINE=i386 else UNAME_MACHINE="$cputype" fi - echo ${UNAME_MACHINE}-unknown-plan9 + echo "$UNAME_MACHINE"-unknown-plan9 exit ;; *:TOPS-10:*:*) echo pdp10-unknown-tops10 @@ -1343,14 +1388,14 @@ echo pdp10-unknown-its exit ;; SEI:*:*:SEIUX) - echo mips-sei-seiux${UNAME_RELEASE} + echo mips-sei-seiux"$UNAME_RELEASE" exit ;; *:DragonFly:*:*) - echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` + echo "$UNAME_MACHINE"-unknown-dragonfly"`echo "$UNAME_RELEASE"|sed -e 's/[-(].*//'`" exit ;; *:*VMS:*:*) UNAME_MACHINE=`(uname -p) 2>/dev/null` - case "${UNAME_MACHINE}" in + case "$UNAME_MACHINE" in A*) echo alpha-dec-vms ; exit ;; I*) echo ia64-dec-vms ; exit ;; V*) echo vax-dec-vms ; exit ;; @@ -1359,34 +1404,48 @@ echo i386-pc-xenix exit ;; i*86:skyos:*:*) - echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' + echo "$UNAME_MACHINE"-pc-skyos"`echo "$UNAME_RELEASE" | sed -e 's/ .*$//'`" exit ;; i*86:rdos:*:*) - echo ${UNAME_MACHINE}-pc-rdos + echo "$UNAME_MACHINE"-pc-rdos exit ;; i*86:AROS:*:*) - echo ${UNAME_MACHINE}-pc-aros + echo "$UNAME_MACHINE"-pc-aros exit ;; x86_64:VMkernel:*:*) - echo ${UNAME_MACHINE}-unknown-esx + echo "$UNAME_MACHINE"-unknown-esx + exit ;; + amd64:Isilon\ OneFS:*:*) + echo x86_64-unknown-onefs exit ;; esac +echo "$0: unable to guess system type" >&2 + +case "$UNAME_MACHINE:$UNAME_SYSTEM" in + mips:Linux | mips64:Linux) + # If we got here on MIPS GNU/Linux, output extra information. + cat >&2 <&2 < in order to provide the needed -information to handle your system. +If $0 has already been updated, send the following data and any +information you think might be pertinent to config-patches@gnu.org to +provide the necessary information to handle your system. 
config.guess timestamp = $timestamp @@ -1405,16 +1464,16 @@ /usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` /usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` -UNAME_MACHINE = ${UNAME_MACHINE} -UNAME_RELEASE = ${UNAME_RELEASE} -UNAME_SYSTEM = ${UNAME_SYSTEM} -UNAME_VERSION = ${UNAME_VERSION} +UNAME_MACHINE = "$UNAME_MACHINE" +UNAME_RELEASE = "$UNAME_RELEASE" +UNAME_SYSTEM = "$UNAME_SYSTEM" +UNAME_VERSION = "$UNAME_VERSION" EOF exit 1 # Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) +# eval: (add-hook 'write-file-functions 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" diff -Nru ucto-0.9.6/config.h.in ucto-0.14/config.h.in --- ucto-0.9.6/config.h.in 2017-01-23 12:53:13.000000000 +0000 +++ ucto-0.14/config.h.in 2018-11-15 15:08:35.000000000 +0000 @@ -6,15 +6,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_HISTORY_H -/* we want to use ICU */ -#undef HAVE_ICU - /* Define to 1 if you have the header file. */ #undef HAVE_INTTYPES_H -/* Define to 1 if you have the header file. */ -#undef HAVE_LIBEXTTEXTCAT_TEXTCAT_H - /* Define if you have a readline compatible library */ #undef HAVE_LIBREADLINE @@ -24,6 +18,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H +/* textcat needs C linkage */ +#undef HAVE_OLD_TEXTCAT + /* Define to 1 if you have the header file. */ #undef HAVE_READLINE_H @@ -57,11 +54,8 @@ /* Define to 1 if you have the header file. */ #undef HAVE_SYS_TYPES_H -/* Define to 1 if you have the header file. */ -#undef HAVE_TEXTCAT_H - -/* textcat_lib */ -#undef HAVE_TEXTCAT_LIB +/* textcat */ +#undef HAVE_TEXTCAT /* Define to 1 if you have the header file. */ #undef HAVE_UNISTD_H @@ -96,6 +90,9 @@ /* Define to 1 if you have the ANSI C header files. */ #undef STDC_HEADERS +/* location of uctodata files */ +#undef UCTODATA_DIR + /* Version number of package */ #undef VERSION diff -Nru ucto-0.9.6/config.sub ucto-0.14/config.sub --- ucto-0.9.6/config.sub 2016-09-09 05:06:00.000000000 +0000 +++ ucto-0.14/config.sub 2018-02-24 16:00:57.000000000 +0000 @@ -1,8 +1,8 @@ #! /bin/sh # Configuration validation subroutine script. -# Copyright 1992-2014 Free Software Foundation, Inc. +# Copyright 1992-2018 Free Software Foundation, Inc. -timestamp='2014-12-03' +timestamp='2018-02-22' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -15,7 +15,7 @@ # General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, see . +# along with this program; if not, see . # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a @@ -33,7 +33,7 @@ # Otherwise, we print the canonical config type on stdout and succeed. # You can get the latest version of this script from: -# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD +# https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub # This file is supposed to be the same for all GNU packages # and recognize all the CPU types, system types and aliases @@ -53,12 +53,11 @@ me=`echo "$0" | sed -e 's,.*/,,'` usage="\ -Usage: $0 [OPTION] CPU-MFR-OPSYS - $0 [OPTION] ALIAS +Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS Canonicalize a configuration name. 
-Operation modes: +Options: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit @@ -68,7 +67,7 @@ version="\ GNU config.sub ($timestamp) -Copyright 1992-2014 Free Software Foundation, Inc. +Copyright 1992-2018 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -95,7 +94,7 @@ *local*) # First pass through any local machine types. - echo $1 + echo "$1" exit ;; * ) @@ -113,24 +112,24 @@ # Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). # Here we must recognize all the valid KERNEL-OS combinations. -maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` +maybe_os=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` case $maybe_os in nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \ linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \ - knetbsd*-gnu* | netbsd*-gnu* | \ - kopensolaris*-gnu* | \ + knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \ + kopensolaris*-gnu* | cloudabi*-eabi* | \ storm-chaos* | os2-emx* | rtmk-nova*) os=-$maybe_os - basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` + basic_machine=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` ;; android-linux) os=-linux-android - basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown + basic_machine=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown ;; *) - basic_machine=`echo $1 | sed 's/-[^-]*$//'` - if [ $basic_machine != $1 ] - then os=`echo $1 | sed 's/.*-/-/'` + basic_machine=`echo "$1" | sed 's/-[^-]*$//'` + if [ "$basic_machine" != "$1" ] + then os=`echo "$1" | sed 's/.*-/-/'` else os=; fi ;; esac @@ -179,44 +178,44 @@ ;; -sco6) os=-sco5v6 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -sco5) os=-sco3.2v5 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -sco4) os=-sco3.2v4 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -sco3.2.[4-9]*) os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -sco3.2v[4-9]*) # Don't forget version if it is 3.2v4 or newer. - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -sco5v6*) # Don't forget version if it is 3.2v4 or newer. 
- basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -sco*) os=-sco3.2v2 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -udk*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -isc) os=-isc2.2 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -clix*) basic_machine=clipper-intergraph ;; -isc*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -lynx*178) os=-lynxos178 @@ -228,10 +227,7 @@ os=-lynxos ;; -ptx*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` - ;; - -windowsnt*) - os=`echo $os | sed -e 's/windowsnt/winnt/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-sequent/'` ;; -psos*) os=-psos @@ -255,15 +251,16 @@ | arc | arceb \ | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \ | avr | avr32 \ + | ba \ | be32 | be64 \ | bfin \ | c4x | c8051 | clipper \ | d10v | d30v | dlx | dsp16xx \ - | epiphany \ - | fido | fr30 | frv \ + | e2k | epiphany \ + | fido | fr30 | frv | ft32 \ | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ | hexagon \ - | i370 | i860 | i960 | ia64 \ + | i370 | i860 | i960 | ia16 | ia64 \ | ip2k | iq2000 \ | k1om \ | le32 | le64 \ @@ -299,13 +296,14 @@ | nios | nios2 | nios2eb | nios2el \ | ns16k | ns32k \ | open8 | or1k | or1knd | or32 \ - | pdp10 | pdp11 | pj | pjl \ + | pdp10 | pj | pjl \ | powerpc | powerpc64 | powerpc64le | powerpcle \ + | pru \ | pyramid \ | riscv32 | riscv64 \ | rl78 | rx \ | score \ - | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ + | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ | sh64 | sh64le \ | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ @@ -314,7 +312,7 @@ | ubicom32 \ | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \ | visium \ - | we32k \ + | wasm32 \ | x86 | xc16x | xstormy16 | xtensa \ | z8k | z80) basic_machine=$basic_machine-unknown @@ -335,7 +333,7 @@ basic_machine=$basic_machine-unknown os=-none ;; - m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) + m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65) ;; ms1) basic_machine=mt-unknown @@ -364,7 +362,7 @@ ;; # Object if more than one company name word. *-*-*) - echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + echo Invalid configuration \`"$1"\': machine \`"$basic_machine"\' not recognized 1>&2 exit 1 ;; # Recognize the basic CPU types with company name. 
@@ -376,17 +374,18 @@ | alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \ | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ | avr-* | avr32-* \ + | ba-* \ | be32-* | be64-* \ | bfin-* | bs2000-* \ | c[123]* | c30-* | [cjt]90-* | c4x-* \ | c8051-* | clipper-* | craynv-* | cydra-* \ | d10v-* | d30v-* | dlx-* \ - | elxsi-* \ + | e2k-* | elxsi-* \ | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ | h8300-* | h8500-* \ | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ | hexagon-* \ - | i*86-* | i860-* | i960-* | ia64-* \ + | i*86-* | i860-* | i960-* | ia16-* | ia64-* \ | ip2k-* | iq2000-* \ | k1om-* \ | le32-* | le64-* \ @@ -427,13 +426,15 @@ | orion-* \ | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \ + | pru-* \ | pyramid-* \ + | riscv32-* | riscv64-* \ | rl78-* | romp-* | rs6000-* | rx-* \ | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ | sparclite-* \ - | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \ + | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \ | tahoe-* \ | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ | tile*-* \ @@ -442,6 +443,7 @@ | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \ | vax-* \ | visium-* \ + | wasm32-* \ | we32k-* \ | x86-* | x86_64-* | xc16x-* | xps100-* \ | xstormy16-* | xtensa*-* \ @@ -455,7 +457,7 @@ # Recognize the various machine names and aliases which stand # for a CPU type and a company and sometimes even an OS. 386bsd) - basic_machine=i386-unknown + basic_machine=i386-pc os=-bsd ;; 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) @@ -489,7 +491,7 @@ basic_machine=x86_64-pc ;; amd64-*) - basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=x86_64-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; amdahl) basic_machine=580-amdahl @@ -518,6 +520,9 @@ basic_machine=i386-pc os=-aros ;; + asmjs) + basic_machine=asmjs-unknown + ;; aux) basic_machine=m68k-apple os=-aux @@ -531,7 +536,7 @@ os=-linux ;; blackfin-*) - basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=bfin-`echo "$basic_machine" | sed 's/^[^-]*-//'` os=-linux ;; bluegene*) @@ -539,13 +544,13 @@ os=-cnk ;; c54x-*) - basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=tic54x-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; c55x-*) - basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=tic55x-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; c6x-*) - basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=tic6x-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; c90) basic_machine=c90-cray @@ -634,10 +639,18 @@ basic_machine=rs6000-bull os=-bosx ;; - dpx2* | dpx2*-bull) + dpx2*) basic_machine=m68k-bull os=-sysv3 ;; + e500v[12]) + basic_machine=powerpc-unknown + os=$os"spe" + ;; + e500v[12]-*) + basic_machine=powerpc-`echo "$basic_machine" | sed 's/^[^-]*-//'` + os=$os"spe" + ;; ebmon29k) basic_machine=a29k-amd os=-ebmon @@ -727,9 +740,6 @@ hp9k8[0-9][0-9] | hp8[0-9][0-9]) basic_machine=hppa1.0-hp ;; - hppa-next) - os=-nextstep3 - ;; hppaosf) basic_machine=hppa1.1-hp os=-osf @@ -742,26 +752,26 @@ basic_machine=i370-ibm ;; i*86v32) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86.*/86-pc/'` os=-sysv32 ;; 
i*86v4*) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86.*/86-pc/'` os=-sysv4 ;; i*86v) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86.*/86-pc/'` os=-sysv ;; i*86sol2) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86.*/86-pc/'` os=-solaris2 ;; i386mach) basic_machine=i386-mach os=-mach ;; - i386-vsta | vsta) + vsta) basic_machine=i386-unknown os=-vsta ;; @@ -780,19 +790,16 @@ os=-sysv ;; leon-*|leon[3-9]-*) - basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'` + basic_machine=sparc-`echo "$basic_machine" | sed 's/-.*//'` ;; m68knommu) basic_machine=m68k-unknown os=-linux ;; m68knommu-*) - basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=m68k-`echo "$basic_machine" | sed 's/^[^-]*-//'` os=-linux ;; - m88k-omron*) - basic_machine=m88k-omron - ;; magnum | m3230) basic_machine=mips-mips os=-sysv @@ -824,10 +831,10 @@ os=-mint ;; mips3*-*) - basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` + basic_machine=`echo "$basic_machine" | sed -e 's/mips3/mips64/'` ;; mips3*) - basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown + basic_machine=`echo "$basic_machine" | sed -e 's/mips3/mips64/'`-unknown ;; monitor) basic_machine=m68k-rom68k @@ -846,7 +853,7 @@ os=-msdos ;; ms1-*) - basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` + basic_machine=`echo "$basic_machine" | sed -e 's/ms1-/mt-/'` ;; msys) basic_machine=i686-pc @@ -888,7 +895,7 @@ basic_machine=v70-nec os=-sysv ;; - next | m*-next ) + next | m*-next) basic_machine=m68k-next case $os in -nextstep* ) @@ -933,6 +940,12 @@ nsr-tandem) basic_machine=nsr-tandem ;; + nsv-tandem) + basic_machine=nsv-tandem + ;; + nsx-tandem) + basic_machine=nsx-tandem + ;; op50n-* | op60c-*) basic_machine=hppa1.1-oki os=-proelf @@ -965,7 +978,7 @@ os=-linux ;; parisc-*) - basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=hppa-`echo "$basic_machine" | sed 's/^[^-]*-//'` os=-linux ;; pbd) @@ -981,7 +994,7 @@ basic_machine=i386-pc ;; pc98-*) - basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=i386-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; pentium | p5 | k5 | k6 | nexgen | viac3) basic_machine=i586-pc @@ -996,16 +1009,16 @@ basic_machine=i786-pc ;; pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) - basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=i586-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; pentiumpro-* | p6-* | 6x86-* | athlon-*) - basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=i686-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) - basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=i686-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; pentium4-*) - basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=i786-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; pn) basic_machine=pn-gould @@ -1015,23 +1028,23 @@ ppc | ppcbe) basic_machine=powerpc-unknown ;; ppc-* | ppcbe-*) - basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=powerpc-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; - ppcle | powerpclittle | ppc-le | powerpc-little) + ppcle | powerpclittle) basic_machine=powerpcle-unknown ;; ppcle-* | powerpclittle-*) - basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=powerpcle-`echo 
"$basic_machine" | sed 's/^[^-]*-//'` ;; ppc64) basic_machine=powerpc64-unknown ;; - ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` + ppc64-*) basic_machine=powerpc64-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; - ppc64le | powerpc64little | ppc64-le | powerpc64-little) + ppc64le | powerpc64little) basic_machine=powerpc64le-unknown ;; ppc64le-* | powerpc64little-*) - basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=powerpc64le-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; ps2) basic_machine=i386-ibm @@ -1085,17 +1098,10 @@ sequent) basic_machine=i386-sequent ;; - sh) - basic_machine=sh-hitachi - os=-hms - ;; sh5el) basic_machine=sh5le-unknown ;; - sh64) - basic_machine=sh64-unknown - ;; - sparclite-wrs | simso-wrs) + simso-wrs) basic_machine=sparclite-wrs os=-vxworks ;; @@ -1114,7 +1120,7 @@ os=-sysv4 ;; strongarm-* | thumb-*) - basic_machine=arm-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=arm-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; sun2) basic_machine=m68000-sun @@ -1236,6 +1242,9 @@ basic_machine=hppa1.1-winbond os=-proelf ;; + x64) + basic_machine=x86_64-pc + ;; xbox) basic_machine=i686-pc os=-mingw32 @@ -1244,20 +1253,12 @@ basic_machine=xps100-honeywell ;; xscale-* | xscalee[bl]-*) - basic_machine=`echo $basic_machine | sed 's/^xscale/arm/'` + basic_machine=`echo "$basic_machine" | sed 's/^xscale/arm/'` ;; ymp) basic_machine=ymp-cray os=-unicos ;; - z8k-*-coff) - basic_machine=z8k-unknown - os=-sim - ;; - z80-*-coff) - basic_machine=z80-unknown - os=-sim - ;; none) basic_machine=none-none os=-none @@ -1286,10 +1287,6 @@ vax) basic_machine=vax-dec ;; - pdp10) - # there are many clones, so DEC is not a safe bet - basic_machine=pdp10-unknown - ;; pdp11) basic_machine=pdp11-dec ;; @@ -1299,9 +1296,6 @@ sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele) basic_machine=sh-unknown ;; - sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) - basic_machine=sparc-sun - ;; cydra) basic_machine=cydra-cydrome ;; @@ -1321,7 +1315,7 @@ # Make sure to match an already-canonicalized machine name. ;; *) - echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + echo Invalid configuration \`"$1"\': machine \`"$basic_machine"\' not recognized 1>&2 exit 1 ;; esac @@ -1329,10 +1323,10 @@ # Here we canonicalize certain aliases for manufacturers. case $basic_machine in *-digital*) - basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` + basic_machine=`echo "$basic_machine" | sed 's/digital.*/dec/'` ;; *-commodore*) - basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` + basic_machine=`echo "$basic_machine" | sed 's/commodore.*/cbm/'` ;; *) ;; @@ -1343,8 +1337,8 @@ if [ x"$os" != x"" ] then case $os in - # First match some system type aliases - # that might get confused with valid system types. + # First match some system type aliases that might get confused + # with valid system types. # -solaris* is a basic system type, with this one exception. -auroraux) os=-auroraux @@ -1355,45 +1349,48 @@ -solaris) os=-solaris2 ;; - -svr4*) - os=-sysv4 - ;; -unixware*) os=-sysv4.2uw ;; -gnu/linux*) os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` ;; - # First accept the basic system types. + # es1800 is here to avoid being matched by es* (a different OS) + -es1800*) + os=-ose + ;; + # Now accept the basic system types. # The portable systems comes first. - # Each alternative MUST END IN A *, to match a version number. + # Each alternative MUST end in a * to match a version number. 
# -sysv* is not here because it comes later, after sysvr4. -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ | -sym* | -kopensolaris* | -plan9* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ - | -aos* | -aros* \ + | -aos* | -aros* | -cloudabi* | -sortix* \ | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ - | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ - | -bitrig* | -openbsd* | -solidbsd* \ + | -hiux* | -knetbsd* | -mirbsd* | -netbsd* \ + | -bitrig* | -openbsd* | -solidbsd* | -libertybsd* \ | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ - | -chorusos* | -chorusrdb* | -cegcc* \ + | -chorusos* | -chorusrdb* | -cegcc* | -glidix* \ | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ - | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \ + | -midipix* | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \ | -linux-newlib* | -linux-musl* | -linux-uclibc* \ | -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \ - | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ + | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* \ | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ - | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ + | -morphos* | -superux* | -rtmk* | -windiss* \ | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ - | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* | -tirtos*) + | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \ + | -onefs* | -tirtos* | -phoenix* | -fuchsia* | -redox* | -bme* \ + | -midnightbsd*) # Remember, each alternative MUST END IN *, to match a version number. 
;; -qnx*) @@ -1410,12 +1407,12 @@ -nto*) os=`echo $os | sed -e 's|nto|nto-qnx|'` ;; - -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ - | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ + -sim | -xray | -os68k* | -v88r* \ + | -windows* | -osx | -abug | -netware* | -os9* \ | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) ;; -mac*) - os=`echo $os | sed -e 's|mac|macos|'` + os=`echo "$os" | sed -e 's|mac|macos|'` ;; -linux-dietlibc) os=-linux-dietlibc @@ -1424,10 +1421,10 @@ os=`echo $os | sed -e 's|linux|linux-gnu|'` ;; -sunos5*) - os=`echo $os | sed -e 's|sunos5|solaris2|'` + os=`echo "$os" | sed -e 's|sunos5|solaris2|'` ;; -sunos6*) - os=`echo $os | sed -e 's|sunos6|solaris3|'` + os=`echo "$os" | sed -e 's|sunos6|solaris3|'` ;; -opened*) os=-openedition @@ -1438,12 +1435,6 @@ -wince*) os=-wince ;; - -osfrose*) - os=-osfrose - ;; - -osf*) - os=-osf - ;; -utek*) os=-bsd ;; @@ -1468,7 +1459,7 @@ -nova*) os=-rtmk-nova ;; - -ns2 ) + -ns2) os=-nextstep2 ;; -nsk*) @@ -1490,7 +1481,7 @@ -oss*) os=-sysv3 ;; - -svr4) + -svr4*) os=-sysv4 ;; -svr3) @@ -1505,32 +1496,38 @@ -ose*) os=-ose ;; - -es1800*) - os=-ose - ;; - -xenix) - os=-xenix - ;; -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) os=-mint ;; - -aros*) - os=-aros - ;; -zvmoe) os=-zvmoe ;; -dicos*) os=-dicos ;; + -pikeos*) + # Until real need of OS specific support for + # particular features comes up, bare metal + # configurations are quite functional. + case $basic_machine in + arm*) + os=-eabi + ;; + *) + os=-elf + ;; + esac + ;; -nacl*) ;; + -ios) + ;; -none) ;; *) # Get rid of the `-' at the beginning of $os. os=`echo $os | sed 's/[^-]*-//'` - echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 + echo Invalid configuration \`"$1"\': system \`"$os"\' not recognized 1>&2 exit 1 ;; esac @@ -1620,12 +1617,12 @@ sparc-* | *-sun) os=-sunos4.1.1 ;; + pru-*) + os=-elf + ;; *-be) os=-beos ;; - *-haiku) - os=-haiku - ;; *-ibm) os=-aix ;; @@ -1665,7 +1662,7 @@ m88k-omron*) os=-luna ;; - *-next ) + *-next) os=-nextstep ;; *-sequent) @@ -1680,9 +1677,6 @@ i370-*) os=-mvs ;; - *-next) - os=-nextstep3 - ;; *-gould) os=-sysv ;; @@ -1792,15 +1786,15 @@ vendor=stratus ;; esac - basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` + basic_machine=`echo "$basic_machine" | sed "s/unknown/$vendor/"` ;; esac -echo $basic_machine$os +echo "$basic_machine$os" exit # Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) +# eval: (add-hook 'write-file-functions 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" diff -Nru ucto-0.9.6/configure ucto-0.14/configure --- ucto-0.9.6/configure 2017-01-23 12:52:11.000000000 +0000 +++ ucto-0.14/configure 2018-11-29 09:08:00.000000000 +0000 @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for ucto 0.9.6. +# Generated by GNU Autoconf 2.69 for ucto 0.14. # # Report bugs to . # @@ -590,8 +590,8 @@ # Identity of this package. 
PACKAGE_NAME='ucto' PACKAGE_TARNAME='ucto' -PACKAGE_VERSION='0.9.6' -PACKAGE_STRING='ucto 0.9.6' +PACKAGE_VERSION='0.14' +PACKAGE_STRING='ucto 0.14' PACKAGE_BUGREPORT='lamasoftware@science.ru.nl' PACKAGE_URL='' @@ -644,17 +644,16 @@ folia_CFLAGS XML2_LIBS XML2_CFLAGS +ICU_LIBS +ICU_CFLAGS +OLD_LM_FALSE +OLD_LM_TRUE +MODULE_PREFIX +TEXTCAT_LIBS +TEXTCAT_CFLAGS PKG_CONFIG_LIBDIR PKG_CONFIG_PATH PKG_CONFIG -ICU_IOLIBS -ICU_LIBS -ICU_LIBPATH -ICU_VERSION -ICU_CPPSEARCHPATH -ICU_CXXFLAGS -ICU_CFLAGS -ICU_CONFIG CXXCPP CPP LT_SYS_LIBRARY_PATH @@ -757,6 +756,7 @@ docdir oldincludedir includedir +runstatedir localstatedir sharedstatedir sysconfdir @@ -789,9 +789,6 @@ with_gnu_ld with_sysroot enable_libtool_lock -with_icu -with_folia -with_ticcutils ' ac_precious_vars='build_alias host_alias @@ -810,6 +807,10 @@ PKG_CONFIG PKG_CONFIG_PATH PKG_CONFIG_LIBDIR +TEXTCAT_CFLAGS +TEXTCAT_LIBS +ICU_CFLAGS +ICU_LIBS XML2_CFLAGS XML2_LIBS folia_CFLAGS @@ -856,6 +857,7 @@ sysconfdir='${prefix}/etc' sharedstatedir='${prefix}/com' localstatedir='${prefix}/var' +runstatedir='${localstatedir}/run' includedir='${prefix}/include' oldincludedir='/usr/include' docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' @@ -1108,6 +1110,15 @@ | -silent | --silent | --silen | --sile | --sil) silent=yes ;; + -runstatedir | --runstatedir | --runstatedi | --runstated \ + | --runstate | --runstat | --runsta | --runst | --runs \ + | --run | --ru | --r) + ac_prev=runstatedir ;; + -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \ + | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \ + | --run=* | --ru=* | --r=*) + runstatedir=$ac_optarg ;; + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) ac_prev=sbindir ;; -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ @@ -1245,7 +1256,7 @@ for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ datadir sysconfdir sharedstatedir localstatedir includedir \ oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ - libdir localedir mandir + libdir localedir mandir runstatedir do eval ac_val=\$$ac_var # Remove trailing slashes. @@ -1358,7 +1369,7 @@ # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures ucto 0.9.6 to adapt to many kinds of systems. +\`configure' configures ucto 0.14 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1398,6 +1409,7 @@ --sysconfdir=DIR read-only single-machine data [PREFIX/etc] --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] --localstatedir=DIR modifiable single-machine data [PREFIX/var] + --runstatedir=DIR modifiable per-process data [LOCALSTATEDIR/run] --libdir=DIR object code libraries [EPREFIX/lib] --includedir=DIR C header files [PREFIX/include] --oldincludedir=DIR C header files for non-gcc [/usr/include] @@ -1428,7 +1440,7 @@ if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of ucto 0.9.6:";; + short | recursive ) echo "Configuration of ucto 0.14:";; esac cat <<\_ACEOF @@ -1459,13 +1471,6 @@ --with-gnu-ld assume the C compiler uses GNU ld [default=no] --with-sysroot[=DIR] Search for dependent libraries within DIR (or the compiler's sysroot if not specified). - --with-icu=DIR use ICU installed in

- --with-folia=DIR use libfolia installed in ; - note that you can install folia in a non-default directory with - ./configure --prefix= in the folia installation directory - --with-ticcutils=DIR use ticcutils installed in ; - note that you can install ticcutils in a non-default directory with - ./configure --prefix= in the ticcutils installation directory Some influential environment variables: CXX C++ compiler command @@ -1486,6 +1491,12 @@ directories to add to pkg-config's search path PKG_CONFIG_LIBDIR path overriding pkg-config's built-in search path + TEXTCAT_CFLAGS + C compiler flags for TEXTCAT, overriding pkg-config + TEXTCAT_LIBS + linker flags for TEXTCAT, overriding pkg-config + ICU_CFLAGS C compiler flags for ICU, overriding pkg-config + ICU_LIBS linker flags for ICU, overriding pkg-config XML2_CFLAGS C compiler flags for XML2, overriding pkg-config XML2_LIBS linker flags for XML2, overriding pkg-config folia_CFLAGS @@ -1566,7 +1577,7 @@ test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -ucto configure 0.9.6 +ucto configure 0.14 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2186,7 +2197,7 @@ This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by ucto $as_me 0.9.6, which was +It was created by ucto $as_me 0.14, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2533,7 +2544,7 @@ ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu - + #adapt version number in codemeta.json as well am__api_version='1.15' ac_aux_dir= @@ -3049,7 +3060,7 @@ # Define the identity of the package. PACKAGE='ucto' - VERSION='0.9.6' + VERSION='0.14' cat >>confdefs.h <<_ACEOF @@ -3150,13 +3161,6 @@ SYSCONFDIR=$sysconfdir -if test x"${CXXFLAGS+set}" = xset; then - # the user set CXXFLAGS; don't override it. - cxx_flags_were_set=true -else - cxx_flags_were_set=false -fi - # Checks for programs. ac_ext=cpp ac_cpp='$CXXCPP $CPPFLAGS' @@ -3168,7 +3172,7 @@ CXX=$CCC else if test -n "$ac_tool_prefix"; then - for ac_prog in g++ c++ + for ac_prog in c++ do # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. set dummy $ac_tool_prefix$ac_prog; ac_word=$2 @@ -3212,7 +3216,7 @@ fi if test -z "$CXX"; then ac_ct_CXX=$CXX - for ac_prog in g++ c++ + for ac_prog in c++ do # Extract the first word of "$ac_prog", so it can be a program name with args. 
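With the --with-icu, --with-folia and --with-ticcutils options removed, dependencies installed in non-standard prefixes are now reached through pkg-config, either via PKG_CONFIG_PATH or via the per-package override variables the new help text advertises (TEXTCAT_CFLAGS/TEXTCAT_LIBS, ICU_CFLAGS/ICU_LIBS, and so on). A hypothetical invocation, with invented install paths, would look like:

    # illustration only; /opt/icu and /opt/folia are invented prefixes
    PKG_CONFIG_PATH=/opt/folia/lib/pkgconfig:$PKG_CONFIG_PATH \
    ICU_CFLAGS="-I/opt/icu/include" \
    ICU_LIBS="-L/opt/icu/lib -licuuc -licuio -licui18n -licudata" \
    ./configure --prefix=/usr/local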
set dummy $ac_prog; ac_word=$2 @@ -3857,10 +3861,6 @@ -if $cxx_flags_were_set; then - CXXFLAGS=$CXXFLAGS -fi - # use libtool case `pwd` in *\ * | *\ *) @@ -5897,7 +5897,7 @@ lt_cv_deplibs_check_method=pass_all ;; -netbsd*) +netbsd* | netbsdelf*-gnu) if echo __ELF__ | $CC -E - | $GREP __ELF__ > /dev/null; then lt_cv_deplibs_check_method='match_pattern /lib[^/]+(\.so\.[0-9]+\.[0-9]+|_pic\.a)$' else @@ -9601,6 +9601,9 @@ openbsd* | bitrig*) with_gnu_ld=no ;; + linux* | k*bsd*-gnu | gnu*) + link_all_deplibs=no + ;; esac ld_shlibs=yes @@ -9855,7 +9858,7 @@ fi ;; - netbsd*) + netbsd* | netbsdelf*-gnu) if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then archive_cmds='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib' wlarc= @@ -10525,6 +10528,7 @@ if test yes = "$lt_cv_irix_exported_symbol"; then archive_expsym_cmds='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations $wl-exports_file $wl$export_symbols -o $lib' fi + link_all_deplibs=no else archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib' archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -exports_file $export_symbols -o $lib' @@ -10546,7 +10550,7 @@ esac ;; - netbsd*) + netbsd* | netbsdelf*-gnu) if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags' # a.out else @@ -11661,6 +11665,18 @@ dynamic_linker='GNU/Linux ld.so' ;; +netbsdelf*-gnu) + version_type=linux + need_lib_prefix=no + need_version=no + library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}' + soname_spec='${libname}${release}${shared_ext}$major' + shlibpath_var=LD_LIBRARY_PATH + shlibpath_overrides_runpath=no + hardcode_into_libs=yes + dynamic_linker='NetBSD ld.elf_so' + ;; + netbsd*) version_type=sunos need_lib_prefix=no @@ -14555,7 +14571,7 @@ ;; esac ;; - netbsd*) + netbsd* | netbsdelf*-gnu) ;; *qnx* | *nto*) # QNX uses GNU C++, but need to define -shared option too, otherwise @@ -14930,6 +14946,9 @@ ;; esac ;; + linux* | k*bsd*-gnu | gnu*) + link_all_deplibs_CXX=no + ;; *) export_symbols_cmds_CXX='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols' ;; @@ -15623,6 +15642,18 @@ dynamic_linker='GNU/Linux ld.so' ;; +netbsdelf*-gnu) + version_type=linux + need_lib_prefix=no + need_version=no + library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}' + soname_spec='${libname}${release}${shared_ext}$major' + shlibpath_var=LD_LIBRARY_PATH + shlibpath_overrides_runpath=no + hardcode_into_libs=yes + dynamic_linker='NetBSD ld.elf_so' + ;; + netbsd*) version_type=sunos need_lib_prefix=no @@ -15973,6 +16004,17 @@ +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ln -s works" >&5 +$as_echo_n "checking whether ln -s works... 
" >&6; } +LN_S=$as_ln_s +if test "$LN_S" = "ln -s"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no, using $LN_S" >&5 +$as_echo "no, using $LN_S" >&6; } +fi + # when running tests, use CXX ac_ext=cpp @@ -16281,263 +16323,8 @@ fi -# ugly hack when PKG_CONFIG_PATH isn't defined. -# couldn't get it to work otherwise -if test "x$PKG_CONFIG_PATH" = x; then - export PKG_CONFIG_PATH="" -fi -#AC_MSG_NOTICE( [pkg-config search path:$PKG_CONFIG_PATH dus] ) -for ac_header in libexttextcat/textcat.h -do : - ac_fn_cxx_check_header_mongrel "$LINENO" "libexttextcat/textcat.h" "ac_cv_header_libexttextcat_textcat_h" "$ac_includes_default" -if test "x$ac_cv_header_libexttextcat_textcat_h" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_LIBEXTTEXTCAT_TEXTCAT_H 1 -_ACEOF - CXXFLAGS="$CXXFLAGS -I$prefix/include" -else - for ac_header in libtextcat/textcat.h -do : - ac_fn_cxx_check_header_mongrel "$LINENO" "libtextcat/textcat.h" "ac_cv_header_libtextcat_textcat_h" "$ac_includes_default" -if test "x$ac_cv_header_libtextcat_textcat_h" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_LIBTEXTCAT_TEXTCAT_H 1 -_ACEOF - CXXFLAGS="$CXXFLAGS -I$prefix/include" -else - for ac_header in textcat.h -do : - ac_fn_cxx_check_header_mongrel "$LINENO" "textcat.h" "ac_cv_header_textcat_h" "$ac_includes_default" -if test "x$ac_cv_header_textcat_h" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_TEXTCAT_H 1 -_ACEOF - -else - { $as_echo "$as_me:${as_lineno-$LINENO}: textcat support not available" >&5 -$as_echo "$as_me: textcat support not available" >&6;} -fi - -done - -fi - -done - -fi - -done - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing textcat_Init" >&5 -$as_echo_n "checking for library containing textcat_Init... " >&6; } -if ${ac_cv_search_textcat_Init+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_func_search_save_LIBS=$LIBS -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif -char textcat_Init (); -int -main () -{ -return textcat_Init (); - ; - return 0; -} -_ACEOF -for ac_lib in '' exttextcat-2.0 exttextcat textcat; do - if test -z "$ac_lib"; then - ac_res="none required" - else - ac_res=-l$ac_lib - LIBS="-l$ac_lib $ac_func_search_save_LIBS" - fi - if ac_fn_cxx_try_link "$LINENO"; then : - ac_cv_search_textcat_Init=$ac_res -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext - if ${ac_cv_search_textcat_Init+:} false; then : - break -fi -done -if ${ac_cv_search_textcat_Init+:} false; then : - -else - ac_cv_search_textcat_Init=no -fi -rm conftest.$ac_ext -LIBS=$ac_func_search_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_textcat_Init" >&5 -$as_echo "$ac_cv_search_textcat_Init" >&6; } -ac_res=$ac_cv_search_textcat_Init -if test "$ac_res" != no; then : - test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" - -$as_echo "#define HAVE_TEXTCAT_LIB 1" >>confdefs.h - -else - { $as_echo "$as_me:${as_lineno-$LINENO}: Unable to find textcat library. textcat support not available" >&5 -$as_echo "$as_me: Unable to find textcat library. textcat support not available" >&6;} -fi - - -useICU=1; -# inspired by feh-1.3.4/configure.ac. Tnx Tom Gilbert and feh hackers. 
- -# Check whether --with-icu was given. -if test "${with_icu+set}" = set; then : - withval=$with_icu; if test "$with_icu" = "no"; then - useICU=0 - else - CXXFLAGS="$CXXFLAGS -I$withval/include" - LIBS="-L$withval/lib $LIBS" - fi -fi - - -if test "$useICU" = "1"; then - - succeeded=no - - if test -z "$ICU_CONFIG"; then - # Extract the first word of "icu-config", so it can be a program name with args. -set dummy icu-config; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_ICU_CONFIG+:} false; then : - $as_echo_n "(cached) " >&6 -else - case $ICU_CONFIG in - [\\/]* | ?:[\\/]*) - ac_cv_path_ICU_CONFIG="$ICU_CONFIG" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_ICU_CONFIG="$as_dir/$ac_word$ac_exec_ext" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - - test -z "$ac_cv_path_ICU_CONFIG" && ac_cv_path_ICU_CONFIG="no" - ;; -esac -fi -ICU_CONFIG=$ac_cv_path_ICU_CONFIG -if test -n "$ICU_CONFIG"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_CONFIG" >&5 -$as_echo "$ICU_CONFIG" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - - fi - - if test "$ICU_CONFIG" = "no" ; then - echo "*** The icu-config script could not be found. Make sure it is" - echo "*** in your path, and that taglib is properly installed." - echo "*** Or see http://www.icu-project.org/" - else - ICU_VERSION=`$ICU_CONFIG --version` - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ICU >= 5.2" >&5 -$as_echo_n "checking for ICU >= 5.2... " >&6; } - VERSION_CHECK=`expr $ICU_VERSION \>\= 5.2` - if test "$VERSION_CHECK" = "1" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - succeeded=yes - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking ICU_CFLAGS" >&5 -$as_echo_n "checking ICU_CFLAGS... " >&6; } - ICU_CFLAGS=`$ICU_CONFIG --cflags` - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_CFLAGS" >&5 -$as_echo "$ICU_CFLAGS" >&6; } - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking ICU_CPPSEARCHPATH" >&5 -$as_echo_n "checking ICU_CPPSEARCHPATH... " >&6; } - ICU_CPPSEARCHPATH=`$ICU_CONFIG --cppflags-searchpath` - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_CPPSEARCHPATH" >&5 -$as_echo "$ICU_CPPSEARCHPATH" >&6; } - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking ICU_CXXFLAGS" >&5 -$as_echo_n "checking ICU_CXXFLAGS... " >&6; } - ICU_CXXFLAGS=`$ICU_CONFIG --cxxflags` - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_CXXFLAGS" >&5 -$as_echo "$ICU_CXXFLAGS" >&6; } - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking ICU_LIBS" >&5 -$as_echo_n "checking ICU_LIBS... " >&6; } - ICU_LIBS=`$ICU_CONFIG --ldflags-libsonly` - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_LIBS" >&5 -$as_echo "$ICU_LIBS" >&6; } - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking ICU_LIBPATH" >&5 -$as_echo_n "checking ICU_LIBPATH... " >&6; } - ICU_LIBPATH=`$ICU_CONFIG --ldflags-searchpath` - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_LIBPATH" >&5 -$as_echo "$ICU_LIBPATH" >&6; } - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking ICU_IOLIBS" >&5 -$as_echo_n "checking ICU_IOLIBS... 
" >&6; } - ICU_IOLIBS=`$ICU_CONFIG --ldflags-icuio` - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_IOLIBS" >&5 -$as_echo "$ICU_IOLIBS" >&6; } - else - ICU_CFLAGS="" - ICU_CXXFLAGS="" - ICU_CPPSEARCHPATH="" - ICU_LIBPATH="" - ICU_LIBS="" - ICU_IOLIBS="" - ## If we have a custom action on failure, don't print errors, but - ## do set a variable so people can do so. - - fi - - - - - - - - - fi - - if test $succeeded = yes; then - CXXFLAGS="$CXXFLAGS $ICU_CPPSEARCHPATH" - LIBS="$ICU_LIBPATH $ICU_LIBS $ICU_IOLIBS $LIBS" - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "\"No ICU development environment found. Please check if libicu-dev or the like is installed\" -See \`config.log' for more details" "$LINENO" 5; } - fi - - -$as_echo "#define HAVE_ICU 1" >>confdefs.h - -else - as_fn_error $? "\"ICU support is required\"" "$LINENO" 5 +if test $prefix = "NONE"; then + prefix="$ac_default_prefix" fi @@ -16661,20 +16448,47 @@ fi fi +if test "x$PKG_CONFIG_PATH" = x; then + export PKG_CONFIG_PATH="$prefix/lib/pkgconfig" +else + export PKG_CONFIG_PATH="$prefix/lib/pkgconfig:$PKG_CONFIG_PATH" +fi + + +case ${host_os} in + linux*) + # linux is wellbehaved + ;; + darwin*) + # darwin isn't + for i in icu4c + do + if test -d "/usr/local/opt/$i/lib/pkgconfig" + then + export PKG_CONFIG_PATH="$PKG_CONFIG_PATH:/usr/local/opt/$i/lib/pkgconfig" + fi + done + ;; +esac + +{ $as_echo "$as_me:${as_lineno-$LINENO}: pkg-config: PATH=$PKG_CONFIG_PATH" >&5 +$as_echo "$as_me: pkg-config: PATH=$PKG_CONFIG_PATH" >&6;} + + pkg_failed=no -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for XML2" >&5 -$as_echo_n "checking for XML2... " >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for TEXTCAT" >&5 +$as_echo_n "checking for TEXTCAT... " >&6; } -if test -n "$XML2_CFLAGS"; then - pkg_cv_XML2_CFLAGS="$XML2_CFLAGS" +if test -n "$TEXTCAT_CFLAGS"; then + pkg_cv_TEXTCAT_CFLAGS="$TEXTCAT_CFLAGS" elif test -n "$PKG_CONFIG"; then if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libxml-2.0 >= 2.6.16 \""; } >&5 - ($PKG_CONFIG --exists --print-errors "libxml-2.0 >= 2.6.16 ") 2>&5 + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libexttextcat\""; } >&5 + ($PKG_CONFIG --exists --print-errors "libexttextcat") 2>&5 ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; }; then - pkg_cv_XML2_CFLAGS=`$PKG_CONFIG --cflags "libxml-2.0 >= 2.6.16 " 2>/dev/null` + pkg_cv_TEXTCAT_CFLAGS=`$PKG_CONFIG --cflags "libexttextcat" 2>/dev/null` test "x$?" != "x0" && pkg_failed=yes else pkg_failed=yes @@ -16682,16 +16496,16 @@ else pkg_failed=untried fi -if test -n "$XML2_LIBS"; then - pkg_cv_XML2_LIBS="$XML2_LIBS" +if test -n "$TEXTCAT_LIBS"; then + pkg_cv_TEXTCAT_LIBS="$TEXTCAT_LIBS" elif test -n "$PKG_CONFIG"; then if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libxml-2.0 >= 2.6.16 \""; } >&5 - ($PKG_CONFIG --exists --print-errors "libxml-2.0 >= 2.6.16 ") 2>&5 + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libexttextcat\""; } >&5 + ($PKG_CONFIG --exists --print-errors "libexttextcat") 2>&5 ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 test $ac_status = 0; }; then - pkg_cv_XML2_LIBS=`$PKG_CONFIG --libs "libxml-2.0 >= 2.6.16 " 2>/dev/null` + pkg_cv_TEXTCAT_LIBS=`$PKG_CONFIG --libs "libexttextcat" 2>/dev/null` test "x$?" != "x0" && pkg_failed=yes else pkg_failed=yes @@ -16712,19 +16526,483 @@ _pkg_short_errors_supported=no fi if test $_pkg_short_errors_supported = yes; then - XML2_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libxml-2.0 >= 2.6.16 " 2>&1` + TEXTCAT_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libexttextcat" 2>&1` else - XML2_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libxml-2.0 >= 2.6.16 " 2>&1` + TEXTCAT_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libexttextcat" 2>&1` fi # Put the nasty error message in config.log where it belongs - echo "$XML2_PKG_ERRORS" >&5 + echo "$TEXTCAT_PKG_ERRORS" >&5 - as_fn_error $? "Package requirements (libxml-2.0 >= 2.6.16 ) were not met: -$XML2_PKG_ERRORS +pkg_failed=no +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for TEXTCAT" >&5 +$as_echo_n "checking for TEXTCAT... " >&6; } -Consider adjusting the PKG_CONFIG_PATH environment variable if you -installed software in a non-standard prefix. +if test -n "$TEXTCAT_CFLAGS"; then + pkg_cv_TEXTCAT_CFLAGS="$TEXTCAT_CFLAGS" + elif test -n "$PKG_CONFIG"; then + if test -n "$PKG_CONFIG" && \ + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libtextcat\""; } >&5 + ($PKG_CONFIG --exists --print-errors "libtextcat") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then + pkg_cv_TEXTCAT_CFLAGS=`$PKG_CONFIG --cflags "libtextcat" 2>/dev/null` + test "x$?" != "x0" && pkg_failed=yes +else + pkg_failed=yes +fi + else + pkg_failed=untried +fi +if test -n "$TEXTCAT_LIBS"; then + pkg_cv_TEXTCAT_LIBS="$TEXTCAT_LIBS" + elif test -n "$PKG_CONFIG"; then + if test -n "$PKG_CONFIG" && \ + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libtextcat\""; } >&5 + ($PKG_CONFIG --exists --print-errors "libtextcat") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then + pkg_cv_TEXTCAT_LIBS=`$PKG_CONFIG --libs "libtextcat" 2>/dev/null` + test "x$?" 
!= "x0" && pkg_failed=yes +else + pkg_failed=yes +fi + else + pkg_failed=untried +fi + + + +if test $pkg_failed = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + +if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then + _pkg_short_errors_supported=yes +else + _pkg_short_errors_supported=no +fi + if test $_pkg_short_errors_supported = yes; then + TEXTCAT_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libtextcat" 2>&1` + else + TEXTCAT_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libtextcat" 2>&1` + fi + # Put the nasty error message in config.log where it belongs + echo "$TEXTCAT_PKG_ERRORS" >&5 + + TEXTCAT_FOUND=0 + +elif test $pkg_failed = untried; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + TEXTCAT_FOUND=0 + +else + TEXTCAT_CFLAGS=$pkg_cv_TEXTCAT_CFLAGS + TEXTCAT_LIBS=$pkg_cv_TEXTCAT_LIBS + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + TEXTCAT_FOUND=1 + CXXFLAGS="$CXXFLAGS $TEXTCAT_CFLAGS" + LIBS="$TEXTCAT_LIBS $LIBS" +fi + +elif test $pkg_failed = untried; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + +pkg_failed=no +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for TEXTCAT" >&5 +$as_echo_n "checking for TEXTCAT... " >&6; } + +if test -n "$TEXTCAT_CFLAGS"; then + pkg_cv_TEXTCAT_CFLAGS="$TEXTCAT_CFLAGS" + elif test -n "$PKG_CONFIG"; then + if test -n "$PKG_CONFIG" && \ + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libtextcat\""; } >&5 + ($PKG_CONFIG --exists --print-errors "libtextcat") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then + pkg_cv_TEXTCAT_CFLAGS=`$PKG_CONFIG --cflags "libtextcat" 2>/dev/null` + test "x$?" != "x0" && pkg_failed=yes +else + pkg_failed=yes +fi + else + pkg_failed=untried +fi +if test -n "$TEXTCAT_LIBS"; then + pkg_cv_TEXTCAT_LIBS="$TEXTCAT_LIBS" + elif test -n "$PKG_CONFIG"; then + if test -n "$PKG_CONFIG" && \ + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libtextcat\""; } >&5 + ($PKG_CONFIG --exists --print-errors "libtextcat") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then + pkg_cv_TEXTCAT_LIBS=`$PKG_CONFIG --libs "libtextcat" 2>/dev/null` + test "x$?" 
!= "x0" && pkg_failed=yes +else + pkg_failed=yes +fi + else + pkg_failed=untried +fi + + + +if test $pkg_failed = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + +if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then + _pkg_short_errors_supported=yes +else + _pkg_short_errors_supported=no +fi + if test $_pkg_short_errors_supported = yes; then + TEXTCAT_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libtextcat" 2>&1` + else + TEXTCAT_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libtextcat" 2>&1` + fi + # Put the nasty error message in config.log where it belongs + echo "$TEXTCAT_PKG_ERRORS" >&5 + + TEXTCAT_FOUND=0 + +elif test $pkg_failed = untried; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + TEXTCAT_FOUND=0 + +else + TEXTCAT_CFLAGS=$pkg_cv_TEXTCAT_CFLAGS + TEXTCAT_LIBS=$pkg_cv_TEXTCAT_LIBS + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + TEXTCAT_FOUND=1 + CXXFLAGS="$CXXFLAGS $TEXTCAT_CFLAGS" + LIBS="$TEXTCAT_LIBS $LIBS" +fi + +else + TEXTCAT_CFLAGS=$pkg_cv_TEXTCAT_CFLAGS + TEXTCAT_LIBS=$pkg_cv_TEXTCAT_LIBS + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + TEXTCAT_FOUND=1 + CXXFLAGS="$CXXFLAGS $TEXTCAT_CFLAGS" + LIBS="$TEXTCAT_LIBS $LIBS" +fi + +if test $TEXTCAT_FOUND = 0; then +# So, no pkg-config for textcat found. +# Hopefully an old style version can be found??? + for ac_header in libtextcat/textcat.h +do : + ac_fn_cxx_check_header_mongrel "$LINENO" "libtextcat/textcat.h" "ac_cv_header_libtextcat_textcat_h" "$ac_includes_default" +if test "x$ac_cv_header_libtextcat_textcat_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBTEXTCAT_TEXTCAT_H 1 +_ACEOF + CXXFLAGS="$CXXFLAGS -I$prefix/include/libtextcat" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing textcat_Init" >&5 +$as_echo_n "checking for library containing textcat_Init... " >&6; } +if ${ac_cv_search_textcat_Init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char textcat_Init (); +int +main () +{ +return textcat_Init (); + ; + return 0; +} +_ACEOF +for ac_lib in '' textcat; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + if ac_fn_cxx_try_link "$LINENO"; then : + ac_cv_search_textcat_Init=$ac_res +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if ${ac_cv_search_textcat_Init+:} false; then : + break +fi +done +if ${ac_cv_search_textcat_Init+:} false; then : + +else + ac_cv_search_textcat_Init=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_textcat_Init" >&5 +$as_echo "$ac_cv_search_textcat_Init" >&6; } +ac_res=$ac_cv_search_textcat_Init +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + TEXTCAT_FOUND=1 +else + { $as_echo "$as_me:${as_lineno-$LINENO}: textcat library not available" >&5 +$as_echo "$as_me: textcat library not available" >&6;} +fi + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: textcat include file not available" >&5 +$as_echo "$as_me: textcat include file not available" >&6;} +fi + +done + + + + if test $TEXTCAT_FOUND = 1; then + +$as_echo "#define HAVE_OLD_TEXTCAT 1" >>confdefs.h + + fi +fi + +if test $TEXTCAT_FOUND = 1; then + tcdirs="/usr/share/libtextcat /usr/share/libexttextcat /usr/local/share/libtextcat /usr/local/share/libexttextcat /usr/local/Cellar/libtextcat/2.2/share/LM " + + for d in $tcdirs + do + if test -f ${d}/nl.lm + then + MODULE_PREFIX=$d + + if test 1 = 0; then + OLD_LM_TRUE= + OLD_LM_FALSE='#' +else + OLD_LM_TRUE='#' + OLD_LM_FALSE= +fi + + break + fi + done + + if test "x$MODULE_PREFIX" = "x" + then + for d in $tcdirs + do + if test -f ${d}/dutch.lm + then + MODULE_PREFIX=$d + + if test 1 = 1; then + OLD_LM_TRUE= + OLD_LM_FALSE='#' +else + OLD_LM_TRUE='#' + OLD_LM_FALSE= +fi + + break + fi + done + fi + if test "x$MODULE_PREFIX" = "x" + then + { $as_echo "$as_me:${as_lineno-$LINENO}: textcat Language Model files not found. Textcat disabled." >&5 +$as_echo "$as_me: textcat Language Model files not found. Textcat disabled." >&6;} + else + TEXTCAT_FOUND=1 + fi + + +fi + +if test $TEXTCAT_FOUND = 0; then + as_fn_error $? "no working libtextcat or libexttextcat found! " "$LINENO" 5 +else + +$as_echo "#define HAVE_TEXTCAT 1" >>confdefs.h + +fi + + +pkg_failed=no +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ICU" >&5 +$as_echo_n "checking for ICU... " >&6; } + +if test -n "$ICU_CFLAGS"; then + pkg_cv_ICU_CFLAGS="$ICU_CFLAGS" + elif test -n "$PKG_CONFIG"; then + if test -n "$PKG_CONFIG" && \ + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"icu-uc >= 50 icu-io \""; } >&5 + ($PKG_CONFIG --exists --print-errors "icu-uc >= 50 icu-io ") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then + pkg_cv_ICU_CFLAGS=`$PKG_CONFIG --cflags "icu-uc >= 50 icu-io " 2>/dev/null` + test "x$?" != "x0" && pkg_failed=yes +else + pkg_failed=yes +fi + else + pkg_failed=untried +fi +if test -n "$ICU_LIBS"; then + pkg_cv_ICU_LIBS="$ICU_LIBS" + elif test -n "$PKG_CONFIG"; then + if test -n "$PKG_CONFIG" && \ + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"icu-uc >= 50 icu-io \""; } >&5 + ($PKG_CONFIG --exists --print-errors "icu-uc >= 50 icu-io ") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 + test $ac_status = 0; }; then + pkg_cv_ICU_LIBS=`$PKG_CONFIG --libs "icu-uc >= 50 icu-io " 2>/dev/null` + test "x$?" != "x0" && pkg_failed=yes +else + pkg_failed=yes +fi + else + pkg_failed=untried +fi + + + +if test $pkg_failed = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + +if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then + _pkg_short_errors_supported=yes +else + _pkg_short_errors_supported=no +fi + if test $_pkg_short_errors_supported = yes; then + ICU_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "icu-uc >= 50 icu-io " 2>&1` + else + ICU_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "icu-uc >= 50 icu-io " 2>&1` + fi + # Put the nasty error message in config.log where it belongs + echo "$ICU_PKG_ERRORS" >&5 + + as_fn_error $? "Package requirements (icu-uc >= 50 icu-io ) were not met: + +$ICU_PKG_ERRORS + +Consider adjusting the PKG_CONFIG_PATH environment variable if you +installed software in a non-standard prefix. + +Alternatively, you may set the environment variables ICU_CFLAGS +and ICU_LIBS to avoid the need to call pkg-config. +See the pkg-config man page for more details." "$LINENO" 5 +elif test $pkg_failed = untried; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "The pkg-config script could not be found or is too old. Make sure it +is in your PATH or set the PKG_CONFIG environment variable to the full +path to pkg-config. + +Alternatively, you may set the environment variables ICU_CFLAGS +and ICU_LIBS to avoid the need to call pkg-config. +See the pkg-config man page for more details. + +To get pkg-config, see . +See \`config.log' for more details" "$LINENO" 5; } +else + ICU_CFLAGS=$pkg_cv_ICU_CFLAGS + ICU_LIBS=$pkg_cv_ICU_LIBS + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +fi +CXXFLAGS="$CXXFLAGS $ICU_CFLAGS" +LIBS="$ICU_LIBS $LIBS" + + +pkg_failed=no +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for XML2" >&5 +$as_echo_n "checking for XML2... " >&6; } + +if test -n "$XML2_CFLAGS"; then + pkg_cv_XML2_CFLAGS="$XML2_CFLAGS" + elif test -n "$PKG_CONFIG"; then + if test -n "$PKG_CONFIG" && \ + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libxml-2.0 >= 2.6.16 \""; } >&5 + ($PKG_CONFIG --exists --print-errors "libxml-2.0 >= 2.6.16 ") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then + pkg_cv_XML2_CFLAGS=`$PKG_CONFIG --cflags "libxml-2.0 >= 2.6.16 " 2>/dev/null` + test "x$?" != "x0" && pkg_failed=yes +else + pkg_failed=yes +fi + else + pkg_failed=untried +fi +if test -n "$XML2_LIBS"; then + pkg_cv_XML2_LIBS="$XML2_LIBS" + elif test -n "$PKG_CONFIG"; then + if test -n "$PKG_CONFIG" && \ + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libxml-2.0 >= 2.6.16 \""; } >&5 + ($PKG_CONFIG --exists --print-errors "libxml-2.0 >= 2.6.16 ") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then + pkg_cv_XML2_LIBS=`$PKG_CONFIG --libs "libxml-2.0 >= 2.6.16 " 2>/dev/null` + test "x$?" 
!= "x0" && pkg_failed=yes +else + pkg_failed=yes +fi + else + pkg_failed=untried +fi + + + +if test $pkg_failed = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + +if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then + _pkg_short_errors_supported=yes +else + _pkg_short_errors_supported=no +fi + if test $_pkg_short_errors_supported = yes; then + XML2_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libxml-2.0 >= 2.6.16 " 2>&1` + else + XML2_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libxml-2.0 >= 2.6.16 " 2>&1` + fi + # Put the nasty error message in config.log where it belongs + echo "$XML2_PKG_ERRORS" >&5 + + as_fn_error $? "Package requirements (libxml-2.0 >= 2.6.16 ) were not met: + +$XML2_PKG_ERRORS + +Consider adjusting the PKG_CONFIG_PATH environment variable if you +installed software in a non-standard prefix. Alternatively, you may set the environment variables XML2_CFLAGS and XML2_LIBS to avoid the need to call pkg-config. @@ -16755,15 +17033,6 @@ LIBS="$LIBS $XML2_LIBS" -# Check whether --with-folia was given. -if test "${with_folia+set}" = set; then : - withval=$with_folia; PKG_CONFIG_PATH="$withval/lib/pkgconfig:$PKG_CONFIG_PATH" -else - PKG_CONFIG_PATH="$prefix/lib/pkgconfig:$PKG_CONFIG_PATH" -fi - -#AC_MSG_NOTICE( [pkg-config search path: $PKG_CONFIG_PATH] ) - pkg_failed=no { $as_echo "$as_me:${as_lineno-$LINENO}: checking for folia" >&5 $as_echo_n "checking for folia... " >&6; } @@ -16772,12 +17041,12 @@ pkg_cv_folia_CFLAGS="$folia_CFLAGS" elif test -n "$PKG_CONFIG"; then if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"folia >= 1.0 \""; } >&5 - ($PKG_CONFIG --exists --print-errors "folia >= 1.0 ") 2>&5 + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"folia >= 1.10 \""; } >&5 + ($PKG_CONFIG --exists --print-errors "folia >= 1.10 ") 2>&5 ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; }; then - pkg_cv_folia_CFLAGS=`$PKG_CONFIG --cflags "folia >= 1.0 " 2>/dev/null` + pkg_cv_folia_CFLAGS=`$PKG_CONFIG --cflags "folia >= 1.10 " 2>/dev/null` test "x$?" != "x0" && pkg_failed=yes else pkg_failed=yes @@ -16789,12 +17058,12 @@ pkg_cv_folia_LIBS="$folia_LIBS" elif test -n "$PKG_CONFIG"; then if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"folia >= 1.0 \""; } >&5 - ($PKG_CONFIG --exists --print-errors "folia >= 1.0 ") 2>&5 + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"folia >= 1.10 \""; } >&5 + ($PKG_CONFIG --exists --print-errors "folia >= 1.10 ") 2>&5 ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; }; then - pkg_cv_folia_LIBS=`$PKG_CONFIG --libs "folia >= 1.0 " 2>/dev/null` + pkg_cv_folia_LIBS=`$PKG_CONFIG --libs "folia >= 1.10 " 2>/dev/null` test "x$?" 
!= "x0" && pkg_failed=yes else pkg_failed=yes @@ -16815,14 +17084,14 @@ _pkg_short_errors_supported=no fi if test $_pkg_short_errors_supported = yes; then - folia_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "folia >= 1.0 " 2>&1` + folia_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "folia >= 1.10 " 2>&1` else - folia_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "folia >= 1.0 " 2>&1` + folia_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "folia >= 1.10 " 2>&1` fi # Put the nasty error message in config.log where it belongs echo "$folia_PKG_ERRORS" >&5 - as_fn_error $? "Package requirements (folia >= 1.0 ) were not met: + as_fn_error $? "Package requirements (folia >= 1.10 ) were not met: $folia_PKG_ERRORS @@ -16858,15 +17127,6 @@ LIBS="$folia_LIBS $LIBS" -# Check whether --with-ticcutils was given. -if test "${with_ticcutils+set}" = set; then : - withval=$with_ticcutils; PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$withval/lib/pkgconfig" -else - PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$prefix/lib/pkgconfig" -fi - -# AC_MSG_NOTICE( [pkg-config search path: $PKG_CONFIG_PATH] ) - pkg_failed=no { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ticcutils" >&5 $as_echo_n "checking for ticcutils... " >&6; } @@ -16875,12 +17135,12 @@ pkg_cv_ticcutils_CFLAGS="$ticcutils_CFLAGS" elif test -n "$PKG_CONFIG"; then if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.6 \""; } >&5 - ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.6 ") 2>&5 + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.18 \""; } >&5 + ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.18 ") 2>&5 ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; }; then - pkg_cv_ticcutils_CFLAGS=`$PKG_CONFIG --cflags "ticcutils >= 0.6 " 2>/dev/null` + pkg_cv_ticcutils_CFLAGS=`$PKG_CONFIG --cflags "ticcutils >= 0.18 " 2>/dev/null` test "x$?" != "x0" && pkg_failed=yes else pkg_failed=yes @@ -16892,12 +17152,12 @@ pkg_cv_ticcutils_LIBS="$ticcutils_LIBS" elif test -n "$PKG_CONFIG"; then if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.6 \""; } >&5 - ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.6 ") 2>&5 + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.18 \""; } >&5 + ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.18 ") 2>&5 ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; }; then - pkg_cv_ticcutils_LIBS=`$PKG_CONFIG --libs "ticcutils >= 0.6 " 2>/dev/null` + pkg_cv_ticcutils_LIBS=`$PKG_CONFIG --libs "ticcutils >= 0.18 " 2>/dev/null` test "x$?" != "x0" && pkg_failed=yes else pkg_failed=yes @@ -16918,14 +17178,14 @@ _pkg_short_errors_supported=no fi if test $_pkg_short_errors_supported = yes; then - ticcutils_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "ticcutils >= 0.6 " 2>&1` + ticcutils_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "ticcutils >= 0.18 " 2>&1` else - ticcutils_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "ticcutils >= 0.6 " 2>&1` + ticcutils_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "ticcutils >= 0.18 " 2>&1` fi # Put the nasty error message in config.log where it belongs echo "$ticcutils_PKG_ERRORS" >&5 - as_fn_error $? 
"Package requirements (ticcutils >= 0.6 ) were not met: + as_fn_error $? "Package requirements (ticcutils >= 0.18 ) were not met: $ticcutils_PKG_ERRORS @@ -17019,25 +17279,13 @@ # Put the nasty error message in config.log where it belongs echo "$uctodata_PKG_ERRORS" >&5 - { $as_echo "$as_me:${as_lineno-$LINENO}: ATTENTION: - ucto datafiles are not installed! - ucto will work with only a minimal default configuration. - You should consider installing the uctodata package! " >&5 -$as_echo "$as_me: ATTENTION: - ucto datafiles are not installed! - ucto will work with only a minimal default configuration. - You should consider installing the uctodata package! " >&6;} + as_fn_error $? "ATTENTION: + ucto datafiles are not installed!. Please install uctodata package. " "$LINENO" 5 elif test $pkg_failed = untried; then { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } - { $as_echo "$as_me:${as_lineno-$LINENO}: ATTENTION: - ucto datafiles are not installed! - ucto will work with only a minimal default configuration. - You should consider installing the uctodata package! " >&5 -$as_echo "$as_me: ATTENTION: - ucto datafiles are not installed! - ucto will work with only a minimal default configuration. - You should consider installing the uctodata package! " >&6;} + as_fn_error $? "ATTENTION: + ucto datafiles are not installed!. Please install uctodata package. " "$LINENO" 5 else uctodata_CFLAGS=$pkg_cv_uctodata_CFLAGS uctodata_LIBS=$pkg_cv_uctodata_LIBS @@ -17052,12 +17300,12 @@ pkg_cv_uctodata_CFLAGS="$uctodata_CFLAGS" elif test -n "$PKG_CONFIG"; then if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"uctodata >= 0.4\""; } >&5 - ($PKG_CONFIG --exists --print-errors "uctodata >= 0.4") 2>&5 + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"uctodata >= 0.8\""; } >&5 + ($PKG_CONFIG --exists --print-errors "uctodata >= 0.8") 2>&5 ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; }; then - pkg_cv_uctodata_CFLAGS=`$PKG_CONFIG --cflags "uctodata >= 0.4" 2>/dev/null` + pkg_cv_uctodata_CFLAGS=`$PKG_CONFIG --cflags "uctodata >= 0.8" 2>/dev/null` test "x$?" != "x0" && pkg_failed=yes else pkg_failed=yes @@ -17069,12 +17317,12 @@ pkg_cv_uctodata_LIBS="$uctodata_LIBS" elif test -n "$PKG_CONFIG"; then if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"uctodata >= 0.4\""; } >&5 - ($PKG_CONFIG --exists --print-errors "uctodata >= 0.4") 2>&5 + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"uctodata >= 0.8\""; } >&5 + ($PKG_CONFIG --exists --print-errors "uctodata >= 0.8") 2>&5 ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; }; then - pkg_cv_uctodata_LIBS=`$PKG_CONFIG --libs "uctodata >= 0.4" 2>/dev/null` + pkg_cv_uctodata_LIBS=`$PKG_CONFIG --libs "uctodata >= 0.8" 2>/dev/null` test "x$?" 
!= "x0" && pkg_failed=yes else pkg_failed=yes @@ -17095,9 +17343,9 @@ _pkg_short_errors_supported=no fi if test $_pkg_short_errors_supported = yes; then - uctodata_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "uctodata >= 0.4" 2>&1` + uctodata_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "uctodata >= 0.8" 2>&1` else - uctodata_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "uctodata >= 0.4" 2>&1` + uctodata_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "uctodata >= 0.8" 2>&1` fi # Put the nasty error message in config.log where it belongs echo "$uctodata_PKG_ERRORS" >&5 @@ -17127,9 +17375,18 @@ fi + +UCTODATA_PRE=`pkg-config --variable=prefix uctodata` +UCTODATA_DIR=$UCTODATA_PRE/share/ucto + +cat >>confdefs.h <<_ACEOF +#define UCTODATA_DIR "$UCTODATA_DIR/" +_ACEOF + + # Checks for library functions. -ac_config_files="$ac_config_files Makefile ucto.pc ucto-icu.pc m4/Makefile config/Makefile docs/Makefile src/Makefile tests/Makefile include/Makefile include/ucto/Makefile" +ac_config_files="$ac_config_files Makefile ucto.pc m4/Makefile config/Makefile config/textcat_normal.cfg config/textcat_alt.cfg docs/Makefile src/Makefile tests/Makefile include/Makefile include/ucto/Makefile" cat >confcache <<\_ACEOF # This file is a shell script that caches the results of configure @@ -17268,6 +17525,14 @@ as_fn_error $? "conditional \"am__fastdepCC\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${OLD_LM_TRUE}" && test -z "${OLD_LM_FALSE}"; then + as_fn_error $? "conditional \"OLD_LM\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${OLD_LM_TRUE}" && test -z "${OLD_LM_FALSE}"; then + as_fn_error $? "conditional \"OLD_LM\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi : "${CONFIG_STATUS=./config.status}" ac_write_fail=0 @@ -17665,7 +17930,7 @@ # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by ucto $as_me 0.9.6, which was +This file was extended by ucto $as_me 0.14, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -17731,7 +17996,7 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -ucto config.status 0.9.6 +ucto config.status 0.14 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" @@ -18246,9 +18511,10 @@ "libtool") CONFIG_COMMANDS="$CONFIG_COMMANDS libtool" ;; "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; "ucto.pc") CONFIG_FILES="$CONFIG_FILES ucto.pc" ;; - "ucto-icu.pc") CONFIG_FILES="$CONFIG_FILES ucto-icu.pc" ;; "m4/Makefile") CONFIG_FILES="$CONFIG_FILES m4/Makefile" ;; "config/Makefile") CONFIG_FILES="$CONFIG_FILES config/Makefile" ;; + "config/textcat_normal.cfg") CONFIG_FILES="$CONFIG_FILES config/textcat_normal.cfg" ;; + "config/textcat_alt.cfg") CONFIG_FILES="$CONFIG_FILES config/textcat_alt.cfg" ;; "docs/Makefile") CONFIG_FILES="$CONFIG_FILES docs/Makefile" ;; "src/Makefile") CONFIG_FILES="$CONFIG_FILES src/Makefile" ;; "tests/Makefile") CONFIG_FILES="$CONFIG_FILES tests/Makefile" ;; diff -Nru ucto-0.9.6/configure.ac ucto-0.14/configure.ac --- ucto-0.9.6/configure.ac 2017-01-23 12:49:02.000000000 +0000 +++ ucto-0.14/configure.ac 2018-11-13 16:30:47.000000000 +0000 @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ(2.59) -AC_INIT([ucto], [0.9.6], [lamasoftware@science.ru.nl]) +AC_INIT([ucto], [0.14], [lamasoftware@science.ru.nl]) #adapt version number in codemeta.json as well AM_INIT_AUTOMAKE([foreign]) AC_CONFIG_SRCDIR([configure.ac]) AC_CONFIG_MACRO_DIR([m4]) @@ -11,23 +11,13 @@ SYSCONFDIR=$sysconfdir AC_SUBST([SYSCONFDIR]) -if test x"${CXXFLAGS+set}" = xset; then - # the user set CXXFLAGS; don't override it. - cxx_flags_were_set=true -else - cxx_flags_were_set=false -fi - # Checks for programs. -AC_PROG_CXX( [g++ c++] ) - -if $cxx_flags_were_set; then - CXXFLAGS=$CXXFLAGS -fi +AC_PROG_CXX( [c++] ) # use libtool AC_PROG_LIBTOOL LT_INIT +AC_PROG_LN_S # when running tests, use CXX AC_LANG([C++]) @@ -50,67 +40,67 @@ AX_LIB_READLINE -# ugly hack when PKG_CONFIG_PATH isn't defined. -# couldn't get it to work otherwise +if test $prefix = "NONE"; then + prefix="$ac_default_prefix" +fi + +PKG_PROG_PKG_CONFIG + if test "x$PKG_CONFIG_PATH" = x; then - export PKG_CONFIG_PATH="" + export PKG_CONFIG_PATH="$prefix/lib/pkgconfig" +else + export PKG_CONFIG_PATH="$prefix/lib/pkgconfig:$PKG_CONFIG_PATH" +fi + +AC_OSX_PKG( [icu4c] ) +AC_MSG_NOTICE([pkg-config: PATH=$PKG_CONFIG_PATH]) + +PKG_CHECK_MODULES( [TEXTCAT], + [libexttextcat], + [TEXTCAT_FOUND=1 + CXXFLAGS="$CXXFLAGS $TEXTCAT_CFLAGS" + LIBS="$TEXTCAT_LIBS $LIBS"], + [PKG_CHECK_MODULES( [TEXTCAT], + [libtextcat], + [TEXTCAT_FOUND=1 + CXXFLAGS="$CXXFLAGS $TEXTCAT_CFLAGS" + LIBS="$TEXTCAT_LIBS $LIBS"], + [TEXTCAT_FOUND=0] + )] + ) + +if test $TEXTCAT_FOUND = 0; then +# So, no pkg-config for textcat found. +# Hopefully an old style version can be found??? 
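# Editorial sketch, not part of the patch: the nested PKG_CHECK_MODULES calls
# above amount to a two-step probe with a legacy fallback, roughly
#   PKG_CHECK_MODULES([TEXTCAT], [libexttextcat], [TEXTCAT_FOUND=1 ...],
#     [PKG_CHECK_MODULES([TEXTCAT], [libtextcat], [TEXTCAT_FOUND=1 ...],
#       [TEXTCAT_FOUND=0])])
# Only when both pkg-config lookups fail does AC_CHECK_OLD_TEXTCAT run below;
# that macro is presumably supplied by the new m4/ac_*.m4 files credited in
# debian/copyright, and it probes the old-style headers and library directly.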
+ AC_CHECK_OLD_TEXTCAT + if test $TEXTCAT_FOUND = 1; then + AC_DEFINE([HAVE_OLD_TEXTCAT], [1], [textcat needs C linkage]) + fi +fi + +if test $TEXTCAT_FOUND = 1; then + AC_SEARCH_LM fi -#AC_MSG_NOTICE( [pkg-config search path:$PKG_CONFIG_PATH dus] ) -AC_CHECK_HEADERS([libexttextcat/textcat.h], - [CXXFLAGS="$CXXFLAGS -I$prefix/include"], - [AC_CHECK_HEADERS([libtextcat/textcat.h], - [CXXFLAGS="$CXXFLAGS -I$prefix/include"], - [AC_CHECK_HEADERS([textcat.h], - [], - [AC_MSG_NOTICE([textcat support not available])])])]) - -AC_SEARCH_LIBS([textcat_Init],[exttextcat-2.0 exttextcat textcat],[AC_DEFINE(HAVE_TEXTCAT_LIB, 1, textcat_lib)], - [AC_MSG_NOTICE([Unable to find textcat library. textcat support not available])]) - -useICU=1; -# inspired by feh-1.3.4/configure.ac. Tnx Tom Gilbert and feh hackers. -AC_ARG_WITH(icu, - [ --with-icu=DIR use ICU installed in ], - [if test "$with_icu" = "no"; then - useICU=0 - else - CXXFLAGS="$CXXFLAGS -I$withval/include" - LIBS="-L$withval/lib $LIBS" - fi] ) - -if test "$useICU" = "1"; then - AX_ICU_CHECK( [5.2], - [CXXFLAGS="$CXXFLAGS $ICU_CPPSEARCHPATH" - LIBS="$ICU_LIBPATH $ICU_LIBS $ICU_IOLIBS $LIBS"], - [AC_MSG_FAILURE( "No ICU development environment found. Please check if libicu-dev or the like is installed" )] ) - AC_DEFINE(HAVE_ICU, 1, we want to use ICU ) + +if test $TEXTCAT_FOUND = 0; then + AC_MSG_ERROR( [no working libtextcat or libexttextcat found!] ) else - AC_MSG_ERROR("ICU support is required") + AC_DEFINE([HAVE_TEXTCAT], [1], [textcat]) fi +PKG_CHECK_MODULES([ICU], [icu-uc >= 50 icu-io] ) +CXXFLAGS="$CXXFLAGS $ICU_CFLAGS" +LIBS="$ICU_LIBS $LIBS" + PKG_CHECK_MODULES([XML2], [libxml-2.0 >= 2.6.16] ) CXXFLAGS="$CXXFLAGS $XML2_CFLAGS" LIBS="$LIBS $XML2_LIBS" -AC_ARG_WITH(folia, - [ --with-folia=DIR use libfolia installed in ; - note that you can install folia in a non-default directory with - ./configure --prefix= in the folia installation directory], - [PKG_CONFIG_PATH="$withval/lib/pkgconfig:$PKG_CONFIG_PATH"], - [PKG_CONFIG_PATH="$prefix/lib/pkgconfig:$PKG_CONFIG_PATH"]) -#AC_MSG_NOTICE( [pkg-config search path: $PKG_CONFIG_PATH] ) -PKG_CHECK_MODULES([folia], [folia >= 1.0] ) +PKG_CHECK_MODULES([folia], [folia >= 1.10] ) CXXFLAGS="$folia_CFLAGS $CXXFLAGS" LIBS="$folia_LIBS $LIBS" -AC_ARG_WITH(ticcutils, - [ --with-ticcutils=DIR use ticcutils installed in ; - note that you can install ticcutils in a non-default directory with - ./configure --prefix= in the ticcutils installation directory], - [PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$withval/lib/pkgconfig"], - [PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$prefix/lib/pkgconfig"]) -# AC_MSG_NOTICE( [pkg-config search path: $PKG_CONFIG_PATH] ) -PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.6] ) +PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.18] ) CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS" LIBS="$LIBS $ticcutils_LIBS" @@ -119,25 +109,29 @@ [uctodata], [PKG_CHECK_MODULES( [uctodata], - [uctodata >= 0.4], + [uctodata >= 0.8], [], [AC_MSG_NOTICE([ATTENTION: Your ucto datafiles are are outdated You should consider installing a newer version of the uctodata package!])]) ], - [AC_MSG_NOTICE([ATTENTION: - ucto datafiles are not installed! - ucto will work with only a minimal default configuration. - You should consider installing the uctodata package!] )] ) + [AC_MSG_ERROR([ATTENTION: + ucto datafiles are not installed!. Please install uctodata package.] 
)] ) + +UCTODATA_PRE=`pkg-config --variable=prefix uctodata` +UCTODATA_DIR=$UCTODATA_PRE/share/ucto +AC_DEFINE_UNQUOTED([UCTODATA_DIR], ["$UCTODATA_DIR/"], [location of uctodata files]) + # Checks for library functions. AC_OUTPUT([ Makefile ucto.pc - ucto-icu.pc m4/Makefile config/Makefile + config/textcat_normal.cfg + config/textcat_alt.cfg docs/Makefile src/Makefile tests/Makefile diff -Nru ucto-0.9.6/debian/changelog ucto-0.14/debian/changelog --- ucto-0.9.6/debian/changelog 2018-11-14 02:48:31.000000000 +0000 +++ ucto-0.14/debian/changelog 2018-12-18 20:16:46.000000000 +0000 @@ -1,26 +1,34 @@ -ucto (0.9.6-1ubuntu1) disco; urgency=medium +ucto (0.14-2) unstable; urgency=medium - * Fix build with ICU 63.1. Closes: #913512. + * Re-added Breaks+Replace libucto3 <- libucto2 + (Closes: #916766). Thanks to Andreas Beckmann. - -- Matthias Klose Wed, 14 Nov 2018 03:48:31 +0100 + -- Maarten van Gompel Wed, 19 Dec 2018 12:01:32 +0100 -ucto (0.9.6-1build3) disco; urgency=medium +ucto (0.14-1) unstable; urgency=medium - * No-change rebuild for icu soname changes. + * Updated to new upstream (0.14) (Closes: #915259) + + Fixes FTBFS with ICU 63.1 (Closes: #913512) + * d/control: updated uctodata dependency + * d/control: updated libfolia and ticcutils dependencies + * d/control: added libexttextcat dependency + * d/control: updated standards + * d/control: updated vcs-* to salsa + * d/control: changed priority extra to optional + * d/compat: bumped to 11 + * d/copyright: updated for m4 files + * migrated away from cdbs + * d/rules: auto-tests disabled for now + * added an 'installdatahook' patch that + fixes upstream's erroneous install-data-hook that did + not take staged installations into account. + * Changed package libucto2 to libucto3 because of SO version + change. + * Added ucto.maintscript to clean conffiles left over by ucto before + the uctodata split. Added missing conffiles to libucto3.maintscript + (Closes: #852470). Thanks to Andreas Beckmann. - -- Matthias Klose Tue, 13 Nov 2018 08:21:19 +0000 - -ucto (0.9.6-1build2) bionic; urgency=medium - - * No-change rebuild for icu soname change. - - -- Matthias Klose Tue, 07 Nov 2017 08:55:57 +0000 - -ucto (0.9.6-1build1) bionic; urgency=medium - - * No-change rebuild for libicu soname change. 
- - -- Matthias Klose Wed, 25 Oct 2017 15:49:07 +0000 + -- Maarten van Gompel Mon, 17 Dec 2018 10:48:32 +0100 ucto (0.9.6-1) unstable; urgency=high diff -Nru ucto-0.9.6/debian/compat ucto-0.14/debian/compat --- ucto-0.9.6/debian/compat 2017-01-11 09:47:34.000000000 +0000 +++ ucto-0.14/debian/compat 2018-12-16 19:03:00.000000000 +0000 @@ -1 +1 @@ -10 +11 diff -Nru ucto-0.9.6/debian/control ucto-0.14/debian/control --- ucto-0.9.6/debian/control 2017-01-23 14:10:00.000000000 +0000 +++ ucto-0.14/debian/control 2018-12-17 22:10:31.000000000 +0000 @@ -1,22 +1,22 @@ Source: ucto Section: science -Priority: extra +Priority: optional Maintainer: Debian Science Team Uploaders: Maarten van Gompel , Ko van der Sloot -Build-Depends: cdbs (>= 0.4.93~), - debhelper (>= 10), - dh-autoreconf, - libfolia-dev (>= 1.4), +Build-Depends: debhelper (>= 11), + libfolia-dev (>= 1.15), libicu-dev, - libticcutils2-dev (>= 0.14), + libticcutils-dev (>= 0.20), + libexttextcat-dev, libxml2-dev, pkg-config, - uctodata (>= 0.4) -Standards-Version: 3.9.8 + uctodata (>= 0.8) +Standards-Version: 4.2.1 Homepage: http://languagemachines.github.io/ucto/ -Vcs-Browser: https://anonscm.debian.org/cgit/debian-science/packages/ucto.git -Vcs-Git: https://anonscm.debian.org/git/debian-science/packages/ucto.git +Vcs-Git: https://salsa.debian.org/science-team/ucto.git +Vcs-Browser: https://salsa.debian.org/science-team/ucto + Package: ucto @@ -48,7 +48,7 @@ Package: libucto-dev Section: libdevel Architecture: any -Depends: libucto2 (= ${binary:Version}), +Depends: libucto3 (= ${binary:Version}), ${misc:Depends} Conflicts: libucto1-dev Replaces: libucto1-dev @@ -74,14 +74,14 @@ do scientific research in natural language processing, ucto will likely be of use to you. -Package: libucto2 +Package: libucto3 Section: libs Architecture: any Multi-Arch: same Depends: ${misc:Depends}, ${shlibs:Depends} -Replaces: ucto (<< 0.5.5-1) -Breaks: ucto (<< 0.5.5-1) +Replaces: ucto (<< 0.5.5-1), libucto2 +Breaks: ucto (<< 0.5.5-1), libucto2 Description: Unicode Tokenizer - runtime Ucto can tokenize UTF-8 encoded text files (i.e. separate words from punctuation, split sentences, generate n-grams), and offers several other @@ -102,4 +102,3 @@ If you are interested in machine parsing of UTF-8 encoded text files, e.g. to do scientific research in natural language processing, ucto will likely be of use to you. - diff -Nru ucto-0.9.6/debian/copyright ucto-0.14/debian/copyright --- ucto-0.9.6/debian/copyright 2017-01-11 09:47:34.000000000 +0000 +++ ucto-0.14/debian/copyright 2018-12-16 19:03:00.000000000 +0000 @@ -3,7 +3,7 @@ Files: * Copyright: - Copyright © 2015 - 2017, Centre for Language and Speech Technology + Copyright © 2015 - 2018, Centre for Language and Speech Technology (Radboud University Nijmegen, The Netherlands) Copyright © 2009 - 2015, ILK Research Group (Tilburg University, The Netherlands) @@ -13,8 +13,21 @@ Files: debian/* Comment: Joost van Baal Copyright: Copyright © 2010 Joost van Baal + Copyright © 2016 - 2018 Maarten van Gompel License: GPL-3+ +Files: m4/* +Copyright: Copyright (C) 2004-2005, 2007-2009, 2011-2015 Free Software Foundation, Inc. +License: permissive + +Files: m4/ac_*.m4 +Copyright: Copyright © 2018 Ko van der Sloot +License: GPL-3+ + +Files: m4/ax_lib_readline.m4 +Copyright: Copyright (c) 2008 Ville Laurikari +License: permissive + License: GPL-3+ Ucto is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -28,3 +41,9 @@ . 
On Debian systems, the complete text of the GNU General Public License can be found in `/usr/share/common-licenses/GPL-3'. + +License: permissive + Copying and distribution of this file, with or without modification, are + permitted in any medium without royalty provided the copyright notice + and this notice are preserved. This file is offered as-is, without any + warranty diff -Nru ucto-0.9.6/debian/libucto2.install ucto-0.14/debian/libucto2.install --- ucto-0.9.6/debian/libucto2.install 2017-01-23 14:10:00.000000000 +0000 +++ ucto-0.14/debian/libucto2.install 1970-01-01 00:00:00.000000000 +0000 @@ -1,2 +0,0 @@ -/usr/share/ucto -/usr/lib/*/libucto.so.2* diff -Nru ucto-0.9.6/debian/libucto2.maintscript ucto-0.14/debian/libucto2.maintscript --- ucto-0.9.6/debian/libucto2.maintscript 2017-01-23 14:10:00.000000000 +0000 +++ ucto-0.14/debian/libucto2.maintscript 1970-01-01 00:00:00.000000000 +0000 @@ -1,6 +0,0 @@ -rm_conffile /etc/ucto/e-mail.rule 0.9.6~ -rm_conffile /etc/ucto/smiley.rule 0.9.6~ -rm_conffile /etc/ucto/url.rule 0.9.6~ -rm_conffile /etc/ucto/standard-eos.eos 0.9.6~ -rm_conffile /etc/ucto/standard-quotes.quote 0.9.6~ -rm_conffile /etc/ucto/tokconfig-generic 0.9.6~ diff -Nru ucto-0.9.6/debian/libucto3.install ucto-0.14/debian/libucto3.install --- ucto-0.9.6/debian/libucto3.install 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/debian/libucto3.install 2018-12-16 19:03:00.000000000 +0000 @@ -0,0 +1,2 @@ +/usr/share/ucto +/usr/lib/*/libucto.so.3* diff -Nru ucto-0.9.6/debian/libucto3.maintscript ucto-0.14/debian/libucto3.maintscript --- ucto-0.9.6/debian/libucto3.maintscript 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/debian/libucto3.maintscript 2018-12-16 19:03:00.000000000 +0000 @@ -0,0 +1,7 @@ +rm_conffile /etc/ucto/e-mail.rule 0.9.6~ +rm_conffile /etc/ucto/smiley.rule 0.9.6~ +rm_conffile /etc/ucto/url.rule 0.9.6~ +rm_conffile /etc/ucto/standard-eos.eos 0.9.6~ +rm_conffile /etc/ucto/standard-quotes.quote 0.9.6~ +rm_conffile /etc/ucto/tokconfig-generic 0.9.6~ +rm_conffile /etc/ucto/textcat.cfg 0.9.6-2~ diff -Nru ucto-0.9.6/debian/patches/installdatahook ucto-0.14/debian/patches/installdatahook --- ucto-0.9.6/debian/patches/installdatahook 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/debian/patches/installdatahook 2018-12-16 19:03:00.000000000 +0000 @@ -0,0 +1,18 @@ +Description: Patch for install-data-hook + The upstream install-data-hook did not take staged installation into account. Fixed. 
+ +--- ucto-0.14.orig/config/Makefile.am ++++ ucto-0.14/config/Makefile.am +@@ -5,9 +5,9 @@ configdir = $(datadir)/$(PACKAGE) + EXTRA_DIST = $(config_DATA) + + install-data-hook: +- rm -f $(configdir)/textcat.cfg ++ rm -f $(DESTDIR)$(configdir)/textcat.cfg + if OLD_LM +- $(LN_S) $(configdir)/textcat_alt.cfg $(configdir)/textcat.cfg ++ cp -f $(DESTDIR)$(configdir)/textcat_alt.cfg $(DESTDIR)$(configdir)/textcat.cfg + else +- $(LN_S) $(configdir)/textcat_normal.cfg $(configdir)/textcat.cfg ++ cp -f $(DESTDIR)$(configdir)/textcat_normal.cfg $(DESTDIR)$(configdir)/textcat.cfg + endif diff -Nru ucto-0.9.6/debian/patches/series ucto-0.14/debian/patches/series --- ucto-0.9.6/debian/patches/series 2018-11-14 02:48:31.000000000 +0000 +++ ucto-0.14/debian/patches/series 2018-12-16 19:03:00.000000000 +0000 @@ -1 +1 @@ -ucto_ICU_63.1.patch +installdatahook diff -Nru ucto-0.9.6/debian/patches/ucto_ICU_63.1.patch ucto-0.14/debian/patches/ucto_ICU_63.1.patch --- ucto-0.9.6/debian/patches/ucto_ICU_63.1.patch 2018-11-14 02:48:31.000000000 +0000 +++ ucto-0.14/debian/patches/ucto_ICU_63.1.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,1081 +0,0 @@ -Description: fix FTBFS with ICU 63.1 - Add icu namespace. -Author: Laszlo Boszormenyi (GCS) -Last-Update: 2018-11-09 - ---- - ---- ucto-0.9.6.orig/include/ucto/setting.h -+++ ucto-0.9.6/include/ucto/setting.h -@@ -38,15 +38,15 @@ namespace Tokenizer { - public: - Rule(): regexp(0){ - }; -- Rule( const UnicodeString& id, const UnicodeString& pattern); -+ Rule( const icu::UnicodeString& id, const icu::UnicodeString& pattern); - ~Rule(); -- UnicodeString id; -- UnicodeString pattern; -+ icu::UnicodeString id; -+ icu::UnicodeString pattern; - UnicodeRegexMatcher *regexp; -- bool matchAll( const UnicodeString&, -- UnicodeString&, -- UnicodeString&, -- std::vector& ); -+ bool matchAll( const icu::UnicodeString&, -+ icu::UnicodeString&, -+ icu::UnicodeString&, -+ std::vector& ); - private: - Rule( const Rule& ); // inhibit copies - Rule& operator=( const Rule& ); // inhibit copies -@@ -56,17 +56,17 @@ namespace Tokenizer { - class Quoting { - friend std::ostream& operator<<( std::ostream&, const Quoting& ); - struct QuotePair { -- UnicodeString openQuote; -- UnicodeString closeQuote; -+ icu::UnicodeString openQuote; -+ icu::UnicodeString closeQuote; - }; - public: -- void add( const UnicodeString&, const UnicodeString& ); -- UnicodeString lookupOpen( const UnicodeString &) const; -- UnicodeString lookupClose( const UnicodeString & ) const; -+ void add( const icu::UnicodeString&, const icu::UnicodeString& ); -+ icu::UnicodeString lookupOpen( const icu::UnicodeString &) const; -+ icu::UnicodeString lookupClose( const icu::UnicodeString & ) const; - bool empty() const { return _quotes.empty(); }; - bool emptyStack() const { return quotestack.empty(); }; - void clearStack() { quoteindexstack.clear(); quotestack.clear(); }; -- int lookup( const UnicodeString&, int& ); -+ int lookup( const icu::UnicodeString&, int& ); - void eraseAtPos( int pos ) { - quotestack.erase( quotestack.begin()+pos ); - quoteindexstack.erase( quoteindexstack.begin()+pos ); -@@ -90,14 +90,14 @@ namespace Tokenizer { - bool readfilters( const std::string& ); - bool readquotes( const std::string& ); - bool readeosmarkers( const std::string& ); -- bool readabbreviations( const std::string&, UnicodeString& ); -- void add_rule( const UnicodeString&, const std::vector& ); -- void sortRules( std::map&, -- const std::vector& ); -- UnicodeString eosmarkers; -+ bool readabbreviations( const std::string&, 
icu::UnicodeString& ); -+ void add_rule( const icu::UnicodeString&, const std::vector& ); -+ void sortRules( std::map&, -+ const std::vector& ); -+ icu::UnicodeString eosmarkers; - std::vector rules; -- std::map rulesmap; -- std::map rules_index; -+ std::map rulesmap; -+ std::map rules_index; - Quoting quotes; - UnicodeFilter filter; - std::string set_file; // the name of the settingsfile ---- ucto-0.9.6.orig/include/ucto/tokenize.h -+++ ucto-0.9.6/include/ucto/tokenize.h -@@ -78,11 +78,11 @@ namespace Tokenizer { - class Token { - friend std::ostream& operator<< (std::ostream&, const Token& ); - public: -- UnicodeString type; -- UnicodeString us; -+ icu::UnicodeString type; -+ icu::UnicodeString us; - TokenRole role; -- Token( const UnicodeString&, -- const UnicodeString&, -+ Token( const icu::UnicodeString&, -+ const icu::UnicodeString&, - TokenRole role = NOROLE, - const std::string& = "" ); - std::string lc; // ISO 639-3 language code -@@ -130,12 +130,12 @@ namespace Tokenizer { - - // Tokenize a line (a line is NOT a sentence, but an arbitrary string - // of characters, inclusive EOS markers, Newlines etc.) -- int tokenizeLine( const UnicodeString&, -+ int tokenizeLine( const icu::UnicodeString&, - const std::string& = "default" ); // Unicode chars - int tokenizeLine( const std::string&, - const std::string& = "default" ); // UTF8 chars - -- void passthruLine( const UnicodeString&, bool& ); -+ void passthruLine( const icu::UnicodeString&, bool& ); - void passthruLine( const std::string&, bool& ); - - //Processes tokens and initialises the sentence buffer. Returns the amount of sentences found -@@ -209,8 +209,8 @@ namespace Tokenizer { - void setLanguage( const std::string& l ){ default_language = l; }; - - // set eos marker -- UnicodeString setEosMarker( const std::string& s = "") { UnicodeString t = eosmark; eosmark = folia::UTF8ToUnicode(s); return t; }; -- UnicodeString getEosMarker( ) const { return eosmark; } -+ icu::UnicodeString setEosMarker( const std::string& s = "") { icu::UnicodeString t = eosmark; eosmark = folia::UTF8ToUnicode(s); return t; }; -+ icu::UnicodeString getEosMarker( ) const { return eosmark; } - - bool setNormSet( const std::string& ); - -@@ -255,14 +255,14 @@ namespace Tokenizer { - private: - TokenizerClass( const TokenizerClass& ); // inhibit copies - TokenizerClass& operator=( const TokenizerClass& ); // inhibit copies -- void add_rule( const UnicodeString&, -- const std::vector& ); -- void tokenizeWord( const UnicodeString&, -+ void add_rule( const icu::UnicodeString&, -+ const std::vector& ); -+ void tokenizeWord( const icu::UnicodeString&, - bool, - const std::string&, -- const UnicodeString& ="" ); -+ const icu::UnicodeString& ="" ); - -- bool detectEos( size_t, const UnicodeString&, const Quoting& ) const; -+ bool detectEos( size_t, const icu::UnicodeString&, const Quoting& ) const; - void detectSentenceBounds( const int offset, - const std::string& = "default" ); - void detectQuotedSentenceBounds( const int offset, -@@ -272,7 +272,7 @@ namespace Tokenizer { - //Signal the tokeniser that a paragraph is detected - void signalParagraph( bool b=true ) { paragraphsignal = b; }; - -- bool resolveQuote( int, const UnicodeString&, Quoting& ); -+ bool resolveQuote( int, const icu::UnicodeString&, Quoting& ); - bool u_isquote( UChar32, - const Quoting& ) const; - std::string checkBOM( std::istream& ); -@@ -289,9 +289,9 @@ namespace Tokenizer { - UnicodeNormalizer normalizer; - std::string inputEncoding; - -- UnicodeString eosmark; -+ icu::UnicodeString 
eosmark; - std::vector tokens; -- std::set norm_set; -+ std::set norm_set; - TiCC::LogStream *theErrLog; - - std::string default_language; ---- ucto-0.9.6.orig/include/ucto/unicode.h -+++ ucto-0.9.6/include/ucto/unicode.h -@@ -42,7 +42,7 @@ namespace Tokenizer { - class UnicodeNormalizer { - public: - UnicodeNormalizer(): mode(UNORM_NFC){}; -- UnicodeString normalize( const UnicodeString& ); -+ icu::UnicodeString normalize( const icu::UnicodeString& ); - std::string getMode( ) const; - std::string setMode( const std::string& ); - private: -@@ -52,34 +52,34 @@ namespace Tokenizer { - class UnicodeFilter { - friend std::ostream& operator<<( std::ostream&, const UnicodeFilter& ); - public: -- UnicodeString filter( const UnicodeString& ); -+ icu::UnicodeString filter( const icu::UnicodeString& ); - bool fill( const std::string& ); -- bool add( const UnicodeString& ); -+ bool add( const icu::UnicodeString& ); - bool add( const std::string& ); - bool empty() const { return the_map.empty(); }; - private: -- void add( UChar uc, const UnicodeString& us ) { the_map[uc] = us; }; -- std::map the_map; -+ void add( UChar uc, const icu::UnicodeString& us ) { the_map[uc] = us; }; -+ std::map the_map; - }; - - class UnicodeRegexMatcher { - public: -- UnicodeRegexMatcher( const UnicodeString&, const UnicodeString& name="" ); -+ UnicodeRegexMatcher( const icu::UnicodeString&, const icu::UnicodeString& name="" ); - ~UnicodeRegexMatcher(); -- bool match_all( const UnicodeString&, UnicodeString&, UnicodeString& ); -- const UnicodeString get_match( unsigned int ) const; -+ bool match_all( const icu::UnicodeString&, icu::UnicodeString&, icu::UnicodeString& ); -+ const icu::UnicodeString get_match( unsigned int ) const; - int NumOfMatches() const; -- int split( const UnicodeString&, std::vector& ); -- UnicodeString Pattern() const; -+ int split( const icu::UnicodeString&, std::vector& ); -+ icu::UnicodeString Pattern() const; - private: - UnicodeRegexMatcher( const UnicodeRegexMatcher& ); // inhibit copies - UnicodeRegexMatcher& operator=( const UnicodeRegexMatcher& ); // inhibit copies - std::string failString; -- RegexPattern *pattern; -- RegexMatcher *matcher; -+ icu::RegexPattern *pattern; -+ icu::RegexMatcher *matcher; - UnicodeRegexMatcher(); -- std::vector results; -- const UnicodeString _name; -+ std::vector results; -+ const icu::UnicodeString _name; - }; - - } // namespace ---- ucto-0.9.6.orig/src/setting.cxx -+++ ucto-0.9.6/src/setting.cxx -@@ -54,7 +54,7 @@ namespace Tokenizer { - ORDINALS, EOSMARKERS, QUOTES, CURRENCY, - FILTER, RULEORDER, METARULES }; - -- ConfigMode getMode( const UnicodeString& line ) { -+ ConfigMode getMode( const icu::UnicodeString& line ) { - ConfigMode mode = NONE; - if (line == "[RULES]") { - mode = RULES; -@@ -111,7 +111,7 @@ namespace Tokenizer { - public: - uConfigError( const string& s, const string& f ): - invalid_argument( "ucto: " + s + " (" + f + ")" ){}; -- uConfigError( const UnicodeString& us, const string& f ): -+ uConfigError( const icu::UnicodeString& us, const string& f ): - uConfigError( folia::UnicodeToUTF8(us), f ){}; - }; - -@@ -143,14 +143,14 @@ namespace Tokenizer { - } - } - -- void Quoting::add( const UnicodeString& o, const UnicodeString& c ){ -+ void Quoting::add( const icu::UnicodeString& o, const icu::UnicodeString& c ){ - QuotePair quote; - quote.openQuote = o; - quote.closeQuote = c; - _quotes.push_back( quote ); - } - -- int Quoting::lookup( const UnicodeString& open, int& stackindex ){ -+ int Quoting::lookup( const icu::UnicodeString& open, int& 
stackindex ){ - if (quotestack.empty() || (quotestack.size() != quoteindexstack.size())) return -1; - auto it = quotestack.crbegin(); - size_t i = quotestack.size(); -@@ -165,7 +165,7 @@ namespace Tokenizer { - return -1; - } - -- UnicodeString Quoting::lookupOpen( const UnicodeString &q ) const { -+ icu::UnicodeString Quoting::lookupOpen( const icu::UnicodeString &q ) const { - for ( const auto& quote : _quotes ){ - if ( quote.openQuote.indexOf(q) >=0 ) - return quote.closeQuote; -@@ -173,8 +173,8 @@ namespace Tokenizer { - return ""; - } - -- UnicodeString Quoting::lookupClose( const UnicodeString &q ) const { -- UnicodeString res; -+ icu::UnicodeString Quoting::lookupClose( const icu::UnicodeString &q ) const { -+ icu::UnicodeString res; - for ( const auto& quote : _quotes ){ - if ( quote.closeQuote.indexOf(q) >= 0 ) - return quote.openQuote; -@@ -186,7 +186,7 @@ namespace Tokenizer { - delete regexp; - } - -- Rule::Rule( const UnicodeString& _id, const UnicodeString& _pattern): -+ Rule::Rule( const icu::UnicodeString& _id, const icu::UnicodeString& _pattern): - id(_id), pattern(_pattern) { - regexp = new UnicodeRegexMatcher( pattern, id ); - } -@@ -200,10 +200,10 @@ namespace Tokenizer { - return os; - } - -- bool Rule::matchAll( const UnicodeString& line, -- UnicodeString& pre, -- UnicodeString& post, -- vector& matches ){ -+ bool Rule::matchAll( const icu::UnicodeString& line, -+ icu::UnicodeString& pre, -+ icu::UnicodeString& post, -+ vector& matches ){ - matches.clear(); - pre = ""; - post = ""; -@@ -244,7 +244,7 @@ namespace Tokenizer { - else { - string rawline; - while ( getline(f,rawline) ){ -- UnicodeString line = folia::UTF8ToUnicode(rawline); -+ icu::UnicodeString line = folia::UTF8ToUnicode(rawline); - line.trim(); - if ((line.length() > 0) && (line[0] != '#')) { - if ( tokDebug >= 5 ){ -@@ -255,8 +255,8 @@ namespace Tokenizer { - throw uConfigError( "invalid RULES entry: " + line, - fname ); - } -- UnicodeString id = UnicodeString( line, 0,splitpoint); -- UnicodeString pattern = UnicodeString( line, splitpoint+1); -+ icu::UnicodeString id = icu::UnicodeString( line, 0,splitpoint); -+ icu::UnicodeString pattern = icu::UnicodeString( line, splitpoint+1); - rulesmap[id] = new Rule( id, pattern); - } - } -@@ -282,7 +282,7 @@ namespace Tokenizer { - else { - string rawline; - while ( getline(f,rawline) ){ -- UnicodeString line = folia::UTF8ToUnicode(rawline); -+ icu::UnicodeString line = folia::UTF8ToUnicode(rawline); - line.trim(); - if ((line.length() > 0) && (line[0] != '#')) { - if ( tokDebug >= 5 ){ -@@ -296,8 +296,8 @@ namespace Tokenizer { - + " (missing whitespace)", - fname ); - } -- UnicodeString open = UnicodeString( line, 0,splitpoint); -- UnicodeString close = UnicodeString( line, splitpoint+1); -+ icu::UnicodeString open = icu::UnicodeString( line, 0,splitpoint); -+ icu::UnicodeString close = icu::UnicodeString( line, splitpoint+1); - open = open.trim().unescape(); - close = close.trim().unescape(); - if ( open.isEmpty() || close.isEmpty() ){ -@@ -323,7 +323,7 @@ namespace Tokenizer { - else { - string rawline; - while ( getline(f,rawline) ){ -- UnicodeString line = folia::UTF8ToUnicode(rawline); -+ icu::UnicodeString line = folia::UTF8ToUnicode(rawline); - line.trim(); - if ((line.length() > 0) && (line[0] != '#')) { - if ( tokDebug >= 5 ){ -@@ -331,7 +331,7 @@ namespace Tokenizer { - } - if ( ( line.startsWith("\\u") && line.length() == 6 ) || - ( line.startsWith("\\U") && line.length() == 10 ) ){ -- UnicodeString uit = line.unescape(); -+ icu::UnicodeString uit 
= line.unescape(); - if ( uit.isEmpty() ){ - throw uConfigError( "Invalid EOSMARKERS entry: " + line, fname ); - } -@@ -344,7 +344,7 @@ namespace Tokenizer { - } - - bool Setting::readabbreviations( const string& fname, -- UnicodeString& abbreviations ){ -+ icu::UnicodeString& abbreviations ){ - if ( tokDebug > 0 ){ - *theErrLog << "%include " << fname << endl; - } -@@ -355,7 +355,7 @@ namespace Tokenizer { - else { - string rawline; - while ( getline(f,rawline) ){ -- UnicodeString line = folia::UTF8ToUnicode(rawline); -+ icu::UnicodeString line = folia::UTF8ToUnicode(rawline); - line.trim(); - if ((line.length() > 0) && (line[0] != '#')) { - if ( tokDebug >= 5 ){ -@@ -370,17 +370,17 @@ namespace Tokenizer { - return true; - } - -- void Setting::add_rule( const UnicodeString& name, -- const vector& parts ){ -- UnicodeString pat; -+ void Setting::add_rule( const icu::UnicodeString& name, -+ const vector& parts ){ -+ icu::UnicodeString pat; - for ( auto const& part : parts ){ - pat += part; - } - rulesmap[name] = new Rule( name, pat ); - } - -- void Setting::sortRules( map& rulesmap, -- const vector& sort ){ -+ void Setting::sortRules( map& rulesmap, -+ const vector& sort ){ - // LOG << "rules voor sort : " << endl; - // for ( size_t i=0; i < rules.size(); ++i ){ - // LOG << "rule " << i << " " << *rules[i] << endl; -@@ -432,14 +432,14 @@ namespace Tokenizer { - return result; - } - -- void addOrder( vector& order, -- map& reverse_order, -+ void addOrder( vector& order, -+ map& reverse_order, - int& index, -- UnicodeString &line, -+ icu::UnicodeString &line, - const string& fn ){ - try { - UnicodeRegexMatcher m( "\\s+" ); -- vector usv; -+ vector usv; - m.split( line, usv ); - for ( const auto& us : usv ){ - if ( reverse_order.find( us ) != reverse_order.end() ){ -@@ -500,7 +500,7 @@ namespace Tokenizer { - int dbg, LogStream* ls ) { - tokDebug = dbg; - theErrLog = ls; -- map pattern = { { ABBREVIATIONS, "" }, -+ map pattern = { { ABBREVIATIONS, "" }, - { TOKENS, "" }, - { PREFIXES, "" }, - { SUFFIXES, "" }, -@@ -508,7 +508,7 @@ namespace Tokenizer { - { ATTACHEDSUFFIXES, "" }, - { UNITS, "" }, - { ORDINALS, "" } }; -- vector rules_order; -+ vector rules_order; - vector meta_rules; - string conffile = get_filename( settings_name ); - -@@ -572,7 +572,7 @@ namespace Tokenizer { - continue; - } - -- UnicodeString line = folia::UTF8ToUnicode(rawline); -+ icu::UnicodeString line = folia::UTF8ToUnicode(rawline); - line.trim(); - if ((line.length() > 0) && (line[0] != '#')) { - if (line[0] == '[') { -@@ -580,7 +580,7 @@ namespace Tokenizer { - } - else { - if ( line[0] == '\\' && line.length() > 1 && line[1] == '[' ){ -- line = UnicodeString( line, 1 ); -+ line = icu::UnicodeString( line, 1 ); - } - switch( mode ){ - case RULES: { -@@ -589,8 +589,8 @@ namespace Tokenizer { - throw uConfigError( "invalid RULES entry: " + line, - set_file ); - } -- UnicodeString id = UnicodeString( line, 0,splitpoint); -- UnicodeString pattern = UnicodeString( line, splitpoint+1); -+ icu::UnicodeString id = icu::UnicodeString( line, 0,splitpoint); -+ icu::UnicodeString pattern = icu::UnicodeString( line, splitpoint+1); - rulesmap[id] = new Rule( id, pattern); - } - break; -@@ -617,7 +617,7 @@ namespace Tokenizer { - case EOSMARKERS: - if ( ( line.startsWith("\\u") && line.length() == 6 ) || - ( line.startsWith("\\U") && line.length() == 10 ) ){ -- UnicodeString uit = line.unescape(); -+ icu::UnicodeString uit = line.unescape(); - if ( uit.isEmpty() ){ - throw uConfigError( "Invalid EOSMARKERS entry: " + line, - 
set_file ); -@@ -634,8 +634,8 @@ namespace Tokenizer { - + " (missing whitespace)", - set_file ); - } -- UnicodeString open = UnicodeString( line, 0,splitpoint); -- UnicodeString close = UnicodeString( line, splitpoint+1); -+ icu::UnicodeString open = icu::UnicodeString( line, 0,splitpoint); -+ icu::UnicodeString close = icu::UnicodeString( line, splitpoint+1); - open = open.trim().unescape(); - close = close.trim().unescape(); - if ( open.isEmpty() || close.isEmpty() ){ -@@ -702,7 +702,7 @@ namespace Tokenizer { - } - continue; - } -- UnicodeString name = folia::UTF8ToUnicode( nam ); -+ icu::UnicodeString name = folia::UTF8ToUnicode( nam ); - string rule = mr.substr( pos+1 ); - if ( tokDebug > 5 ){ - LOG << "SPLIT using: '" << split << "'" << endl; -@@ -712,11 +712,11 @@ namespace Tokenizer { - for ( auto& str : parts ){ - str = TiCC::trim( str ); - } -- vector new_parts; -- vector undef_parts; -+ vector new_parts; -+ vector undef_parts; - bool skip_rule = false; - for ( const auto& part : parts ){ -- UnicodeString meta = folia::UTF8ToUnicode( part ); -+ icu::UnicodeString meta = folia::UTF8ToUnicode( part ); - ConfigMode mode = getMode( "[" + meta + "]" ); - switch ( mode ){ - case ORDINALS: ---- ucto-0.9.6.orig/src/tokenize.cxx -+++ ucto-0.9.6/src/tokenize.cxx -@@ -88,11 +88,11 @@ namespace Tokenizer { - }; - - -- UnicodeString convert( const string& line, -+ icu::UnicodeString convert( const string& line, - const string& inputEncoding ){ -- UnicodeString result; -+ icu::UnicodeString result; - try { -- result = UnicodeString( line.c_str(), -+ result = icu::UnicodeString( line.c_str(), - line.length(), - inputEncoding.c_str() ); - } -@@ -108,17 +108,17 @@ namespace Tokenizer { - return result; - } - -- const UnicodeString type_space = "SPACE"; -- const UnicodeString type_currency = "CURRENCY"; -- const UnicodeString type_emoticon = "EMOTICON"; -- const UnicodeString type_word = "WORD"; -- const UnicodeString type_symbol = "SYMBOL"; -- const UnicodeString type_punctuation = "PUNCTUATION"; -- const UnicodeString type_number = "NUMBER"; -- const UnicodeString type_unknown = "UNKNOWN"; -+ const icu::UnicodeString type_space = "SPACE"; -+ const icu::UnicodeString type_currency = "CURRENCY"; -+ const icu::UnicodeString type_emoticon = "EMOTICON"; -+ const icu::UnicodeString type_word = "WORD"; -+ const icu::UnicodeString type_symbol = "SYMBOL"; -+ const icu::UnicodeString type_punctuation = "PUNCTUATION"; -+ const icu::UnicodeString type_number = "NUMBER"; -+ const icu::UnicodeString type_unknown = "UNKNOWN"; - -- Token::Token( const UnicodeString& _type, -- const UnicodeString& _s, -+ Token::Token( const icu::UnicodeString& _type, -+ const icu::UnicodeString& _s, - TokenRole _role, const string& _lc ): - type(_type), us(_s), role(_role), lc(_lc) {} - -@@ -226,7 +226,7 @@ namespace Tokenizer { - << "'" << endl; - } - stripCR( line ); -- UnicodeString input_line; -+ icu::UnicodeString input_line; - if ( line.size() > 0 && line[0] == 0 ){ - // when processing UTF16LE, '0' bytes show up at pos 0 - // we discard them, not for UTF16BE! 
-@@ -273,7 +273,7 @@ namespace Tokenizer { - LOG << "use textCat to guess language from: " - << input_line << endl; - } -- UnicodeString temp = input_line; -+ icu::UnicodeString temp = input_line; - temp.toLower(); - string lan = tc->get_language( folia::UnicodeToUTF8(temp) ); - if ( settings.find( lan ) != settings.end() ){ -@@ -531,7 +531,7 @@ namespace Tokenizer { - if ( root->hastext( outputclass ) ){ - return; - } -- UnicodeString utxt = root->text( outputclass, false, false ); -+ icu::UnicodeString utxt = root->text( outputclass, false, false ); - // cerr << "untok: '" << utxt << "'" << endl; - // UnicodeString txt = root->text( outputclass, true ); - // cerr << " tok: '" << txt << "'" << endl; -@@ -664,7 +664,7 @@ namespace Tokenizer { - if ( tokDebug > 0 ){ - cerr << "tokenize sentence element: " << element->id() << endl; - } -- UnicodeString line = element->stricttext( inputclass ); -+ icu::UnicodeString line = element->stricttext( inputclass ); - if ( line.isEmpty() ){ - // so no usefull text in this element. skip it - return; -@@ -837,7 +837,7 @@ namespace Tokenizer { - args["space"]= "no"; - } - folia::FoliaElement *w = new folia::Word( args, root->doc() ); -- UnicodeString out = token.us; -+ icu::UnicodeString out = token.us; - if (lowercase) { - out.toLower(); - } -@@ -898,7 +898,7 @@ namespace Tokenizer { - OUT << endl << endl; - } - } -- UnicodeString s = token.us; -+ icu::UnicodeString s = token.us; - if (lowercase) { - s = s.toLower(); - } -@@ -1118,12 +1118,12 @@ namespace Tokenizer { - quote = true; - } - else { -- UnicodeString opening = quotes.lookupOpen( c ); -+ icu::UnicodeString opening = quotes.lookupOpen( c ); - if (!opening.isEmpty()) { - quote = true; - } - else { -- UnicodeString closing = quotes.lookupClose( c ); -+ icu::UnicodeString closing = quotes.lookupClose( c ); - if (!closing.isEmpty()) { - quote = true; - } -@@ -1151,7 +1151,7 @@ namespace Tokenizer { - } - - bool TokenizerClass::resolveQuote( int endindex, -- const UnicodeString& open, -+ const icu::UnicodeString& open, - Quoting& quotes ) { - //resolve a quote - int stackindex = -1; -@@ -1250,7 +1250,7 @@ namespace Tokenizer { - } - - bool TokenizerClass::detectEos( size_t i, -- const UnicodeString& eosmarkers, -+ const icu::UnicodeString& eosmarkers, - const Quoting& quotes ) const { - bool is_eos = false; - UChar32 c = tokens[i].us.char32At(0); -@@ -1288,7 +1288,7 @@ namespace Tokenizer { - Quoting& quotes ) { - UChar32 c = tokens[i].us.char32At(0); - //Detect Quotation marks -- if ((c == '"') || ( UnicodeString(c) == """) ) { -+ if ((c == '"') || ( icu::UnicodeString(c) == """) ) { - if (tokDebug > 1 ){ - LOG << "[detectQuoteBounds] Standard double-quote (ambiguous) found @i="<< i << endl; - } -@@ -1311,7 +1311,7 @@ namespace Tokenizer { - } - } - else { -- UnicodeString close = quotes.lookupOpen( c ); -+ icu::UnicodeString close = quotes.lookupOpen( c ); - if ( !close.isEmpty() ){ // we have a opening quote - if ( tokDebug > 1 ) { - LOG << "[detectQuoteBounds] Opening quote found @i="<< i << ", pushing to stack for resolution later..." << endl; -@@ -1319,7 +1319,7 @@ namespace Tokenizer { - quotes.push( i, c ); // remember it - } - else { -- UnicodeString open = quotes.lookupClose( c ); -+ icu::UnicodeString open = quotes.lookupClose( c ); - if ( !open.isEmpty() ) { // we have a closeing quote - if (tokDebug > 1 ) { - LOG << "[detectQuoteBounds] Closing quote found @i="<< i << ", attempting to resolve..." 
<< endl; -@@ -1484,17 +1484,17 @@ namespace Tokenizer { - - void TokenizerClass::passthruLine( const string& s, bool& bos ) { - // string wrapper -- UnicodeString us = convert( s, inputEncoding );; -+ icu::UnicodeString us = convert( s, inputEncoding );; - passthruLine( us, bos ); - } - -- void TokenizerClass::passthruLine( const UnicodeString& input, bool& bos ) { -+ void TokenizerClass::passthruLine( const icu::UnicodeString& input, bool& bos ) { - if (tokDebug) { - LOG << "[passthruLine] input: line=[" << input << "]" << endl; - } - bool alpha = false, num = false, punct = false; -- UnicodeString word; -- StringCharacterIterator sit(input); -+ icu::UnicodeString word; -+ icu::StringCharacterIterator sit(input); - while ( sit.hasNext() ){ - UChar32 c = sit.current32(); - if ( u_isspace(c)) { -@@ -1514,7 +1514,7 @@ namespace Tokenizer { - bos = true; - } - else { -- UnicodeString type; -+ icu::UnicodeString type; - if (alpha && !num && !punct) { - type = type_word; - } -@@ -1577,7 +1577,7 @@ namespace Tokenizer { - tokens.back().role |= ENDOFSENTENCE; - } - else { -- UnicodeString type; -+ icu::UnicodeString type; - if (alpha && !num && !punct) { - type = type_word; - } -@@ -1653,7 +1653,7 @@ namespace Tokenizer { - // string wrapper - int TokenizerClass::tokenizeLine( const string& s, - const string& lang ){ -- UnicodeString uinputstring = convert( s, inputEncoding ); -+ icu::UnicodeString uinputstring = convert( s, inputEncoding ); - return tokenizeLine( uinputstring, lang ); - } - -@@ -1673,7 +1673,7 @@ namespace Tokenizer { - || u_charType( c ) == U_OTHER_SYMBOL; - } - -- const UnicodeString& detect_type( UChar32 c ){ -+ const icu::UnicodeString& detect_type( UChar32 c ){ - if ( u_isspace(c)) { - return type_space; - } -@@ -1768,7 +1768,7 @@ namespace Tokenizer { - } - } - -- int TokenizerClass::tokenizeLine( const UnicodeString& originput, -+ int TokenizerClass::tokenizeLine( const icu::UnicodeString& originput, - const string& _lang ){ - string lang = _lang; - if ( lang.empty() ){ -@@ -1786,7 +1786,7 @@ namespace Tokenizer { - LOG << "[tokenizeLine] input: line=[" - << originput << "] (" << lang << ")" << endl; - } -- UnicodeString input = normalizer.normalize( originput ); -+ icu::UnicodeString input = normalizer.normalize( originput ); - if ( doFilter ){ - input = settings[lang]->filter.filter( input ); - } -@@ -1808,13 +1808,13 @@ namespace Tokenizer { - bool tokenizeword = false; - bool reset = false; - //iterate over all characters -- UnicodeString word; -- StringCharacterIterator sit(input); -+ icu::UnicodeString word; -+ icu::StringCharacterIterator sit(input); - long int i = 0; - while ( sit.hasNext() ){ - UChar32 c = sit.current32(); - if ( tokDebug > 8 ){ -- UnicodeString s = c; -+ icu::UnicodeString s = c; - int8_t charT = u_charType( c ); - LOG << "examine character: " << s << " type= " - << toString( charT ) << endl; -@@ -1855,7 +1855,7 @@ namespace Tokenizer { - } - int eospos = tokens.size()-1; - if (expliciteosfound > 0) { -- UnicodeString realword; -+ icu::UnicodeString realword; - word.extract(0,expliciteosfound,realword); - if (tokDebug >= 2) { - LOG << "[tokenizeLine] Prefix before EOS: " -@@ -1865,7 +1865,7 @@ namespace Tokenizer { - eospos++; - } - if ( expliciteosfound + eosmark.length() < word.length() ){ -- UnicodeString realword; -+ icu::UnicodeString realword; - word.extract( expliciteosfound+eosmark.length(), - word.length() - expliciteosfound - eosmark.length(), - realword ); -@@ -1941,10 +1941,10 @@ namespace Tokenizer { - return numNewTokens; - } - -- 
void TokenizerClass::tokenizeWord( const UnicodeString& input, -+ void TokenizerClass::tokenizeWord( const icu::UnicodeString& input, - bool space, - const string& lang, -- const UnicodeString& assigned_type ) { -+ const icu::UnicodeString& assigned_type ) { - bool recurse = !assigned_type.isEmpty(); - - int32_t inpLen = input.countChar32(); -@@ -1977,7 +1977,7 @@ namespace Tokenizer { - if ( inpLen == 1) { - //single character, no need to process all rules, do some simpler (faster) detection - UChar32 c = input.char32At(0); -- UnicodeString type = detect_type( c ); -+ icu::UnicodeString type = detect_type( c ); - if ( type == type_space ){ - return; - } -@@ -1993,7 +1993,7 @@ namespace Tokenizer { - } - } - else { -- UnicodeString word = input; -+ icu::UnicodeString word = input; - if ( norm_set.find( type ) != norm_set.end() ){ - word = "{{" + type + "}}"; - } -@@ -2010,10 +2010,10 @@ namespace Tokenizer { - if ( tokDebug >= 4){ - LOG << "\tTESTING " << rule->id << endl; - } -- UnicodeString type = rule->id; -+ icu::UnicodeString type = rule->id; - //Find first matching rule -- UnicodeString pre, post; -- vector matches; -+ icu::UnicodeString pre, post; -+ vector matches; - if ( rule->matchAll( input, pre, post, matches ) ){ - a_rule_matched = true; - if ( tokDebug >= 4 ){ -@@ -2083,7 +2083,7 @@ namespace Tokenizer { - if ( post.length() > 0 ) { - internal_space = false; - } -- UnicodeString word = matches[m]; -+ icu::UnicodeString word = matches[m]; - if ( norm_set.find( type ) != norm_set.end() ){ - word = "{{" + type + "}}"; - tokens.push_back( Token( type, word, internal_space ? NOROLE : NOSPACE, lang ) ); ---- ucto-0.9.6.orig/src/unicode.cxx -+++ ucto-0.9.6/src/unicode.cxx -@@ -84,10 +84,10 @@ namespace Tokenizer { - return res; - } - -- UnicodeString UnicodeNormalizer::normalize( const UnicodeString& us ){ -- UnicodeString r; -+ icu::UnicodeString UnicodeNormalizer::normalize( const icu::UnicodeString& us ){ -+ icu::UnicodeString r; - UErrorCode status=U_ZERO_ERROR; -- Normalizer::normalize( us, mode, 0, r, status ); -+ icu::Normalizer::normalize( us, mode, 0, r, status ); - if (U_FAILURE(status)){ - throw std::invalid_argument("Normalizer"); - } -@@ -101,18 +101,18 @@ namespace Tokenizer { - else { - auto it=q.the_map.cbegin(); - while ( it != q.the_map.cend() ){ -- os << folia::UnicodeToUTF8(UnicodeString(it->first)) << "\t" << it->second << endl; -+ os << folia::UnicodeToUTF8(icu::UnicodeString(it->first)) << "\t" << it->second << endl; - ++it; - } - } - return os; - } - -- UnicodeString UnicodeFilter::filter( const UnicodeString& s ){ -+ icu::UnicodeString UnicodeFilter::filter( const icu::UnicodeString& s ){ - if ( empty() ) - return s; - else { -- UnicodeString result; -+ icu::UnicodeString result; - for ( int i=0; i < s.length(); ++i ){ - auto it=the_map.find(s[i]); - if ( it != the_map.cend() ) -@@ -125,16 +125,16 @@ namespace Tokenizer { - } - - bool UnicodeFilter::add( const string& s ){ -- UnicodeString line = folia::UTF8ToUnicode(s); -+ icu::UnicodeString line = folia::UTF8ToUnicode(s); - return add( line ); - } - -- bool UnicodeFilter::add( const UnicodeString& s ){ -- UnicodeString line = s; -+ bool UnicodeFilter::add( const icu::UnicodeString& s ){ -+ icu::UnicodeString line = s; - line.trim(); - if ((line.length() > 0) && (line[0] != '#')) { -- UnicodeString open = ""; -- UnicodeString close = ""; -+ icu::UnicodeString open = ""; -+ icu::UnicodeString close = ""; - int splitpoint = line.indexOf(" "); - if ( splitpoint == -1 ) - splitpoint = line.indexOf("\t"); -@@ 
-142,8 +142,8 @@ namespace Tokenizer { - open = line; - } - else { -- open = UnicodeString( line, 0,splitpoint); -- close = UnicodeString( line, splitpoint+1); -+ open = icu::UnicodeString( line, 0,splitpoint); -+ close = icu::UnicodeString( line, splitpoint+1); - } - open = open.trim().unescape(); - close = close.trim().unescape(); -@@ -175,29 +175,29 @@ namespace Tokenizer { - class uConfigError: public std::invalid_argument { - public: - uConfigError( const string& s ): invalid_argument( "ucto: config file:" + s ){}; -- uConfigError( const UnicodeString& us ): invalid_argument( "ucto: config file:" + folia::UnicodeToUTF8(us) ){}; -+ uConfigError( const icu::UnicodeString& us ): invalid_argument( "ucto: config file:" + folia::UnicodeToUTF8(us) ){}; - }; - - -- UnicodeString UnicodeRegexMatcher::Pattern() const{ -+ icu::UnicodeString UnicodeRegexMatcher::Pattern() const{ - return pattern->pattern(); - } - -- UnicodeRegexMatcher::UnicodeRegexMatcher( const UnicodeString& pat, -- const UnicodeString& name ): -+ UnicodeRegexMatcher::UnicodeRegexMatcher( const icu::UnicodeString& pat, -+ const icu::UnicodeString& name ): - _name(name) - { - failString.clear(); - matcher = NULL; - UErrorCode u_stat = U_ZERO_ERROR; - UParseError errorInfo; -- pattern = RegexPattern::compile( pat, 0, errorInfo, u_stat ); -+ pattern = icu::RegexPattern::compile( pat, 0, errorInfo, u_stat ); - if ( U_FAILURE(u_stat) ){ - string spat = folia::UnicodeToUTF8(pat); - failString = folia::UnicodeToUTF8(_name); - if ( errorInfo.offset >0 ){ - failString += " Invalid regular expression at position " + TiCC::toString( errorInfo.offset ) + "\n"; -- UnicodeString pat1 = UnicodeString( pat, 0, errorInfo.offset -1 ); -+ icu::UnicodeString pat1 = icu::UnicodeString( pat, 0, errorInfo.offset -1 ); - failString += folia::UnicodeToUTF8(pat1) + " <== HERE\n"; - } - else { -@@ -222,9 +222,9 @@ namespace Tokenizer { - - //#define MATCH_DEBUG 1 - -- bool UnicodeRegexMatcher::match_all( const UnicodeString& line, -- UnicodeString& pre, -- UnicodeString& post ){ -+ bool UnicodeRegexMatcher::match_all( const icu::UnicodeString& line, -+ icu::UnicodeString& pre, -+ icu::UnicodeString& post ){ - UErrorCode u_stat = U_ZERO_ERROR; - pre = ""; - post = ""; -@@ -243,21 +243,21 @@ namespace Tokenizer { - #endif - if ( matcher->groupCount() == 0 ){ - // case 1: a rule without capture groups matches -- UnicodeString us = matcher->group(0,u_stat) ; -+ icu::UnicodeString us = matcher->group(0,u_stat) ; - #ifdef MATCH_DEBUG - cerr << "case 1, result = " << us << endl; - #endif - results.push_back( us ); - int start = matcher->start( 0, u_stat ); - if ( start > 0 ){ -- pre = UnicodeString( line, 0, start ); -+ pre = icu::UnicodeString( line, 0, start ); - #ifdef MATCH_DEBUG - cerr << "found pre " << folia::UnicodeToUTF8(pre) << endl; - #endif - } - int end = matcher->end( 0, u_stat ); - if ( end < line.length() ){ -- post = UnicodeString( line, end ); -+ post = icu::UnicodeString( line, end ); - #ifdef MATCH_DEBUG - cerr << "found post " << folia::UnicodeToUTF8(post) << endl; - #endif -@@ -268,20 +268,20 @@ namespace Tokenizer { - // case 2: a rule with one capture group matches - int start = matcher->start( 1, u_stat ); - if ( start >= 0 ){ -- UnicodeString us = matcher->group(1,u_stat) ; -+ icu::UnicodeString us = matcher->group(1,u_stat) ; - #ifdef MATCH_DEBUG - cerr << "case 2a , result = " << us << endl; - #endif - results.push_back( us ); - if ( start > 0 ){ -- pre = UnicodeString( line, 0, start ); -+ pre = icu::UnicodeString( line, 0, start 
); - #ifdef MATCH_DEBUG - cerr << "found pre " << pre << endl; - #endif - } - int end = matcher->end( 1, u_stat ); - if ( end < line.length() ){ -- post = UnicodeString( line, end ); -+ post = icu::UnicodeString( line, end ); - #ifdef MATCH_DEBUG - cerr << "found post " << post << endl; - #endif -@@ -289,21 +289,21 @@ namespace Tokenizer { - } - else { - // group 1 is empty, return group 0 -- UnicodeString us = matcher->group(0,u_stat) ; -+ icu::UnicodeString us = matcher->group(0,u_stat) ; - #ifdef MATCH_DEBUG - cerr << "case 2b , result = " << us << endl; - #endif - results.push_back( us ); - start = matcher->start( 0, u_stat ); - if ( start > 0 ){ -- pre = UnicodeString( line, 0, start ); -+ pre = icu::UnicodeString( line, 0, start ); - #ifdef MATCH_DEBUG - cerr << "found pre " << pre << endl; - #endif - } - int end = matcher->end( 0, u_stat ); - if ( end < line.length() ){ -- post = UnicodeString( line, end ); -+ post = icu::UnicodeString( line, end ); - #ifdef MATCH_DEBUG - cerr << "found post " << post << endl; - #endif -@@ -332,7 +332,7 @@ namespace Tokenizer { - else - break; - if ( start > end ){ -- pre = UnicodeString( line, end, start ); -+ pre = icu::UnicodeString( line, end, start ); - #ifdef MATCH_DEBUG - cerr << "found pre " << folia::UnicodeToUTF8(pre) << endl; - #endif -@@ -342,7 +342,7 @@ namespace Tokenizer { - cerr << "end = " << end << endl; - #endif - if (!U_FAILURE(u_stat)){ -- results.push_back( UnicodeString( line, start, end - start ) ); -+ results.push_back( icu::UnicodeString( line, start, end - start ) ); - #ifdef MATCH_DEBUG - cerr << "added result " << folia::UnicodeToUTF8( results.back() ) << endl; - #endif -@@ -351,7 +351,7 @@ namespace Tokenizer { - break; - } - if ( end < line.length() ){ -- post = UnicodeString( line, end ); -+ post = icu::UnicodeString( line, end ); - #ifdef MATCH_DEBUG - cerr << "found post " << folia::UnicodeToUTF8(post) << endl; - #endif -@@ -364,7 +364,7 @@ namespace Tokenizer { - return false; - } - -- const UnicodeString UnicodeRegexMatcher::get_match( unsigned int n ) const{ -+ const icu::UnicodeString UnicodeRegexMatcher::get_match( unsigned int n ) const{ - if ( n < results.size() ) - return results[n]; - else -@@ -378,11 +378,11 @@ namespace Tokenizer { - return 0; - } - -- int UnicodeRegexMatcher::split( const UnicodeString& us, -- vector& result ){ -+ int UnicodeRegexMatcher::split( const icu::UnicodeString& us, -+ vector& result ){ - result.clear(); - const int maxWords = 256; -- UnicodeString words[maxWords]; -+ icu::UnicodeString words[maxWords]; - UErrorCode status = U_ZERO_ERROR; - int numWords = matcher->split( us, words, maxWords, status ); - for ( int i = 0; i < numWords; ++i ) diff -Nru ucto-0.9.6/debian/rules ucto-0.14/debian/rules --- ucto-0.9.6/debian/rules 2017-01-11 09:47:34.000000000 +0000 +++ ucto-0.14/debian/rules 2018-12-16 19:03:00.000000000 +0000 @@ -1,7 +1,9 @@ #!/usr/bin/make -f -DEB_CONFIGURE_EXTRA_FLAGS += --libdir=\$${prefix}/lib/$(DEB_HOST_MULTIARCH) +include /usr/share/dpkg/architecture.mk -include /usr/share/cdbs/1/rules/debhelper.mk -include /usr/share/cdbs/1/class/autotools.mk -include /usr/share/cdbs/1/rules/autoreconf.mk +%: + dh $@ + +override_dh_auto_test: + echo "(tests disabled because they reference /usr/share/ucto/textcat.cfg and fail to find it)" #TODO diff -Nru ucto-0.9.6/debian/ucto.maintscript ucto-0.14/debian/ucto.maintscript --- ucto-0.9.6/debian/ucto.maintscript 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/debian/ucto.maintscript 2018-12-16 19:03:00.000000000 +0000 @@ -0,0 
+1,20 @@ +rm_conffile /etc/ucto/exotic-eos.eos 0.9.6-2~ +rm_conffile /etc/ucto/nl_afk.abr 0.9.6-2~ +rm_conffile /etc/ucto/tokconfig-nl 0.9.6-2~ +rm_conffile /etc/ucto/smiley.rule 0.9.6-2~ +rm_conffile /etc/ucto/tokconfig-it 0.9.6-2~ +rm_conffile /etc/ucto/standard-eos.eos 0.9.6-2~ +rm_conffile /etc/ucto/tokconfig-sv 0.9.6-2~ +rm_conffile /etc/ucto/tokconfig-fr 0.9.6-2~ +rm_conffile /etc/ucto/exotic-quotes.quote 0.9.6-2~ +rm_conffile /etc/ucto/tokconfig-nl-twitter 0.9.6-2~ +rm_conffile /etc/ucto/tokconfig-es 0.9.6-2~ +rm_conffile /etc/ucto/url.rule 0.9.6-2~ +rm_conffile /etc/ucto/e-mail.rule 0.9.6-2~ +rm_conffile /etc/ucto/tokconfig-nl-sonarchat 0.9.6-2~ +rm_conffile /etc/ucto/es.abr 0.9.6-2~ +rm_conffile /etc/ucto/tokconfig-fy 0.9.6-2~ +rm_conffile /etc/ucto/tokconfig-de 0.9.6-2~ +rm_conffile /etc/ucto/tokconfig-en 0.9.6-2~ +rm_conffile /etc/ucto/ligatures.filter 0.9.6-2~ +rm_conffile /etc/ucto/standard-quotes.quote 0.9.6-2~ diff -Nru ucto-0.9.6/depcomp ucto-0.14/depcomp --- ucto-0.9.6/depcomp 2016-09-09 05:06:00.000000000 +0000 +++ ucto-0.14/depcomp 2018-03-25 23:44:27.000000000 +0000 @@ -1,9 +1,9 @@ #! /bin/sh # depcomp - compile a program generating dependencies as side-effects -scriptversion=2013-05-30.07; # UTC +scriptversion=2016-01-11.22; # UTC -# Copyright (C) 1999-2014 Free Software Foundation, Inc. +# Copyright (C) 1999-2017 Free Software Foundation, Inc. # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -786,6 +786,6 @@ # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" +# time-stamp-time-zone: "UTC0" # time-stamp-end: "; # UTC" # End: diff -Nru ucto-0.9.6/docs/Makefile.in ucto-0.14/docs/Makefile.in --- ucto-0.9.6/docs/Makefile.in 2017-01-23 12:52:24.000000000 +0000 +++ ucto-0.14/docs/Makefile.in 2018-11-20 09:07:54.000000000 +0000 @@ -1,7 +1,7 @@ -# Makefile.in generated by automake 1.15 from Makefile.am. +# Makefile.in generated by automake 1.15.1 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2014 Free Software Foundation, Inc. +# Copyright (C) 1994-2017 Free Software Foundation, Inc. 
# This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -92,7 +92,9 @@ host_triplet = @host@ subdir = docs ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_old_textcat.m4 \ + $(top_srcdir)/m4/ac_osx_pkg.m4 \ + $(top_srcdir)/m4/ac_search_lm.m4 \ $(top_srcdir)/m4/ax_lib_readline.m4 \ $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ @@ -189,13 +191,7 @@ FGREP = @FGREP@ GREP = @GREP@ ICU_CFLAGS = @ICU_CFLAGS@ -ICU_CONFIG = @ICU_CONFIG@ -ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@ -ICU_CXXFLAGS = @ICU_CXXFLAGS@ -ICU_IOLIBS = @ICU_IOLIBS@ -ICU_LIBPATH = @ICU_LIBPATH@ ICU_LIBS = @ICU_LIBS@ -ICU_VERSION = @ICU_VERSION@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ @@ -213,6 +209,7 @@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ +MODULE_PREFIX = @MODULE_PREFIX@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ @@ -236,6 +233,8 @@ SHELL = @SHELL@ STRIP = @STRIP@ SYSCONFDIR = @SYSCONFDIR@ +TEXTCAT_CFLAGS = @TEXTCAT_CFLAGS@ +TEXTCAT_LIBS = @TEXTCAT_LIBS@ VERSION = @VERSION@ XML2_CFLAGS = @XML2_CFLAGS@ XML2_LIBS = @XML2_LIBS@ @@ -286,6 +285,7 @@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ +runstatedir = @runstatedir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ diff -Nru ucto-0.9.6/docs/ucto.1 ucto-0.14/docs/ucto.1 --- ucto-0.9.6/docs/ucto.1 2017-01-23 12:18:29.000000000 +0000 +++ ucto-0.14/docs/ucto.1 2018-11-13 16:30:47.000000000 +0000 @@ -1,4 +1,4 @@ -.TH ucto 1 "2014 december 2" +.TH ucto 1 "2018 nov 13" .SH NAME ucto \- Unicode Tokenizer @@ -33,18 +33,31 @@ set UTF8 output normalization. (default NFC) .RE +.BR \-\-filter =[YES|NO] +.RS +disable filtering of special characters, (default YES) +These special characters can be specified in the [FILTER] block of the +configuration file. +.RE + .BR \-f .RS -disable filtering of special characters +OBSOLETE. use --filter=NO .RE .BR \-L " language" .RS - Automatically selects a configuration file by language code. - The language code is generally a three-letter iso-639-3 code. +Automatically selects a configuration file by language code. +The language code is generally a three-letter iso-639-3 code. For example, 'fra' will select the file tokconfig\(hyfra from the installation directory .RE +.BR \-\-detectlanguages = +.RS +try to detect all the specified languages. The default language will be 'lang1'. +(only useful for FoLiA output) +.RE + .BR \-l .RS Convert to all lowercase @@ -65,6 +78,17 @@ Assume one sentence per line on input .RE +.BR \-\-normalize =class1,class2,..,classn +.RS +map all occurrences of tokens with class1,...class to their generic names. e.g \-\-normalize=DATE will map all dates to the word {{DATE}}. Very useful to normalize tokens like URL's, DATE's, E\-mail addresses and so on. +.RE + +.BR \-\-add\-tokens ="file" +.RS +Add additional tokens to the [TOKENS] block of the default language. +The file should contain one TOKEN per line. +.RE + .BR \-\-passthru .RS Don't tokenize, but perform input decoding and simple token role detection @@ -72,7 +96,7 @@ .BR \-\-filterpunct .RS -remove most of the punctuation from the output. (not from abreviations!) +remove most of the punctuation from the output. 
(not from abreviations and embeddded punctuation like John's) .RE .B \-P @@ -85,11 +109,6 @@ Enable Quote Detection. (this is experimental and may lead to unexpected results) .RE -.B \-S -.RS -Disable Sentence Detection -.RE - .B \-s .RS @@ -108,17 +127,33 @@ .B \-F .RS -Read a FoLiA XML document, tokenize it, and output the modified doc. (this disables usage of most other options: \-nulPQvsS) +Read a FoLiA XML document, tokenize it, and output the modified doc. (this disables usage of most other options: \-nPQvs) +For files with an '.xml' extension, \-F is the default. .RE -.BR \-\-textclass "cls" +.BR \-\-inputclass ="cls" .RS -When tokenizing a FoLiA XML document, search for text nodes of class 'cls' +When tokenizing a FoLiA XML document, search for text nodes of class 'cls'. +The default is "current". +.RE + +.BR \-\-outputclass ="cls" +.RS +When tokenizing a FoLiA XML document, output the tokenized text in text nodes with 'cls'. +The default is "current". +It is recommended to have different classes for input and output. +.RE + +.BR \-\-textclass ="cls" (obsolete) +.RS +use 'cls' for input and output of text from FoLiA. Equivalent to both \-\-inputclass='cls' and \-\-outputclass='cls') + +This option is obsolete and NOT recommended. Please use the separate \-\-inputclass= and \-\-outputclass options. .RE .B \-X .RS -Output FoLiA XML. (this disables usage of most other options: \-nulPQvsS) +Output FoLiA XML. (this disables usage of most other options: \-nPQvs) .RE .B \-\-id @@ -131,7 +166,7 @@ .B (obsolete) .RS -Output FoLiA XML, use the specified Document ID. (this disables usage of most other options: \-nulPQvsS) +Output FoLiA XML, use the specified Document ID. (this disables usage of most other options: \-nPQvs). .B obsolete Use diff -Nru ucto-0.9.6/include/Makefile.in ucto-0.14/include/Makefile.in --- ucto-0.9.6/include/Makefile.in 2017-01-23 12:52:24.000000000 +0000 +++ ucto-0.14/include/Makefile.in 2018-11-20 09:07:54.000000000 +0000 @@ -1,7 +1,7 @@ -# Makefile.in generated by automake 1.15 from Makefile.am. +# Makefile.in generated by automake 1.15.1 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2014 Free Software Foundation, Inc. +# Copyright (C) 1994-2017 Free Software Foundation, Inc. 
# This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -91,7 +91,9 @@ host_triplet = @host@ subdir = include ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_old_textcat.m4 \ + $(top_srcdir)/m4/ac_osx_pkg.m4 \ + $(top_srcdir)/m4/ac_search_lm.m4 \ $(top_srcdir)/m4/ax_lib_readline.m4 \ $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ @@ -217,13 +219,7 @@ FGREP = @FGREP@ GREP = @GREP@ ICU_CFLAGS = @ICU_CFLAGS@ -ICU_CONFIG = @ICU_CONFIG@ -ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@ -ICU_CXXFLAGS = @ICU_CXXFLAGS@ -ICU_IOLIBS = @ICU_IOLIBS@ -ICU_LIBPATH = @ICU_LIBPATH@ ICU_LIBS = @ICU_LIBS@ -ICU_VERSION = @ICU_VERSION@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ @@ -241,6 +237,7 @@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ +MODULE_PREFIX = @MODULE_PREFIX@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ @@ -264,6 +261,8 @@ SHELL = @SHELL@ STRIP = @STRIP@ SYSCONFDIR = @SYSCONFDIR@ +TEXTCAT_CFLAGS = @TEXTCAT_CFLAGS@ +TEXTCAT_LIBS = @TEXTCAT_LIBS@ VERSION = @VERSION@ XML2_CFLAGS = @XML2_CFLAGS@ XML2_LIBS = @XML2_LIBS@ @@ -314,6 +313,7 @@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ +runstatedir = @runstatedir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ diff -Nru ucto-0.9.6/include/ucto/Makefile.am ucto-0.14/include/ucto/Makefile.am --- ucto-0.9.6/include/ucto/Makefile.am 2017-01-12 13:39:38.000000000 +0000 +++ ucto-0.14/include/ucto/Makefile.am 2018-11-13 16:30:47.000000000 +0000 @@ -1 +1 @@ -pkginclude_HEADERS = textcat.h setting.h tokenize.h unicode.h +pkginclude_HEADERS = my_textcat.h setting.h tokenize.h diff -Nru ucto-0.9.6/include/ucto/Makefile.in ucto-0.14/include/ucto/Makefile.in --- ucto-0.9.6/include/ucto/Makefile.in 2017-01-23 12:52:24.000000000 +0000 +++ ucto-0.14/include/ucto/Makefile.in 2018-11-20 09:07:54.000000000 +0000 @@ -1,7 +1,7 @@ -# Makefile.in generated by automake 1.15 from Makefile.am. +# Makefile.in generated by automake 1.15.1 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2014 Free Software Foundation, Inc. +# Copyright (C) 1994-2017 Free Software Foundation, Inc. 
# This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -90,7 +90,9 @@ host_triplet = @host@ subdir = include/ucto ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_old_textcat.m4 \ + $(top_srcdir)/m4/ac_osx_pkg.m4 \ + $(top_srcdir)/m4/ac_search_lm.m4 \ $(top_srcdir)/m4/ax_lib_readline.m4 \ $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ @@ -204,13 +206,7 @@ FGREP = @FGREP@ GREP = @GREP@ ICU_CFLAGS = @ICU_CFLAGS@ -ICU_CONFIG = @ICU_CONFIG@ -ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@ -ICU_CXXFLAGS = @ICU_CXXFLAGS@ -ICU_IOLIBS = @ICU_IOLIBS@ -ICU_LIBPATH = @ICU_LIBPATH@ ICU_LIBS = @ICU_LIBS@ -ICU_VERSION = @ICU_VERSION@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ @@ -228,6 +224,7 @@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ +MODULE_PREFIX = @MODULE_PREFIX@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ @@ -251,6 +248,8 @@ SHELL = @SHELL@ STRIP = @STRIP@ SYSCONFDIR = @SYSCONFDIR@ +TEXTCAT_CFLAGS = @TEXTCAT_CFLAGS@ +TEXTCAT_LIBS = @TEXTCAT_LIBS@ VERSION = @VERSION@ XML2_CFLAGS = @XML2_CFLAGS@ XML2_LIBS = @XML2_LIBS@ @@ -301,6 +300,7 @@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ +runstatedir = @runstatedir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ @@ -313,7 +313,7 @@ top_srcdir = @top_srcdir@ uctodata_CFLAGS = @uctodata_CFLAGS@ uctodata_LIBS = @uctodata_LIBS@ -pkginclude_HEADERS = textcat.h setting.h tokenize.h unicode.h +pkginclude_HEADERS = my_textcat.h setting.h tokenize.h all: all-am .SUFFIXES: diff -Nru ucto-0.9.6/include/ucto/my_textcat.h ucto-0.14/include/ucto/my_textcat.h --- ucto-0.9.6/include/ucto/my_textcat.h 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/include/ucto/my_textcat.h 2018-11-13 16:30:47.000000000 +0000 @@ -0,0 +1,59 @@ +/* + Copyright (c) 2018 + CLST - Radboud University + ILK - Tilburg University + + This file is part of Ucto + + Ucto is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + Ucto is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+ + For questions and suggestions, see: + https://github.com/LanguageMachines/ucto/issues + or send mail to: + lamasoftware (at ) science.ru.nl +*/ + +#ifndef TEXTCAT_H +#define TEXTCAT_H + +#include + +#ifdef HAVE_TEXTCAT + #ifdef HAVE_OLD_TEXTCAT + #ifdef __cplusplus +extern "C" { + #endif + #endif +#include "textcat.h" + #ifdef HAVE_OLD_TEXTCAT + #ifdef __cplusplus +} + #endif + #endif +#endif + +class TextCat { + public: + explicit TextCat( const std::string& ); + TextCat( const TextCat& ); + ~TextCat(); + bool isInit() const { return TC != 0; }; + std::string get_language( const std::string& ) const; + std::vector get_languages( const std::string& ) const; + private: + void *TC; + std::string cfName; +}; + +#endif // TEXTCAT_H diff -Nru ucto-0.9.6/include/ucto/setting.h ucto-0.14/include/ucto/setting.h --- ucto-0.9.6/include/ucto/setting.h 2017-01-12 13:39:38.000000000 +0000 +++ ucto-0.14/include/ucto/setting.h 2018-11-15 16:37:00.000000000 +0000 @@ -1,5 +1,5 @@ /* - Copyright (c) 2006 - 2017 + Copyright (c) 2006 - 2018 CLST - Radboud University ILK - Tilburg University @@ -29,10 +29,14 @@ namespace TiCC { class LogStream; + class UnicodeRegexMatcher; + class UniFilter; } namespace Tokenizer { + using namespace icu; + class Rule { friend std::ostream& operator<< (std::ostream&, const Rule& ); public: @@ -42,7 +46,7 @@ ~Rule(); UnicodeString id; UnicodeString pattern; - UnicodeRegexMatcher *regexp; + TiCC::UnicodeRegexMatcher *regexp; bool matchAll( const UnicodeString&, UnicodeString&, UnicodeString&, @@ -85,21 +89,23 @@ class Setting { public: ~Setting(); - bool read( const std::string&, int, TiCC::LogStream* ); + bool read( const std::string&, const std::string&, int, TiCC::LogStream* ); bool readrules( const std::string& ); bool readfilters( const std::string& ); bool readquotes( const std::string& ); bool readeosmarkers( const std::string& ); bool readabbreviations( const std::string&, UnicodeString& ); - void add_rule( const UnicodeString&, const std::vector& ); + void add_rule( const UnicodeString&, + const std::vector& ); void sortRules( std::map&, const std::vector& ); + static std::set installed_languages(); UnicodeString eosmarkers; std::vector rules; std::map rulesmap; std::map rules_index; Quoting quotes; - UnicodeFilter filter; + TiCC::UniFilter filter; std::string set_file; // the name of the settingsfile std::string version; // the version of the datafile int tokDebug; diff -Nru ucto-0.9.6/include/ucto/textcat.h ucto-0.14/include/ucto/textcat.h --- ucto-0.9.6/include/ucto/textcat.h 2017-01-12 13:39:38.000000000 +0000 +++ ucto-0.14/include/ucto/textcat.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,69 +0,0 @@ -/* - Copyright (c) 2017 - CLST - Radboud University - ILK - Tilburg University - - This file is part of Ucto - - Ucto is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - Ucto is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
- - For questions and suggestions, see: - https://github.com/LanguageMachines/ucto/issues - or send mail to: - lamasoftware (at ) science.ru.nl -*/ - -#ifndef TEXTCAT_H -#define TEXTCAT_H - -#include - -#ifdef HAVE_TEXTCAT_H -#define ENABLE_TEXTCAT -#ifdef __cplusplus -extern "C" { -#endif - -#include "textcat.h" - -#ifdef __cplusplus -} -#endif - -#else -#ifdef HAVE_LIBTEXTCAT_TEXTCAT_H -#include "libtextcat/textcat.h" -#define ENABLE_TEXTCAT -#else -#ifdef HAVE_LIBEXTTEXTCAT_TEXTCAT_H -#include "libexttextcat/textcat.h" -#define ENABLE_TEXTCAT -#endif -#endif -#endif - -class TextCat { - public: - TextCat( const std::string& cf ); - TextCat( const TextCat& in ); - ~TextCat(); - bool isInit() const { return TC != 0; }; - std::string get_language( const std::string& ) const; - std::vector get_languages( const std::string& ) const; - private: - void *TC; - std::string cfName; -}; - -#endif // TEXTCAT_H diff -Nru ucto-0.9.6/include/ucto/tokenize.h ucto-0.14/include/ucto/tokenize.h --- ucto-0.9.6/include/ucto/tokenize.h 2017-01-12 13:39:38.000000000 +0000 +++ ucto-0.14/include/ucto/tokenize.h 2018-11-15 16:36:17.000000000 +0000 @@ -1,5 +1,5 @@ /* - Copyright (c) 2006 - 2017 + Copyright (c) 2006 - 2018 CLST - Radboud University ILK - Tilburg University @@ -28,17 +28,21 @@ #define UCTO_TOKENIZE_H #include +#include #include #include #include -#include "ucto/unicode.h" -#include "ucto/setting.h" +#include "libfolia/folia.h" #include "ticcutils/LogStream.h" +#include "ticcutils/Unicode.h" +#include "ucto/setting.h" class TextCat; namespace Tokenizer { + using namespace icu; + std::string Version(); std::string VersionName(); @@ -51,8 +55,7 @@ BEGINQUOTE = 16, ENDQUOTE = 32, TEMPENDOFSENTENCE = 64, - LISTITEM = 128, //reserved for future use - TITLE = 256 //reserved for future use + LINEBREAK = 128 }; std::ostream& operator<<( std::ostream&, const TokenRole& ); @@ -96,8 +99,10 @@ public: TokenizerClass(); ~TokenizerClass(); - bool init( const std::string& ); // init from a configfile - bool init( const std::vector& ); // init 1 or more languages + bool init( const std::string&, + const std::string& ="" ); // init from a configfile + bool init( const std::vector&, + const std::string& ="" ); // init 1 or more languages bool reset( const std::string& = "default" ); void setErrorLog( TiCC::LogStream *os ); @@ -109,12 +114,16 @@ //Tokenize from input stream to a vector of Tokens // non greedy. Stops after the first full sentence is detected. + // NOTE: may return more then one sentence, when more sentences are present + // on 1 line! // should be called multiple times until EOF std::vector tokenizeStream( std::istream&, const std::string& = "default" ); // Tokenize from an input stream to a UTF8 string (representing a sentence) // non greedy. Stops after the first full sentence is detected. + // NOTE: may return more then one sentence, when more sentences are present + // on 1 line! 
// should be called multiple times until EOF std::string tokenizeSentenceStream( std::istream&, const std::string& = "default" ); @@ -149,6 +158,7 @@ //return the sentence with the specified index in a Token vector; std::vector getSentence( int ); + void extractSentencesAndFlush( int, std::vector&, const std::string& ); //Get all sentences as a vector of strings (UTF-8 encoded) std::vector getSentences(); @@ -173,9 +183,9 @@ bool setUppercase( bool b=true ) { bool t = uppercase; uppercase = b; if (b) lowercase = false; return t; }; bool getUppercase() const { return uppercase; } - //Enable sentence-bound detection - bool setSentenceDetection( bool b=true ) { bool t = detectBounds; detectBounds = b; return t; } - bool getSentenceDetection() const { return detectBounds; } + //Enable sentence splitting only + bool setSentenceSplit( bool b=true ) { bool t = splitOnly; splitOnly = b; return t; } + bool getSentenceSplit() const { return splitOnly; } //Enable paragraph detection bool setParagraphDetection( bool b=true ) { bool t = detectPar; detectPar = b; return t; } @@ -185,6 +195,10 @@ bool setQuoteDetection( bool b=true ) { bool t = detectQuotes; detectQuotes = b; return t; } bool getQuoteDetection() const { return detectQuotes; } + //Enable language detection + bool setLangDetection( bool b=true ) { bool t = doDetectLang; doDetectLang = b; return t; } + bool getLangDetection() const { return doDetectLang; } + //Enable filtering bool setFiltering( bool b=true ) { bool t = doFilter; doFilter = b; return t; @@ -196,6 +210,8 @@ } bool getPunctFilter() const { return doPunctFilter; }; + std::string setTextRedundancy( const std::string& ); + // set normalization mode std::string setNormalization( const std::string& s ) { return normalizer.setMode( s ); @@ -209,7 +225,7 @@ void setLanguage( const std::string& l ){ default_language = l; }; // set eos marker - UnicodeString setEosMarker( const std::string& s = "") { UnicodeString t = eosmark; eosmark = folia::UTF8ToUnicode(s); return t; }; + UnicodeString setEosMarker( const std::string& s = "") { UnicodeString t = eosmark; eosmark = TiCC::UnicodeFromUTF8(s); return t; }; UnicodeString getEosMarker( ) const { return eosmark; } bool setNormSet( const std::string& ); @@ -227,6 +243,7 @@ const std::string setTextClass( const std::string& cls) { std::string res = inputclass; inputclass = cls; + outputclass = cls; return res; } const std::string getInputClass( ) const { return inputclass; } @@ -261,6 +278,9 @@ bool, const std::string&, const UnicodeString& ="" ); + int tokenizeLine( const UnicodeString&, + const std::string&, + const std::string& ); bool detectEos( size_t, const UnicodeString&, const Quoting& ) const; void detectSentenceBounds( const int offset, @@ -276,7 +296,6 @@ bool u_isquote( UChar32, const Quoting& ) const; std::string checkBOM( std::istream& ); - void outputTokensDoc( folia::Document&, const std::vector& ) const; void outputTokensDoc_init( folia::Document& ) const; int outputTokensXML( folia::FoliaElement *, @@ -286,7 +305,7 @@ void tokenizeSentenceElement( folia::FoliaElement *, const std::string& ); - UnicodeNormalizer normalizer; + TiCC::UnicodeNormalizer normalizer; std::string inputEncoding; UnicodeString eosmark; @@ -303,9 +322,6 @@ //verbose tokenisation mode bool verbose; - //detect sentence bounds? - bool detectBounds; - //detect quotes? bool detectQuotes; @@ -315,12 +331,22 @@ //filter all punctuation characters (default off) bool doPunctFilter; + // only sentence spliiting? + bool splitOnly; + //detect paragraphs? 
bool detectPar; //has a paragraph been signaled? bool paragraphsignal; + //do we attempt to assign languages? + bool doDetectLang; + + //do we percolate text up from to and
nodes? (FoLiA) + // values should be: 'full', 'minimal' or 'none' + std::string text_redundancy; + //one sentence per line output bool sentenceperlineoutput; bool sentenceperlineinput; diff -Nru ucto-0.9.6/include/ucto/unicode.h ucto-0.14/include/ucto/unicode.h --- ucto-0.9.6/include/ucto/unicode.h 2017-01-12 13:39:38.000000000 +0000 +++ ucto-0.14/include/ucto/unicode.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,87 +0,0 @@ -/* - Copyright (c) 2006 - 2017 - CLST - Radboud University - ILK - Tilburg University - - This file is part of Ucto - - Ucto is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - Ucto is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - - For questions and suggestions, see: - https://github.com/LanguageMachines/ucto/issues - or send mail to: - lamasoftware (at ) science.ru.nl -*/ - -#ifndef UCTO_UNICODE_H -#define UCTO_UNICODE_H - -#include -#include -#include -#include "unicode/ucnv.h" -#include "unicode/ustream.h" -#include "unicode/uchar.h" -#include "unicode/unistr.h" -#include "unicode/normlzr.h" -#include "unicode/regex.h" - -namespace Tokenizer { - - class UnicodeNormalizer { - public: - UnicodeNormalizer(): mode(UNORM_NFC){}; - UnicodeString normalize( const UnicodeString& ); - std::string getMode( ) const; - std::string setMode( const std::string& ); - private: - UNormalizationMode mode; - }; - - class UnicodeFilter { - friend std::ostream& operator<<( std::ostream&, const UnicodeFilter& ); - public: - UnicodeString filter( const UnicodeString& ); - bool fill( const std::string& ); - bool add( const UnicodeString& ); - bool add( const std::string& ); - bool empty() const { return the_map.empty(); }; - private: - void add( UChar uc, const UnicodeString& us ) { the_map[uc] = us; }; - std::map the_map; - }; - - class UnicodeRegexMatcher { - public: - UnicodeRegexMatcher( const UnicodeString&, const UnicodeString& name="" ); - ~UnicodeRegexMatcher(); - bool match_all( const UnicodeString&, UnicodeString&, UnicodeString& ); - const UnicodeString get_match( unsigned int ) const; - int NumOfMatches() const; - int split( const UnicodeString&, std::vector& ); - UnicodeString Pattern() const; - private: - UnicodeRegexMatcher( const UnicodeRegexMatcher& ); // inhibit copies - UnicodeRegexMatcher& operator=( const UnicodeRegexMatcher& ); // inhibit copies - std::string failString; - RegexPattern *pattern; - RegexMatcher *matcher; - UnicodeRegexMatcher(); - std::vector results; - const UnicodeString _name; - }; - -} // namespace - -#endif // UCTO_UNICODE_H diff -Nru ucto-0.9.6/INSTALL ucto-0.14/INSTALL --- ucto-0.9.6/INSTALL 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/INSTALL 2018-03-25 23:44:27.000000000 +0000 @@ -0,0 +1,368 @@ +Installation Instructions +************************* + + Copyright (C) 1994-1996, 1999-2002, 2004-2016 Free Software +Foundation, Inc. + + Copying and distribution of this file, with or without modification, +are permitted in any medium without royalty provided the copyright +notice and this notice are preserved. This file is offered as-is, +without warranty of any kind. 
+ +Basic Installation +================== + + Briefly, the shell command './configure && make && make install' +should configure, build, and install this package. The following +more-detailed instructions are generic; see the 'README' file for +instructions specific to this package. Some packages provide this +'INSTALL' file but do not implement all of the features documented +below. The lack of an optional feature in a given package is not +necessarily a bug. More recommendations for GNU packages can be found +in *note Makefile Conventions: (standards)Makefile Conventions. + + The 'configure' shell script attempts to guess correct values for +various system-dependent variables used during compilation. It uses +those values to create a 'Makefile' in each directory of the package. +It may also create one or more '.h' files containing system-dependent +definitions. Finally, it creates a shell script 'config.status' that +you can run in the future to recreate the current configuration, and a +file 'config.log' containing compiler output (useful mainly for +debugging 'configure'). + + It can also use an optional file (typically called 'config.cache' and +enabled with '--cache-file=config.cache' or simply '-C') that saves the +results of its tests to speed up reconfiguring. Caching is disabled by +default to prevent problems with accidental use of stale cache files. + + If you need to do unusual things to compile the package, please try +to figure out how 'configure' could check whether to do them, and mail +diffs or instructions to the address given in the 'README' so they can +be considered for the next release. If you are using the cache, and at +some point 'config.cache' contains results you don't want to keep, you +may remove or edit it. + + The file 'configure.ac' (or 'configure.in') is used to create +'configure' by a program called 'autoconf'. You need 'configure.ac' if +you want to change it or regenerate 'configure' using a newer version of +'autoconf'. + + The simplest way to compile this package is: + + 1. 'cd' to the directory containing the package's source code and type + './configure' to configure the package for your system. + + Running 'configure' might take a while. While running, it prints + some messages telling which features it is checking for. + + 2. Type 'make' to compile the package. + + 3. Optionally, type 'make check' to run any self-tests that come with + the package, generally using the just-built uninstalled binaries. + + 4. Type 'make install' to install the programs and any data files and + documentation. When installing into a prefix owned by root, it is + recommended that the package be configured and built as a regular + user, and only the 'make install' phase executed with root + privileges. + + 5. Optionally, type 'make installcheck' to repeat any self-tests, but + this time using the binaries in their final installed location. + This target does not install anything. Running this target as a + regular user, particularly if the prior 'make install' required + root privileges, verifies that the installation completed + correctly. + + 6. You can remove the program binaries and object files from the + source code directory by typing 'make clean'. To also remove the + files that 'configure' created (so you can compile the package for + a different kind of computer), type 'make distclean'. There is + also a 'make maintainer-clean' target, but that is intended mainly + for the package's developers. 
If you use it, you may have to get + all sorts of other programs in order to regenerate files that came + with the distribution. + + 7. Often, you can also type 'make uninstall' to remove the installed + files again. In practice, not all packages have tested that + uninstallation works correctly, even though it is required by the + GNU Coding Standards. + + 8. Some packages, particularly those that use Automake, provide 'make + distcheck', which can by used by developers to test that all other + targets like 'make install' and 'make uninstall' work correctly. + This target is generally not run by end users. + +Compilers and Options +===================== + + Some systems require unusual options for compilation or linking that +the 'configure' script does not know about. Run './configure --help' +for details on some of the pertinent environment variables. + + You can give 'configure' initial values for configuration parameters +by setting variables in the command line or in the environment. Here is +an example: + + ./configure CC=c99 CFLAGS=-g LIBS=-lposix + + *Note Defining Variables::, for more details. + +Compiling For Multiple Architectures +==================================== + + You can compile the package for more than one kind of computer at the +same time, by placing the object files for each architecture in their +own directory. To do this, you can use GNU 'make'. 'cd' to the +directory where you want the object files and executables to go and run +the 'configure' script. 'configure' automatically checks for the source +code in the directory that 'configure' is in and in '..'. This is known +as a "VPATH" build. + + With a non-GNU 'make', it is safer to compile the package for one +architecture at a time in the source code directory. After you have +installed the package for one architecture, use 'make distclean' before +reconfiguring for another architecture. + + On MacOS X 10.5 and later systems, you can create libraries and +executables that work on multiple system types--known as "fat" or +"universal" binaries--by specifying multiple '-arch' options to the +compiler but only a single '-arch' option to the preprocessor. Like +this: + + ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ + CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ + CPP="gcc -E" CXXCPP="g++ -E" + + This is not guaranteed to produce working output in all cases, you +may have to build one architecture at a time and combine the results +using the 'lipo' tool if you have problems. + +Installation Names +================== + + By default, 'make install' installs the package's commands under +'/usr/local/bin', include files under '/usr/local/include', etc. You +can specify an installation prefix other than '/usr/local' by giving +'configure' the option '--prefix=PREFIX', where PREFIX must be an +absolute file name. + + You can specify separate installation prefixes for +architecture-specific files and architecture-independent files. If you +pass the option '--exec-prefix=PREFIX' to 'configure', the package uses +PREFIX as the prefix for installing programs and libraries. +Documentation and other data files still use the regular prefix. + + In addition, if you use an unusual directory layout you can give +options like '--bindir=DIR' to specify different values for particular +kinds of files. Run 'configure --help' for a list of the directories +you can set and what kinds of files go in them. 
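As a concrete sketch of the generic recipe above (an out-of-tree VPATH build, with compiler variables and an installation prefix chosen at configure time; the directory and flag values are only examples):

   mkdir build && cd build
   ../configure CXX=g++ CXXFLAGS="-O2 -g" --prefix="$HOME/opt/ucto"
   make
   make check
   make install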
In general, the default +for these options is expressed in terms of '${prefix}', so that +specifying just '--prefix' will affect all of the other directory +specifications that were not explicitly provided. + + The most portable way to affect installation locations is to pass the +correct locations to 'configure'; however, many packages provide one or +both of the following shortcuts of passing variable assignments to the +'make install' command line to change installation locations without +having to reconfigure or recompile. + + The first method involves providing an override variable for each +affected directory. For example, 'make install +prefix=/alternate/directory' will choose an alternate location for all +directory configuration variables that were expressed in terms of +'${prefix}'. Any directories that were specified during 'configure', +but not in terms of '${prefix}', must each be overridden at install time +for the entire installation to be relocated. The approach of makefile +variable overrides for each directory variable is required by the GNU +Coding Standards, and ideally causes no recompilation. However, some +platforms have known limitations with the semantics of shared libraries +that end up requiring recompilation when using this method, particularly +noticeable in packages that use GNU Libtool. + + The second method involves providing the 'DESTDIR' variable. For +example, 'make install DESTDIR=/alternate/directory' will prepend +'/alternate/directory' before all installation names. The approach of +'DESTDIR' overrides is not required by the GNU Coding Standards, and +does not work on platforms that have drive letters. On the other hand, +it does better at avoiding recompilation issues, and works well even +when some directory options were not specified in terms of '${prefix}' +at 'configure' time. + +Optional Features +================= + + If the package supports it, you can cause programs to be installed +with an extra prefix or suffix on their names by giving 'configure' the +option '--program-prefix=PREFIX' or '--program-suffix=SUFFIX'. + + Some packages pay attention to '--enable-FEATURE' options to +'configure', where FEATURE indicates an optional part of the package. +They may also pay attention to '--with-PACKAGE' options, where PACKAGE +is something like 'gnu-as' or 'x' (for the X Window System). The +'README' should mention any '--enable-' and '--with-' options that the +package recognizes. + + For packages that use the X Window System, 'configure' can usually +find the X include and library files automatically, but if it doesn't, +you can use the 'configure' options '--x-includes=DIR' and +'--x-libraries=DIR' to specify their locations. + + Some packages offer the ability to configure how verbose the +execution of 'make' will be. For these packages, running './configure +--enable-silent-rules' sets the default to minimal output, which can be +overridden with 'make V=1'; while running './configure +--disable-silent-rules' sets the default to verbose, which can be +overridden with 'make V=0'. + +Particular systems +================== + + On HP-UX, the default C compiler is not ANSI C compatible. If GNU CC +is not installed, it is recommended to use the following options in +order to use an ANSI C compiler: + + ./configure CC="cc -Ae -D_XOPEN_SOURCE=500" + +and if that doesn't work, install pre-built binaries of GCC for HP-UX. 
+ + HP-UX 'make' updates targets which have the same time stamps as their +prerequisites, which makes it generally unusable when shipped generated +files such as 'configure' are involved. Use GNU 'make' instead. + + On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot +parse its '' header file. The option '-nodtk' can be used as a +workaround. If GNU CC is not installed, it is therefore recommended to +try + + ./configure CC="cc" + +and if that doesn't work, try + + ./configure CC="cc -nodtk" + + On Solaris, don't put '/usr/ucb' early in your 'PATH'. This +directory contains several dysfunctional programs; working variants of +these programs are available in '/usr/bin'. So, if you need '/usr/ucb' +in your 'PATH', put it _after_ '/usr/bin'. + + On Haiku, software installed for all users goes in '/boot/common', +not '/usr/local'. It is recommended to use the following options: + + ./configure --prefix=/boot/common + +Specifying the System Type +========================== + + There may be some features 'configure' cannot figure out +automatically, but needs to determine by the type of machine the package +will run on. Usually, assuming the package is built to be run on the +_same_ architectures, 'configure' can figure that out, but if it prints +a message saying it cannot guess the machine type, give it the +'--build=TYPE' option. TYPE can either be a short name for the system +type, such as 'sun4', or a canonical name which has the form: + + CPU-COMPANY-SYSTEM + +where SYSTEM can have one of these forms: + + OS + KERNEL-OS + + See the file 'config.sub' for the possible values of each field. If +'config.sub' isn't included in this package, then this package doesn't +need to know the machine type. + + If you are _building_ compiler tools for cross-compiling, you should +use the option '--target=TYPE' to select the type of system they will +produce code for. + + If you want to _use_ a cross compiler, that generates code for a +platform different from the build platform, you should specify the +"host" platform (i.e., that on which the generated programs will +eventually be run) with '--host=TYPE'. + +Sharing Defaults +================ + + If you want to set default values for 'configure' scripts to share, +you can create a site shell script called 'config.site' that gives +default values for variables like 'CC', 'cache_file', and 'prefix'. +'configure' looks for 'PREFIX/share/config.site' if it exists, then +'PREFIX/etc/config.site' if it exists. Or, you can set the +'CONFIG_SITE' environment variable to the location of the site script. +A warning: not all 'configure' scripts look for a site script. + +Defining Variables +================== + + Variables not defined in a site shell script can be set in the +environment passed to 'configure'. However, some packages may run +configure again during the build, and the customized values of these +variables may be lost. In order to avoid this problem, you should set +them in the 'configure' command line, using 'VAR=value'. For example: + + ./configure CC=/usr/local2/bin/gcc + +causes the specified 'gcc' to be used as the C compiler (unless it is +overridden in the site shell script). + +Unfortunately, this technique does not work for 'CONFIG_SHELL' due to an +Autoconf limitation. Until the limitation is lifted, you can use this +workaround: + + CONFIG_SHELL=/bin/bash ./configure CONFIG_SHELL=/bin/bash + +'configure' Invocation +====================== + + 'configure' recognizes the following options to control how it +operates. 
+ +'--help' +'-h' + Print a summary of all of the options to 'configure', and exit. + +'--help=short' +'--help=recursive' + Print a summary of the options unique to this package's + 'configure', and exit. The 'short' variant lists options used only + in the top level, while the 'recursive' variant lists options also + present in any nested packages. + +'--version' +'-V' + Print the version of Autoconf used to generate the 'configure' + script, and exit. + +'--cache-file=FILE' + Enable the cache: use and save the results of the tests in FILE, + traditionally 'config.cache'. FILE defaults to '/dev/null' to + disable caching. + +'--config-cache' +'-C' + Alias for '--cache-file=config.cache'. + +'--quiet' +'--silent' +'-q' + Do not print messages saying which checks are being made. To + suppress all normal output, redirect it to '/dev/null' (any error + messages will still be shown). + +'--srcdir=DIR' + Look for the package's source code in directory DIR. Usually + 'configure' can determine that directory automatically. + +'--prefix=DIR' + Use DIR as the installation prefix. *note Installation Names:: for + more details, including other options available for fine-tuning the + installation locations. + +'--no-create' +'-n' + Run the configure checks, but stop before creating any output + files. + +'configure' also accepts some other, not widely useful, options. Run +'configure --help' for more details. diff -Nru ucto-0.9.6/install-sh ucto-0.14/install-sh --- ucto-0.9.6/install-sh 2016-09-09 05:06:00.000000000 +0000 +++ ucto-0.14/install-sh 2018-03-25 23:44:27.000000000 +0000 @@ -1,7 +1,7 @@ #!/bin/sh # install - install a program, script, or datafile -scriptversion=2013-12-25.23; # UTC +scriptversion=2014-09-12.12; # UTC # This originates from X11R5 (mit/util/scripts/install.sh), which was # later released in X11R6 (xc/config/util/install.sh) with the @@ -324,34 +324,41 @@ # is incompatible with FreeBSD 'install' when (umask & 300) != 0. ;; *) + # $RANDOM is not portable (e.g. dash); use it when possible to + # lower collision chance tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ - trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0 + trap 'ret=$?; rmdir "$tmpdir/a/b" "$tmpdir/a" "$tmpdir" 2>/dev/null; exit $ret' 0 + # As "mkdir -p" follows symlinks and we work in /tmp possibly; so + # create the $tmpdir first (and fail if unsuccessful) to make sure + # that nobody tries to guess the $tmpdir name. if (umask $mkdir_umask && - exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1 + $mkdirprog $mkdir_mode "$tmpdir" && + exec $mkdirprog $mkdir_mode -p -- "$tmpdir/a/b") >/dev/null 2>&1 then if test -z "$dir_arg" || { # Check for POSIX incompatibilities with -m. # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or # other-writable bit of parent directory when it shouldn't. # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. - ls_ld_tmpdir=`ls -ld "$tmpdir"` + test_tmpdir="$tmpdir/a" + ls_ld_tmpdir=`ls -ld "$test_tmpdir"` case $ls_ld_tmpdir in d????-?r-*) different_mode=700;; d????-?--*) different_mode=755;; *) false;; esac && - $mkdirprog -m$different_mode -p -- "$tmpdir" && { - ls_ld_tmpdir_1=`ls -ld "$tmpdir"` + $mkdirprog -m$different_mode -p -- "$test_tmpdir" && { + ls_ld_tmpdir_1=`ls -ld "$test_tmpdir"` test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" } } then posix_mkdir=: fi - rmdir "$tmpdir/d" "$tmpdir" + rmdir "$tmpdir/a/b" "$tmpdir/a" "$tmpdir" else # Remove any dirs left behind by ancient mkdir implementations. 
- rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null + rmdir ./$mkdir_mode ./-p ./-- "$tmpdir" 2>/dev/null fi trap '' 0;; esac;; diff -Nru ucto-0.9.6/ltmain.sh ucto-0.14/ltmain.sh --- ucto-0.9.6/ltmain.sh 2017-01-12 01:42:28.000000000 +0000 +++ ucto-0.14/ltmain.sh 2016-08-20 12:34:31.000000000 +0000 @@ -31,7 +31,7 @@ PROGRAM=libtool PACKAGE=libtool -VERSION=2.4.6 +VERSION="2.4.6 Debian-2.4.6-2" package_revision=2.4.6 @@ -2068,12 +2068,12 @@ compiler: $LTCC compiler flags: $LTCFLAGS linker: $LD (gnu? $with_gnu_ld) - version: $progname (GNU libtool) 2.4.6 + version: $progname $scriptversion Debian-2.4.6-2 automake: `($AUTOMAKE --version) 2>/dev/null |$SED 1q` autoconf: `($AUTOCONF --version) 2>/dev/null |$SED 1q` Report bugs to . -GNU libtool home page: . +GNU libtool home page: . General help using GNU software: ." exit 0 } @@ -7272,10 +7272,13 @@ # -tp=* Portland pgcc target processor selection # --sysroot=* for sysroot support # -O*, -g*, -flto*, -fwhopr*, -fuse-linker-plugin GCC link-time optimization + # -specs=* GCC specs files # -stdlib=* select c++ std lib with clang + # -fsanitize=* Clang/GCC memory and address sanitizer -64|-mips[0-9]|-r[0-9][0-9]*|-xarch=*|-xtarget=*|+DA*|+DD*|-q*|-m*| \ -t[45]*|-txscale*|-p|-pg|--coverage|-fprofile-*|-F*|@*|-tp=*|--sysroot=*| \ - -O*|-g*|-flto*|-fwhopr*|-fuse-linker-plugin|-fstack-protector*|-stdlib=*) + -O*|-g*|-flto*|-fwhopr*|-fuse-linker-plugin|-fstack-protector*|-stdlib=*| \ + -specs=*|-fsanitize=*) func_quote_for_eval "$arg" arg=$func_quote_for_eval_result func_append compile_command " $arg" @@ -7568,7 +7571,10 @@ case $pass in dlopen) libs=$dlfiles ;; dlpreopen) libs=$dlprefiles ;; - link) libs="$deplibs %DEPLIBS% $dependency_libs" ;; + link) + libs="$deplibs %DEPLIBS%" + test "X$link_all_deplibs" != Xno && libs="$libs $dependency_libs" + ;; esac fi if test lib,dlpreopen = "$linkmode,$pass"; then @@ -7887,19 +7893,19 @@ # It is a libtool convenience library, so add in its objects. func_append convenience " $ladir/$objdir/$old_library" func_append old_convenience " $ladir/$objdir/$old_library" + tmp_libs= + for deplib in $dependency_libs; do + deplibs="$deplib $deplibs" + if $opt_preserve_dup_deps; then + case "$tmp_libs " in + *" $deplib "*) func_append specialdeplibs " $deplib" ;; + esac + fi + func_append tmp_libs " $deplib" + done elif test prog != "$linkmode" && test lib != "$linkmode"; then func_fatal_error "'$lib' is not a convenience library" fi - tmp_libs= - for deplib in $dependency_libs; do - deplibs="$deplib $deplibs" - if $opt_preserve_dup_deps; then - case "$tmp_libs " in - *" $deplib "*) func_append specialdeplibs " $deplib" ;; - esac - fi - func_append tmp_libs " $deplib" - done continue fi # $pass = conv @@ -8823,6 +8829,9 @@ revision=$number_minor lt_irix_increment=no ;; + *) + func_fatal_configuration "$modename: unknown library version type '$version_type'" + ;; esac ;; no) diff -Nru ucto-0.9.6/m4/ac_check_old_textcat.m4 ucto-0.14/m4/ac_check_old_textcat.m4 --- ucto-0.9.6/m4/ac_check_old_textcat.m4 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/m4/ac_check_old_textcat.m4 2018-11-13 16:30:47.000000000 +0000 @@ -0,0 +1,36 @@ +# check_old_textcat_pkg.m4 - Macro to detect 'old' style libtextcat. -*- Autoconf -*- +# serial 1 +# +# Copyright © 2018 Ko van der Sloot +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# AC_CHECK_OLD_TEXTCAT() +# ---------------------------------- +AC_DEFUN([AC_CHECK_OLD_TEXTCAT], + [AC_CHECK_HEADERS( [libtextcat/textcat.h], + [CXXFLAGS="$CXXFLAGS -I$prefix/include/libtextcat" + AC_SEARCH_LIBS( [textcat_Init], + [textcat], + [TEXTCAT_FOUND=1], + [AC_MSG_NOTICE([textcat library not available])])], + [AC_MSG_NOTICE([textcat include file not available])]) + ] +) diff -Nru ucto-0.9.6/m4/ac_osx_pkg.m4 ucto-0.14/m4/ac_osx_pkg.m4 --- ucto-0.9.6/m4/ac_osx_pkg.m4 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/m4/ac_osx_pkg.m4 2018-11-13 16:30:47.000000000 +0000 @@ -0,0 +1,68 @@ +# osx_pkg.m4 - Macros to add OSX brew locations to pkg-config. -*- Autoconf -*- +# serial 1 (pkg-config-0.24) +# +# Copyright © 2018 Ko van der Sloot +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. 
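The ltmain.sh hunk further above lets '-specs=*' and '-fsanitize=*' flags pass through libtool to the link step; a typical way to exercise that when building the package is shown below (the sanitizer flags are illustrative, not taken from this diff):

   ./configure CXXFLAGS="-g -O1 -fsanitize=address" LDFLAGS="-fsanitize=address"
   make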
+ +# AC_OSX_PKG_ALL() +# add all /opt/{package} directories to the PKG_CONFIG search path +# ---------------------------------- +AC_DEFUN([AC_OSX_PKG_ALL], +[ +case ${host_os} in + linux*) + # linux is wellbehaved + ;; + darwin*) + # darwin isn't + for i in `ls /usr/local/opt/` + do + if test -d "/usr/local/opt/$i/lib/pkgconfig" + then + export PKG_CONFIG_PATH="$PKG_CONFIG_PATH:/usr/local/opt/$i/lib/pkgconfig" + fi + done + ;; +esac +]) + +# AC_OSX_PKG_ALL([LIST_OF_PACKAGES]) +# fore every packake in LIST_OF_PACKAGES, add the /opt/{package} directory +# to the PKG_CONFIG search path +# ---------------------------------- +AC_DEFUN([AC_OSX_PKG], +[ +case ${host_os} in + linux*) + # linux is wellbehaved + ;; + darwin*) + # darwin isn't + for i in $* + do + if test -d "/usr/local/opt/$i/lib/pkgconfig" + then + export PKG_CONFIG_PATH="$PKG_CONFIG_PATH:/usr/local/opt/$i/lib/pkgconfig" + fi + done + ;; +esac +]) diff -Nru ucto-0.9.6/m4/ac_search_lm.m4 ucto-0.14/m4/ac_search_lm.m4 --- ucto-0.9.6/m4/ac_search_lm.m4 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/m4/ac_search_lm.m4 2018-11-13 16:30:47.000000000 +0000 @@ -0,0 +1,61 @@ +# check_search_lm.m4 - Macro to locate textcat.lm files. -*- Autoconf -*- +# serial 1 +# +# Copyright © 2018 Ko van der Sloot +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# AC_SEARCH_LM() +# ---------------------------------- +AC_DEFUN([AC_SEARCH_LM], + [tcdirs="/usr/share/libtextcat /usr/share/libexttextcat /usr/local/share/libtextcat /usr/local/share/libexttextcat /usr/local/Cellar/libtextcat/2.2/share/LM " + + for d in $tcdirs + do + if test -f ${d}/nl.lm + then + MODULE_PREFIX=$d + AC_SUBST([MODULE_PREFIX]) + AM_CONDITIONAL([OLD_LM], [test 1 = 0]) + break + fi + done + + if test "x$MODULE_PREFIX" = "x" + then + for d in $tcdirs + do + if test -f ${d}/dutch.lm + then + MODULE_PREFIX=$d + AC_SUBST([MODULE_PREFIX]) + AM_CONDITIONAL([OLD_LM], [test 1 = 1]) + break + fi + done + fi + if test "x$MODULE_PREFIX" = "x" + then + AC_MSG_NOTICE([textcat Language Model files not found. Textcat disabled.]) + else + TEXTCAT_FOUND=1 + fi + + ]) diff -Nru ucto-0.9.6/m4/ax_icu_check.m4 ucto-0.14/m4/ax_icu_check.m4 --- ucto-0.9.6/m4/ax_icu_check.m4 2016-02-11 13:04:11.000000000 +0000 +++ ucto-0.14/m4/ax_icu_check.m4 1970-01-01 00:00:00.000000000 +0000 @@ -1,86 +0,0 @@ -dnl @synopsis AX_ICU_CHECK([version], [action-if], [action-if-not]) -dnl -dnl Test for ICU support -dnl -dnl This will define ICU_LIBS, ICU_CFLAGS, ICU_CXXFLAGS, ICU_IOLIBS. -dnl -dnl Based on ac_check_icu (http://autoconf-archive.cryp.to/ac_check_icu.html) -dnl by Akos Maroy . 
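Roughly, the new AC_OSX_PKG/AC_OSX_PKG_ALL and AC_SEARCH_LM macros above amount to the following manual steps (the Homebrew package name and the model directories shown are examples; the macros themselves probe several candidate locations):

   # what AC_OSX_PKG([icu4c]) arranges on macOS with Homebrew
   export PKG_CONFIG_PATH="$PKG_CONFIG_PATH:/usr/local/opt/icu4c/lib/pkgconfig"

   # what AC_SEARCH_LM looks for: textcat language-model files such as nl.lm or dutch.lm
   ls /usr/share/libexttextcat/*.lm /usr/local/share/libtextcat/*.lm 2>/dev/null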
-dnl -dnl Portions Copyright 2005 Akos Maroy -dnl Copying and distribution of this file, with or without modification, -dnl are permitted in any medium without royalty provided the copyright -dnl notice and this notice are preserved. -dnl -dnl @author Hunter Morris -dnl @version 2008-03-18 -AC_DEFUN([AX_ICU_CHECK], [ - succeeded=no - - if test -z "$ICU_CONFIG"; then - AC_PATH_PROG(ICU_CONFIG, icu-config, no) - fi - - if test "$ICU_CONFIG" = "no" ; then - echo "*** The icu-config script could not be found. Make sure it is" - echo "*** in your path, and that taglib is properly installed." - echo "*** Or see http://www.icu-project.org/" - else - ICU_VERSION=`$ICU_CONFIG --version` - AC_MSG_CHECKING(for ICU >= $1) - VERSION_CHECK=`expr $ICU_VERSION \>\= $1` - if test "$VERSION_CHECK" = "1" ; then - AC_MSG_RESULT(yes) - succeeded=yes - - AC_MSG_CHECKING(ICU_CFLAGS) - ICU_CFLAGS=`$ICU_CONFIG --cflags` - AC_MSG_RESULT($ICU_CFLAGS) - - AC_MSG_CHECKING(ICU_CPPSEARCHPATH) - ICU_CPPSEARCHPATH=`$ICU_CONFIG --cppflags-searchpath` - AC_MSG_RESULT($ICU_CPPSEARCHPATH) - - AC_MSG_CHECKING(ICU_CXXFLAGS) - ICU_CXXFLAGS=`$ICU_CONFIG --cxxflags` - AC_MSG_RESULT($ICU_CXXFLAGS) - - AC_MSG_CHECKING(ICU_LIBS) - ICU_LIBS=`$ICU_CONFIG --ldflags-libsonly` - AC_MSG_RESULT($ICU_LIBS) - - AC_MSG_CHECKING(ICU_LIBPATH) - ICU_LIBPATH=`$ICU_CONFIG --ldflags-searchpath` - AC_MSG_RESULT($ICU_LIBPATH) - - AC_MSG_CHECKING(ICU_IOLIBS) - ICU_IOLIBS=`$ICU_CONFIG --ldflags-icuio` - AC_MSG_RESULT($ICU_IOLIBS) - else - ICU_CFLAGS="" - ICU_CXXFLAGS="" - ICU_CPPSEARCHPATH="" - ICU_LIBPATH="" - ICU_LIBS="" - ICU_IOLIBS="" - ## If we have a custom action on failure, don't print errors, but - ## do set a variable so people can do so. - ifelse([$3], ,echo "can't find ICU >= $1",) - fi - - AC_SUBST(ICU_CFLAGS) - AC_SUBST(ICU_CXXFLAGS) - AC_SUBST(ICU_CPPSEARCHPATH) - AC_SUBST(ICU_VERSION) - AC_SUBST(ICU_LIBPATH) - AC_SUBST(ICU_LIBS) - AC_SUBST(ICU_IOLIBS) - fi - - if test $succeeded = yes; then - ifelse([$2], , :, [$2]) - else - ifelse([$3], , AC_MSG_ERROR([Library requirements (ICU) not met.]), [$3]) - fi -]) - diff -Nru ucto-0.9.6/m4/ax_lib_readline.m4 ucto-0.14/m4/ax_lib_readline.m4 --- ucto-0.9.6/m4/ax_lib_readline.m4 2016-07-30 16:52:30.000000000 +0000 +++ ucto-0.14/m4/ax_lib_readline.m4 2017-10-31 09:15:02.000000000 +0000 @@ -1,5 +1,5 @@ # =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_lib_readline.html +# https://www.gnu.org/software/autoconf-archive/ax_lib_readline.html # =========================================================================== # # SYNOPSIS @@ -58,7 +58,7 @@ # and this notice are preserved. This file is offered as-is, without any # warranty. 
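With the icu-config based ax_icu_check.m4 removed above and only ICU_CFLAGS/ICU_LIBS left in the Makefiles, the ICU flags are presumably picked up through pkg-config instead; the manual equivalent of such a query, using ICU's standard module names rather than anything shown in this diff, would be:

   pkg-config --cflags --libs icu-uc icu-io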
-#serial 6 +#serial 7 AU_ALIAS([VL_LIB_READLINE], [AX_LIB_READLINE]) AC_DEFUN([AX_LIB_READLINE], [ diff -Nru ucto-0.9.6/m4/libtool.m4 ucto-0.14/m4/libtool.m4 --- ucto-0.9.6/m4/libtool.m4 2016-07-30 16:52:30.000000000 +0000 +++ ucto-0.14/m4/libtool.m4 2016-08-10 08:12:25.000000000 +0000 @@ -2887,6 +2887,18 @@ dynamic_linker='GNU/Linux ld.so' ;; +netbsdelf*-gnu) + version_type=linux + need_lib_prefix=no + need_version=no + library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}' + soname_spec='${libname}${release}${shared_ext}$major' + shlibpath_var=LD_LIBRARY_PATH + shlibpath_overrides_runpath=no + hardcode_into_libs=yes + dynamic_linker='NetBSD ld.elf_so' + ;; + netbsd*) version_type=sunos need_lib_prefix=no @@ -3546,7 +3558,7 @@ lt_cv_deplibs_check_method=pass_all ;; -netbsd*) +netbsd* | netbsdelf*-gnu) if echo __ELF__ | $CC -E - | $GREP __ELF__ > /dev/null; then lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so\.[[0-9]]+\.[[0-9]]+|_pic\.a)$' else @@ -4424,7 +4436,7 @@ ;; esac ;; - netbsd*) + netbsd* | netbsdelf*-gnu) ;; *qnx* | *nto*) # QNX uses GNU C++, but need to define -shared option too, otherwise @@ -4936,6 +4948,9 @@ ;; esac ;; + linux* | k*bsd*-gnu | gnu*) + _LT_TAGVAR(link_all_deplibs, $1)=no + ;; *) _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols' ;; @@ -4998,6 +5013,9 @@ openbsd* | bitrig*) with_gnu_ld=no ;; + linux* | k*bsd*-gnu | gnu*) + _LT_TAGVAR(link_all_deplibs, $1)=no + ;; esac _LT_TAGVAR(ld_shlibs, $1)=yes @@ -5252,7 +5270,7 @@ fi ;; - netbsd*) + netbsd* | netbsdelf*-gnu) if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib' wlarc= @@ -5773,6 +5791,7 @@ if test yes = "$lt_cv_irix_exported_symbol"; then _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations $wl-exports_file $wl$export_symbols -o $lib' fi + _LT_TAGVAR(link_all_deplibs, $1)=no else _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -exports_file $export_symbols -o $lib' @@ -5794,7 +5813,7 @@ esac ;; - netbsd*) + netbsd* | netbsdelf*-gnu) if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags' # a.out else diff -Nru ucto-0.9.6/m4/lt~obsolete.m4 ucto-0.14/m4/lt~obsolete.m4 --- ucto-0.9.6/m4/lt~obsolete.m4 2016-07-30 16:52:30.000000000 +0000 +++ ucto-0.14/m4/lt~obsolete.m4 2015-11-25 08:55:54.000000000 +0000 @@ -1,7 +1,6 @@ # lt~obsolete.m4 -- aclocal satisfying obsolete definitions. -*-Autoconf-*- # -# Copyright (C) 2004-2005, 2007, 2009, 2011-2015 Free Software -# Foundation, Inc. +# Copyright (C) 2004, 2005, 2007, 2009 Free Software Foundation, Inc. # Written by Scott James Remnant, 2004. 
# # This file is free software; the Free Software Foundation gives @@ -12,7 +11,7 @@ # These exist entirely to fool aclocal when bootstrapping libtool. # -# In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN), +# In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN) # which have later been changed to m4_define as they aren't part of the # exported API, or moved to Autoconf or Automake where they belong. # @@ -26,7 +25,7 @@ # included after everything else. This provides aclocal with the # AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything # because those macros already exist, or will be overwritten later. -# We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6. +# We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6. # # Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here. # Yes, that means every name once taken will need to remain here until diff -Nru ucto-0.9.6/m4/ltsugar.m4 ucto-0.14/m4/ltsugar.m4 --- ucto-0.9.6/m4/ltsugar.m4 2016-07-30 16:52:30.000000000 +0000 +++ ucto-0.14/m4/ltsugar.m4 2015-11-25 08:55:54.000000000 +0000 @@ -1,7 +1,6 @@ # ltsugar.m4 -- libtool m4 base layer. -*-Autoconf-*- # -# Copyright (C) 2004-2005, 2007-2008, 2011-2015 Free Software -# Foundation, Inc. +# Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc. # Written by Gary V. Vaughan, 2004 # # This file is free software; the Free Software Foundation gives @@ -34,7 +33,7 @@ # ------------ # Manipulate m4 lists. # These macros are necessary as long as will still need to support -# Autoconf-2.59, which quotes differently. +# Autoconf-2.59 which quotes differently. m4_define([lt_car], [[$1]]) m4_define([lt_cdr], [m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])], @@ -45,7 +44,7 @@ # lt_append(MACRO-NAME, STRING, [SEPARATOR]) # ------------------------------------------ -# Redefine MACRO-NAME to hold its former content plus 'SEPARATOR''STRING'. +# Redefine MACRO-NAME to hold its former content plus `SEPARATOR'`STRING'. # Note that neither SEPARATOR nor STRING are expanded; they are appended # to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked). # No SEPARATOR is output if MACRO-NAME was previously undefined (different diff -Nru ucto-0.9.6/m4/Makefile.am ucto-0.14/m4/Makefile.am --- ucto-0.9.6/m4/Makefile.am 2016-02-11 13:04:11.000000000 +0000 +++ ucto-0.14/m4/Makefile.am 2018-11-13 16:30:47.000000000 +0000 @@ -1,4 +1 @@ -# $Id: $ -# $URL: $ - -extra_DIST = ax_icu_check.m4 \ No newline at end of file +extra_DIST = ax_icu_check.m4 diff -Nru ucto-0.9.6/m4/Makefile.in ucto-0.14/m4/Makefile.in --- ucto-0.9.6/m4/Makefile.in 2017-01-23 12:52:24.000000000 +0000 +++ ucto-0.14/m4/Makefile.in 2018-11-20 09:07:54.000000000 +0000 @@ -1,7 +1,7 @@ -# Makefile.in generated by automake 1.15 from Makefile.am. +# Makefile.in generated by automake 1.15.1 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2014 Free Software Foundation, Inc. +# Copyright (C) 1994-2017 Free Software Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -13,9 +13,6 @@ # PARTICULAR PURPOSE. 
@SET_MAKE@ - -# $Id: $ -# $URL: $ VPATH = @srcdir@ am__is_gnu_make = { \ if test -z '$(MAKELEVEL)'; then \ @@ -92,7 +89,9 @@ host_triplet = @host@ subdir = m4 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_old_textcat.m4 \ + $(top_srcdir)/m4/ac_osx_pkg.m4 \ + $(top_srcdir)/m4/ac_search_lm.m4 \ $(top_srcdir)/m4/ax_lib_readline.m4 \ $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ @@ -158,13 +157,7 @@ FGREP = @FGREP@ GREP = @GREP@ ICU_CFLAGS = @ICU_CFLAGS@ -ICU_CONFIG = @ICU_CONFIG@ -ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@ -ICU_CXXFLAGS = @ICU_CXXFLAGS@ -ICU_IOLIBS = @ICU_IOLIBS@ -ICU_LIBPATH = @ICU_LIBPATH@ ICU_LIBS = @ICU_LIBS@ -ICU_VERSION = @ICU_VERSION@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ @@ -182,6 +175,7 @@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ +MODULE_PREFIX = @MODULE_PREFIX@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ @@ -205,6 +199,8 @@ SHELL = @SHELL@ STRIP = @STRIP@ SYSCONFDIR = @SYSCONFDIR@ +TEXTCAT_CFLAGS = @TEXTCAT_CFLAGS@ +TEXTCAT_LIBS = @TEXTCAT_LIBS@ VERSION = @VERSION@ XML2_CFLAGS = @XML2_CFLAGS@ XML2_LIBS = @XML2_LIBS@ @@ -255,6 +251,7 @@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ +runstatedir = @runstatedir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ diff -Nru ucto-0.9.6/m4/pkg.m4 ucto-0.14/m4/pkg.m4 --- ucto-0.9.6/m4/pkg.m4 2016-07-30 16:52:30.000000000 +0000 +++ ucto-0.14/m4/pkg.m4 2015-11-25 08:55:54.000000000 +0000 @@ -1,60 +1,29 @@ -dnl pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- -dnl serial 11 (pkg-config-0.29.1) -dnl -dnl Copyright © 2004 Scott James Remnant . -dnl Copyright © 2012-2015 Dan Nicholson -dnl -dnl This program is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU General Public License as published by -dnl the Free Software Foundation; either version 2 of the License, or -dnl (at your option) any later version. -dnl -dnl This program is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of -dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -dnl General Public License for more details. -dnl -dnl You should have received a copy of the GNU General Public License -dnl along with this program; if not, write to the Free Software -dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA -dnl 02111-1307, USA. -dnl -dnl As a special exception to the GNU General Public License, if you -dnl distribute this file as part of a program that contains a -dnl configuration script generated by Autoconf, you may include it under -dnl the same distribution terms that you use for the rest of that -dnl program. - -dnl PKG_PREREQ(MIN-VERSION) -dnl ----------------------- -dnl Since: 0.29 -dnl -dnl Verify that the version of the pkg-config macros are at least -dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's -dnl installed version of pkg-config, this checks the developer's version -dnl of pkg.m4 when generating configure. 
-dnl -dnl To ensure that this macro is defined, also add: -dnl m4_ifndef([PKG_PREREQ], -dnl [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])]) -dnl -dnl See the "Since" comment for each macro you use to see what version -dnl of the macros you require. -m4_defun([PKG_PREREQ], -[m4_define([PKG_MACROS_VERSION], [0.29.1]) -m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1, - [m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])]) -])dnl PKG_PREREQ - -dnl PKG_PROG_PKG_CONFIG([MIN-VERSION]) -dnl ---------------------------------- -dnl Since: 0.16 -dnl -dnl Search for the pkg-config tool and set the PKG_CONFIG variable to -dnl first found in the path. Checks that the version of pkg-config found -dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is -dnl used since that's the first version where most current features of -dnl pkg-config existed. +# pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- +# serial 1 (pkg-config-0.24) +# +# Copyright © 2004 Scott James Remnant . +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# PKG_PROG_PKG_CONFIG([MIN-VERSION]) +# ---------------------------------- AC_DEFUN([PKG_PROG_PKG_CONFIG], [m4_pattern_forbid([^_?PKG_[A-Z_]+$]) m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$]) @@ -76,19 +45,18 @@ PKG_CONFIG="" fi fi[]dnl -])dnl PKG_PROG_PKG_CONFIG +])# PKG_PROG_PKG_CONFIG -dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -dnl ------------------------------------------------------------------- -dnl Since: 0.18 -dnl -dnl Check to see whether a particular set of modules exists. Similar to -dnl PKG_CHECK_MODULES(), but does not set variables or print errors. -dnl -dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG]) -dnl only at the first occurence in configure.ac, so if the first place -dnl it's called might be skipped (such as if it is within an "if", you -dnl have to call PKG_CHECK_EXISTS manually +# PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) +# +# Check to see whether a particular set of modules exists. Similar +# to PKG_CHECK_MODULES(), but does not set variables or print errors. 
+# +# Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG]) +# only at the first occurence in configure.ac, so if the first place +# it's called might be skipped (such as if it is within an "if", you +# have to call PKG_CHECK_EXISTS manually +# -------------------------------------------------------------- AC_DEFUN([PKG_CHECK_EXISTS], [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl if test -n "$PKG_CONFIG" && \ @@ -98,10 +66,8 @@ $3])dnl fi]) -dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES]) -dnl --------------------------------------------- -dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting -dnl pkg_failed based on the result. +# _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES]) +# --------------------------------------------- m4_define([_PKG_CONFIG], [if test -n "$$1"; then pkg_cv_[]$1="$$1" @@ -113,11 +79,10 @@ else pkg_failed=untried fi[]dnl -])dnl _PKG_CONFIG +])# _PKG_CONFIG -dnl _PKG_SHORT_ERRORS_SUPPORTED -dnl --------------------------- -dnl Internal check to see if pkg-config supports short errors. +# _PKG_SHORT_ERRORS_SUPPORTED +# ----------------------------- AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED], [AC_REQUIRE([PKG_PROG_PKG_CONFIG]) if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then @@ -125,17 +90,19 @@ else _pkg_short_errors_supported=no fi[]dnl -])dnl _PKG_SHORT_ERRORS_SUPPORTED +])# _PKG_SHORT_ERRORS_SUPPORTED -dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], -dnl [ACTION-IF-NOT-FOUND]) -dnl -------------------------------------------------------------- -dnl Since: 0.4.0 -dnl -dnl Note that if there is a possibility the first call to -dnl PKG_CHECK_MODULES might not happen, you should be sure to include an -dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac +# PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], +# [ACTION-IF-NOT-FOUND]) +# +# +# Note that if there is a possibility the first call to +# PKG_CHECK_MODULES might not happen, you should be sure to include an +# explicit call to PKG_PROG_PKG_CONFIG in your configure.ac +# +# +# -------------------------------------------------------------- AC_DEFUN([PKG_CHECK_MODULES], [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl @@ -189,40 +156,16 @@ AC_MSG_RESULT([yes]) $3 fi[]dnl -])dnl PKG_CHECK_MODULES +])# PKG_CHECK_MODULES -dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], -dnl [ACTION-IF-NOT-FOUND]) -dnl --------------------------------------------------------------------- -dnl Since: 0.29 -dnl -dnl Checks for existence of MODULES and gathers its build flags with -dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags -dnl and VARIABLE-PREFIX_LIBS from --libs. -dnl -dnl Note that if there is a possibility the first call to -dnl PKG_CHECK_MODULES_STATIC might not happen, you should be sure to -dnl include an explicit call to PKG_PROG_PKG_CONFIG in your -dnl configure.ac. -AC_DEFUN([PKG_CHECK_MODULES_STATIC], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -_save_PKG_CONFIG=$PKG_CONFIG -PKG_CONFIG="$PKG_CONFIG --static" -PKG_CHECK_MODULES($@) -PKG_CONFIG=$_save_PKG_CONFIG[]dnl -])dnl PKG_CHECK_MODULES_STATIC - - -dnl PKG_INSTALLDIR([DIRECTORY]) -dnl ------------------------- -dnl Since: 0.27 -dnl -dnl Substitutes the variable pkgconfigdir as the location where a module -dnl should install pkg-config .pc files. By default the directory is -dnl $libdir/pkgconfig, but the default can be changed by passing -dnl DIRECTORY. 
The user can override through the --with-pkgconfigdir -dnl parameter. +# PKG_INSTALLDIR(DIRECTORY) +# ------------------------- +# Substitutes the variable pkgconfigdir as the location where a module +# should install pkg-config .pc files. By default the directory is +# $libdir/pkgconfig, but the default can be changed by passing +# DIRECTORY. The user can override through the --with-pkgconfigdir +# parameter. AC_DEFUN([PKG_INSTALLDIR], [m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])]) m4_pushdef([pkg_description], @@ -233,18 +176,16 @@ AC_SUBST([pkgconfigdir], [$with_pkgconfigdir]) m4_popdef([pkg_default]) m4_popdef([pkg_description]) -])dnl PKG_INSTALLDIR +]) dnl PKG_INSTALLDIR -dnl PKG_NOARCH_INSTALLDIR([DIRECTORY]) -dnl -------------------------------- -dnl Since: 0.27 -dnl -dnl Substitutes the variable noarch_pkgconfigdir as the location where a -dnl module should install arch-independent pkg-config .pc files. By -dnl default the directory is $datadir/pkgconfig, but the default can be -dnl changed by passing DIRECTORY. The user can override through the -dnl --with-noarch-pkgconfigdir parameter. +# PKG_NOARCH_INSTALLDIR(DIRECTORY) +# ------------------------- +# Substitutes the variable noarch_pkgconfigdir as the location where a +# module should install arch-independent pkg-config .pc files. By +# default the directory is $datadir/pkgconfig, but the default can be +# changed by passing DIRECTORY. The user can override through the +# --with-noarch-pkgconfigdir parameter. AC_DEFUN([PKG_NOARCH_INSTALLDIR], [m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])]) m4_pushdef([pkg_description], @@ -255,15 +196,13 @@ AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir]) m4_popdef([pkg_default]) m4_popdef([pkg_description]) -])dnl PKG_NOARCH_INSTALLDIR +]) dnl PKG_NOARCH_INSTALLDIR -dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE, -dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -dnl ------------------------------------------- -dnl Since: 0.28 -dnl -dnl Retrieves the value of the pkg-config variable for the given module. +# PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE, +# [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) +# ------------------------------------------- +# Retrieves the value of the pkg-config variable for the given module. AC_DEFUN([PKG_CHECK_VAR], [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl @@ -272,4 +211,4 @@ AS_VAR_COPY([$1], [pkg_cv_][$1]) AS_VAR_IF([$1], [""], [$5], [$4])dnl -])dnl PKG_CHECK_VAR +])# PKG_CHECK_VAR diff -Nru ucto-0.9.6/Makefile.am ucto-0.14/Makefile.am --- ucto-0.9.6/Makefile.am 2016-02-11 13:04:11.000000000 +0000 +++ ucto-0.14/Makefile.am 2018-11-13 16:30:47.000000000 +0000 @@ -2,10 +2,10 @@ SUBDIRS = src include m4 config docs tests -EXTRA_DIST = bootstrap.sh AUTHORS TODO NEWS ucto.pc.in ucto-icu.pc.in +EXTRA_DIST = bootstrap.sh AUTHORS TODO NEWS README.md ucto.pc.in ucto-icu.pc.in codemeta.json pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = ucto.pc ucto-icu.pc +pkgconfig_DATA = ucto.pc ChangeLog: NEWS git pull; git2cl > ChangeLog diff -Nru ucto-0.9.6/Makefile.in ucto-0.14/Makefile.in --- ucto-0.9.6/Makefile.in 2017-01-23 12:52:24.000000000 +0000 +++ ucto-0.14/Makefile.in 2018-11-20 09:07:54.000000000 +0000 @@ -1,7 +1,7 @@ -# Makefile.in generated by automake 1.15 from Makefile.am. +# Makefile.in generated by automake 1.15.1 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2014 Free Software Foundation, Inc. 
+# Copyright (C) 1994-2017 Free Software Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -90,7 +90,9 @@ host_triplet = @host@ subdir = . ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_old_textcat.m4 \ + $(top_srcdir)/m4/ac_osx_pkg.m4 \ + $(top_srcdir)/m4/ac_search_lm.m4 \ $(top_srcdir)/m4/ax_lib_readline.m4 \ $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ @@ -104,7 +106,7 @@ configure.lineno config.status.lineno mkinstalldirs = $(install_sh) -d CONFIG_HEADER = config.h -CONFIG_CLEAN_FILES = ucto.pc ucto-icu.pc +CONFIG_CLEAN_FILES = ucto.pc CONFIG_CLEAN_VPATH_FILES = AM_V_P = $(am__v_P_@AM_V@) am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) @@ -193,9 +195,9 @@ CSCOPE = cscope DIST_SUBDIRS = $(SUBDIRS) am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/config.h.in \ - $(srcdir)/ucto-icu.pc.in $(srcdir)/ucto.pc.in AUTHORS COPYING \ - ChangeLog NEWS README TODO compile config.guess config.sub \ - depcomp install-sh ltmain.sh missing + $(srcdir)/ucto.pc.in AUTHORS COPYING ChangeLog INSTALL NEWS \ + README TODO compile config.guess config.sub depcomp install-sh \ + ltmain.sh missing DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) distdir = $(PACKAGE)-$(VERSION) top_distdir = $(distdir) @@ -269,13 +271,7 @@ FGREP = @FGREP@ GREP = @GREP@ ICU_CFLAGS = @ICU_CFLAGS@ -ICU_CONFIG = @ICU_CONFIG@ -ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@ -ICU_CXXFLAGS = @ICU_CXXFLAGS@ -ICU_IOLIBS = @ICU_IOLIBS@ -ICU_LIBPATH = @ICU_LIBPATH@ ICU_LIBS = @ICU_LIBS@ -ICU_VERSION = @ICU_VERSION@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ @@ -293,6 +289,7 @@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ +MODULE_PREFIX = @MODULE_PREFIX@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ @@ -316,6 +313,8 @@ SHELL = @SHELL@ STRIP = @STRIP@ SYSCONFDIR = @SYSCONFDIR@ +TEXTCAT_CFLAGS = @TEXTCAT_CFLAGS@ +TEXTCAT_LIBS = @TEXTCAT_LIBS@ VERSION = @VERSION@ XML2_CFLAGS = @XML2_CFLAGS@ XML2_LIBS = @XML2_LIBS@ @@ -366,6 +365,7 @@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ +runstatedir = @runstatedir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ @@ -380,9 +380,9 @@ uctodata_LIBS = @uctodata_LIBS@ ACLOCAL_AMFLAGS = -I m4 --install SUBDIRS = src include m4 config docs tests -EXTRA_DIST = bootstrap.sh AUTHORS TODO NEWS ucto.pc.in ucto-icu.pc.in +EXTRA_DIST = bootstrap.sh AUTHORS TODO NEWS README.md ucto.pc.in ucto-icu.pc.in codemeta.json pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = ucto.pc ucto-icu.pc +pkgconfig_DATA = ucto.pc all: config.h $(MAKE) $(AM_MAKEFLAGS) all-recursive @@ -437,8 +437,6 @@ -rm -f config.h stamp-h1 ucto.pc: $(top_builddir)/config.status $(srcdir)/ucto.pc.in cd $(top_builddir) && $(SHELL) ./config.status $@ -ucto-icu.pc: $(top_builddir)/config.status $(srcdir)/ucto-icu.pc.in - cd $(top_builddir) && $(SHELL) ./config.status $@ mostlyclean-libtool: -rm -f *.lo diff -Nru ucto-0.9.6/NEWS ucto-0.14/NEWS --- ucto-0.9.6/NEWS 2017-01-16 11:13:43.000000000 +0000 +++ ucto-0.14/NEWS 2018-11-29 13:41:36.000000000 +0000 @@ -1,3 +1,94 @@ +0.14 2018-11-29 +[Ko van der Sloot] +* updated usage() and removed -S option (never used) +* make sure the right textclass is assigned to nodes in FoLiA +* minor code fixes/refactorings +* 
added more tests +* updated man.1 page + +[Maarten van Gompel] +* updated README.md + +[Iris Hendrickx] +* Updated and extended the manual + +0.13.2 2018-05-17 +[Ko van der Sloot] +Bug fix release: +* uctodata is mandatory. So don't install default rules anymore + +0.13.1 2018-05-17 +[Ko van der Sloot] +Bug fix release: +* configure now finds out the location of the uctodata files. + should make it work on Mac systems too + +0.13 2018-05-16 +[Ko van der Sloot] +* improved configure/build/test +* added a --split option +* fixed -P option +* removed -S option (never used, and only half implemented) +* added a --add-tokens option, to add special tokens for the default language +* generally use the icu:: namespace +* added more tests +* fixed uninitialized variable. +* added code to use an alternative search-path for uctodata + +[Maarten van Gompel] +* added codemeta.json + +0.12 2018-02-19 +[Ko van der Sloot] +* now use the UniFilter Unicode Filter from ticcutils +* now use the UnicodeNormalizer from ticcutils +* improved configuration. Support vor Mac OSX added + +0.11 2017-12-04 +[Ko van der Sloot] +Bug fix release: +* problems with text inside Cell elements + +0.10 2017-11-07 +[Ko van der Sloot] +New release due to outdated files in the previous release. + +0.9.9 2017-11-06 +[Ko van der Sloot] +Minor fix: +* bumped the .so version to 3.0.0 + +0.9.8 2017-10-23 +[Ko van der Sloot] +Bug-fix release +* fixed utterance handling in FoLiA input. Don't try sentence detection! + +0.9.7 2017-10-17 +[Ko van der Sloot] + * added textredundancy option, default is 'minimal' + * small adaptations to work with FoLiA 1.5 specs + - set textclass on words when outputclass != inputclass + - DON'T filter special characters when inputclass == outputclass + * -F (folia input) is automatically set for .xml files + * more robust against texts with embedded tabs, etc. + * more and better tests added + * better logging and error messaging + * improved language handling. TODO: Language detection in FoLiA + * bug fixes: + - correctly handle xml-comment inside a + - better id generation when parent has no id + - better reaction on overly long 'words' + +0.9.6 2017-01-23 +[Maarten van Gompel] +* Moving data files from etc/ to share/, as they are more data files than + configuration files that should be edited. +* Requires uctodata >= 0.4. +* Should solve debian packaging issues (#18) +* Minor updates to the manual (#2) +* Some refactoring/code cleanup, temper expectations regarding ucto's + date-tagging abilities (#16, thanks also to @sanmai-NL) + 0.9.5 2017-01-06 [Ko van der Sloot] Bug fix release: diff -Nru ucto-0.9.6/README ucto-0.14/README --- ucto-0.9.6/README 2017-01-23 12:07:58.000000000 +0000 +++ ucto-0.14/README 2018-11-13 16:30:47.000000000 +0000 @@ -1,113 +1 @@ -[![Build Status](https://travis-ci.org/LanguageMachines/ucto.svg?branch=master)](https://travis-ci.org/LanguageMachines/ucto) [![Language Machines Badge](http://applejack.science.ru.nl/lamabadge.php/ucto)](http://applejack.science.ru.nl/languagemachines/) - -================================ -Ucto - A rule-based tokeniser -================================ - - Centre for Language and Speech technology, Radboud University Nijmegen - Induction of Linguistic Knowledge Research Group, Tilburg University - -Website: https://languagemachines.github.io/ucto/ - -Ucto tokenizes text files: it separates words from punctuation, and splits -sentences. This is one of the first tasks for almost any Natural Language -Processing application. 
Ucto offers several other basic preprocessing steps -such as changing case that you can all use to make your text suited for further -processing such as indexing, part-of-speech tagging, or machine translation. - -Ucto comes with tokenisation rules for several languages (packaged separately) -and can be easily extended to suit other languages. It has been incorporated -for tokenizing Dutch text in Frog (https://languagemachines.github.io/frog), -our Dutch morpho-syntactic processor. - -The software is intended to be used from the command-line by researchers in -Natural Language Processing or related areas, as well as software developers. -An [Ucto python binding](https://github.com/proycon/python-ucto) is also available -separately. - -Features: - -- Comes with tokenization rules for English, Dutch, French, Italian, Turkish, - Spanish, Portuguese and Swedish; easily extendible to other languages. Rules - consists of regular expressions and lists. They are - packaged separately as [uctodata](https://github.com/LanguageMachines/uctodata). -- Recognizes units, currencies, abbreviations, and simple dates and times like dd-mm-yyyy -- Recognizes paired quote spans, sentences, and paragraphs. -- Produces UTF8 encoding and NFC output normalization, optionally accepting - other input encodings as well. -- Ligature normalization (can undo for isntance fi,fl as single codepoints). -- Optional conversion to all lowercase or uppercase. -- Supports [FoLiA XML](https://proycon.github.io/folia) - -Ucto was written by Maarten van Gompel and Ko van der Sloot. Work on Ucto was -funded by NWO, the Netherlands Organisation for Scientific Research, under the -Implicit Linguistics project, the CLARIN-NL program, and the CLARIAH project. - -This software is available under the GNU Public License v3 (see the file -COPYING). - ------------------------------------------------------------- -Installation ------------------------------------------------------------- - -To install ucto, first consult whether your distribution's package manager has an up-to-date package for it. -If not, for easy installation of ucto and all dependencies, it is included as part of our software -distribution [LaMachine](https://proycon.github.io/LaMachine). - -To compile and install manually from source, provided you have all the -dependencies installed: - - $ bash bootstrap.sh - $ ./configure - $ make - $ sudo make install - -You will need current versions of the following dependencies of our software: - -* [ticcutils](https://github.com/LanguageMachine/ticcutils) - A shared utility library -* [libfolia](https://github.com/LanguageMachines/libfolia) - A library for the FoLiA format. -* [uctodata](https://github.com/LanguageMachines/uctodata) - Data files for ucto, packaged separately - -As well as the following 3rd party dependencies: - -* ``icu`` - A C++ library for Unicode and Globalization support. On Debian/Ubuntu systems, install the package libicu-dev. -* ``libxml2`` - An XML library. On Debian/Ubuntu systems install the package libxml2-dev. -* A sane build environment with a C++ compiler (e.g. 
gcc or clang), autotools, libtool, pkg-config - ------------------------------------------------------------- -Usage ------------------------------------------------------------- - -Tokenize an english text file to standard output, tokens will be -space-seperated, sentences delimiter by ````: - - $ ucto -L eng yourfile.txt - -The -L flag specifies the language (as a three letter iso-639-3 code), provided -a configuration file exists for that language. The configurations are provided -separately, for various languages, in the -[uctodata](https://github.com/LanguageMachines/uctodata) package. Note that -older versions of ucto used different two-letter codes, so you may need to -update the way you invoke ucto. - -To output to file instead of standard output, just add another -positional argument with the desired output filename. - -If you want each sentence on a separate line (i.e. newline delimited rather than delimited by -````), then pass the ``-n`` flag. If each sentence is already on one line -in the input and you want to leave it at that, pass the ``-m`` flag. - -Tokenize plaintext to [FoLiA XML](https://proycon.github.io/folia) using the ``-X`` flag, you can specify an ID -for the FoLiA document using the ``--id=`` flag. - - $ ucto -L eng -X --id=hamlet hamlet.txt hamlet.folia.xml - -Note that in the FoLiA XML output, ucto encodes the class of the token (date, url, smiley, etc...) based -on the rule that matched. - -For further documentation consult the [ucto -manual](https://github.com/LanguageMachines/ucto/blob/master/docs/ucto_manual.pdf) -for further documentation. - - - +Please see README.md for more information diff -Nru ucto-0.9.6/README.md ucto-0.14/README.md --- ucto-0.9.6/README.md 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/README.md 2018-11-20 09:07:46.000000000 +0000 @@ -0,0 +1,107 @@ +[![Build Status](https://travis-ci.org/LanguageMachines/ucto.svg?branch=master)](https://travis-ci.org/LanguageMachines/ucto) [![Language Machines Badge](http://applejack.science.ru.nl/lamabadge.php/ucto)](http://applejack.science.ru.nl/languagemachines/) + +Ucto - A rule-based tokeniser +================================ + + Centre for Language and Speech technology, Radboud University Nijmegen + Induction of Linguistic Knowledge Research Group, Tilburg University + +Website: https://languagemachines.github.io/ucto/ + +Ucto tokenizes text files: it separates words from punctuation, and splits +sentences. This is one of the first tasks for almost any Natural Language +Processing application. Ucto offers several other basic preprocessing steps +such as changing case that you can all use to make your text suited for further +processing such as indexing, part-of-speech tagging, or machine translation. + +Ucto comes with tokenisation rules for several languages (packaged separately) +and can be easily extended to suit other languages. It has been incorporated +for tokenizing Dutch text in Frog (https://languagemachines.github.io/frog), +our Dutch morpho-syntactic processor. + +The software is intended to be used from the command-line by researchers in +Natural Language Processing or related areas, as well as software developers. +An [Ucto python binding](https://github.com/proycon/python-ucto) is also available +separately. + +Features: + +- Comes with tokenization rules for English, Dutch, French, Italian, Turkish, + Spanish, Portuguese and Swedish; easily extendible to other languages. Rules + consists of regular expressions and lists. 
They are + packaged separately as [uctodata](https://github.com/LanguageMachines/uctodata). +- Recognizes units, currencies, abbreviations, and simple dates and times like dd-mm-yyyy +- Recognizes paired quote spans, sentences, and paragraphs. +- Produces UTF8 encoding and NFC output normalization, optionally accepting + other input encodings as well. +- Ligature normalization (can undo for instance fi,fl as single codepoints). +- Optional conversion to all lowercase or uppercase. +- Supports [FoLiA XML](https://proycon.github.io/folia) + +Ucto was written by Maarten van Gompel and Ko van der Sloot. Work on Ucto was +funded by NWO, the Netherlands Organisation for Scientific Research, under the +Implicit Linguistics project, the CLARIN-NL program, and the CLARIAH project. + +This software is available under the GNU Public License v3 (see the file +COPYING). + +Installation +------------------------------------------------------------ + +To install ucto, first consult whether your distribution's package manager has an up-to-date package for it. +If not, for easy installation of ucto and all dependencies, it is included as part of our software +distribution [LaMachine](https://proycon.github.io/LaMachine). + +To compile and install manually from source, provided you have all the +dependencies installed: + + $ bash bootstrap.sh + $ ./configure + $ make + $ sudo make install + +You will need current versions of the following dependencies of our software: + +* [ticcutils](https://github.com/LanguageMachine/ticcutils) - A shared utility library +* [libfolia](https://github.com/LanguageMachines/libfolia) - A library for the FoLiA format. +* [uctodata](https://github.com/LanguageMachines/uctodata) - Data files for ucto, packaged separately + +As well as the following 3rd party dependencies: + +* ``icu`` - A C++ library for Unicode and Globalization support. On Debian/Ubuntu systems, install the package libicu-dev. +* ``libxml2`` - An XML library. On Debian/Ubuntu systems install the package libxml2-dev. +* ``libtextcat`` - A language detection package. On Debian/Ubuntu systems it is called libexttextcat-dev. +* A sane build environment with a C++ compiler (e.g. gcc or clang), autotools, libtool, pkg-config + +Usage +------------------------------------------------------------ + +Tokenize an English text file to standard output, tokens will be +space-separated, sentences delimited by ````: + + $ ucto -L eng yourfile.txt + +The -L flag specifies the language (as a three letter iso-639-3 code), provided +a configuration file exists for that language. The configurations are provided +separately, for various languages, in the +[uctodata](https://github.com/LanguageMachines/uctodata) package. Note that +older versions of ucto used different two-letter codes, so you may need to +update the way you invoke ucto. + +To output to file instead of standard output, just add another +positional argument with the desired output filename. + +If you want each sentence on a separate line (i.e. newline delimited rather than delimited by +````), then pass the ``-n`` flag. If each sentence is already on one line +in the input and you want to leave it at that, pass the ``-m`` flag. + +Tokenize plaintext to [FoLiA XML](https://proycon.github.io/folia) using the ``-X`` flag; you can specify an ID +for the FoLiA document using the ``--id=`` flag. + + $ ucto -L eng -X --id=hamlet hamlet.txt hamlet.folia.xml + +Note that in the FoLiA XML output, ucto encodes the class of the token (date, url, smiley, etc...)
based +on the rule that matched. + +For further documentation consult the [ucto +documentation](https://ucto.readthedocs.io/en/latest/). diff -Nru ucto-0.9.6/src/Makefile.am ucto-0.14/src/Makefile.am --- ucto-0.9.6/src/Makefile.am 2017-01-23 12:03:47.000000000 +0000 +++ ucto-0.14/src/Makefile.am 2018-11-13 16:30:47.000000000 +0000 @@ -1,8 +1,5 @@ -# $Id$ -# $URL $ - AM_CPPFLAGS = -I@top_srcdir@/include -AM_CXXFLAGS = -DSYSCONF_PATH=\"$(datadir)\" -std=c++0x # -Weffc++ +AM_CXXFLAGS = -DSYSCONF_PATH=\"$(datadir)\" -std=c++11 -W -Wall -pedantic -g -O3 bin_PROGRAMS = ucto @@ -11,9 +8,9 @@ ucto_SOURCES = ucto.cxx lib_LTLIBRARIES = libucto.la -libucto_la_LDFLAGS = -version-info 2:0:0 +libucto_la_LDFLAGS = -version-info 3:0:0 -libucto_la_SOURCES = textcat.cxx unicode.cxx setting.cxx tokenize.cxx +libucto_la_SOURCES = my_textcat.cxx setting.cxx tokenize.cxx TESTS = tst.sh diff -Nru ucto-0.9.6/src/Makefile.in ucto-0.14/src/Makefile.in --- ucto-0.9.6/src/Makefile.in 2017-01-23 12:52:24.000000000 +0000 +++ ucto-0.14/src/Makefile.in 2018-11-20 09:07:54.000000000 +0000 @@ -1,7 +1,7 @@ -# Makefile.in generated by automake 1.15 from Makefile.am. +# Makefile.in generated by automake 1.15.1 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2014 Free Software Foundation, Inc. +# Copyright (C) 1994-2017 Free Software Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -14,9 +14,6 @@ @SET_MAKE@ -# $Id$ -# $URL $ - VPATH = @srcdir@ am__is_gnu_make = { \ @@ -95,7 +92,9 @@ bin_PROGRAMS = ucto$(EXEEXT) subdir = src ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_old_textcat.m4 \ + $(top_srcdir)/m4/ac_osx_pkg.m4 \ + $(top_srcdir)/m4/ac_search_lm.m4 \ $(top_srcdir)/m4/ax_lib_readline.m4 \ $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ @@ -138,7 +137,7 @@ am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" LTLIBRARIES = $(lib_LTLIBRARIES) libucto_la_LIBADD = -am_libucto_la_OBJECTS = textcat.lo unicode.lo setting.lo tokenize.lo +am_libucto_la_OBJECTS = my_textcat.lo setting.lo tokenize.lo libucto_la_OBJECTS = $(am_libucto_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -423,13 +422,7 @@ FGREP = @FGREP@ GREP = @GREP@ ICU_CFLAGS = @ICU_CFLAGS@ -ICU_CONFIG = @ICU_CONFIG@ -ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@ -ICU_CXXFLAGS = @ICU_CXXFLAGS@ -ICU_IOLIBS = @ICU_IOLIBS@ -ICU_LIBPATH = @ICU_LIBPATH@ ICU_LIBS = @ICU_LIBS@ -ICU_VERSION = @ICU_VERSION@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ @@ -447,6 +440,7 @@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ +MODULE_PREFIX = @MODULE_PREFIX@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ @@ -470,6 +464,8 @@ SHELL = @SHELL@ STRIP = @STRIP@ SYSCONFDIR = @SYSCONFDIR@ +TEXTCAT_CFLAGS = @TEXTCAT_CFLAGS@ +TEXTCAT_LIBS = @TEXTCAT_LIBS@ VERSION = @VERSION@ XML2_CFLAGS = @XML2_CFLAGS@ XML2_LIBS = @XML2_LIBS@ @@ -520,6 +516,7 @@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ +runstatedir = @runstatedir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ @@ -533,12 +530,12 @@ uctodata_CFLAGS = @uctodata_CFLAGS@ uctodata_LIBS = @uctodata_LIBS@ AM_CPPFLAGS = -I@top_srcdir@/include -AM_CXXFLAGS = -DSYSCONF_PATH=\"$(datadir)\" -std=c++0x # -Weffc++ 
+AM_CXXFLAGS = -DSYSCONF_PATH=\"$(datadir)\" -std=c++11 -W -Wall -pedantic -g -O3 LDADD = libucto.la ucto_SOURCES = ucto.cxx lib_LTLIBRARIES = libucto.la -libucto_la_LDFLAGS = -version-info 2:0:0 -libucto_la_SOURCES = textcat.cxx unicode.cxx setting.cxx tokenize.cxx +libucto_la_LDFLAGS = -version-info 3:0:0 +libucto_la_SOURCES = my_textcat.cxx setting.cxx tokenize.cxx TESTS = tst.sh EXTRA_DIST = tst.sh CLEANFILES = tst.out @@ -673,11 +670,10 @@ distclean-compile: -rm -f *.tab.c +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/my_textcat.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/setting.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/textcat.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tokenize.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ucto.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unicode.Plo@am__quote@ .cxx.o: @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< diff -Nru ucto-0.9.6/src/my_textcat.cxx ucto-0.14/src/my_textcat.cxx --- ucto-0.9.6/src/my_textcat.cxx 1970-01-01 00:00:00.000000000 +0000 +++ ucto-0.14/src/my_textcat.cxx 2018-11-13 16:30:47.000000000 +0000 @@ -0,0 +1,90 @@ +/* + Copyright (c) 2018 + CLST - Radboud University + + This file is part of Ucto + + Ucto is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + Ucto is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+ + For questions and suggestions, see: + https://github.com/LanguageMachines/ucto/issues + or send mail to: + lamasoftware (at ) science.ru.nl + +*/ +#include +#include +#include "ticcutils/StringOps.h" +#include "config.h" +#include "ucto/my_textcat.h" + +using namespace std; + +#ifdef HAVE_TEXTCAT +TextCat::~TextCat() { textcat_Done( TC ); } + +TextCat::TextCat( const std::string& cf ) { + TC = textcat_Init( cf.c_str() ); + if ( TC == 0 ){ + throw runtime_error( "TextCat init failed: " + cf ); + } + cfName = cf; + +} + +TextCat::TextCat( const TextCat& in ) { + TC = textcat_Init( in.cfName.c_str() ); + cfName = in.cfName; +} + +vector TextCat::get_languages( const string& in ) const { + vector vals; + char *res = textcat_Classify( TC, in.c_str(), in.size() ); + if ( res && strlen(res) > 0 && strcmp( res, "SHORT" ) != 0 ){ + string val = res; + TiCC::split_at_first_of( val, vals, "[]" ); + } + return vals; +} + +string TextCat::get_language( const string& in ) const { + vector vals = get_languages( in ); + if ( vals.size() > 0 ){ + return vals[0]; + } + else { + return ""; + } +} + +#else +TextCat::~TextCat() {} + +TextCat::TextCat( const std::string& cf ): TC(0) { + throw runtime_error( "TextCat::TextCat(" + cf + "): TextCat Support not available" ); +} + +TextCat::TextCat( const TextCat& in ): TC(0) { + throw runtime_error( "TextCat::TextCat(): TextCat Support not available" ); +} + +vector TextCat::get_languages( const string& in ) const { + throw runtime_error( "TextCat::get_languages(): TextCat Support not available" ); +} + +string TextCat::get_language( const string& in ) const { + throw runtime_error( "TextCat::get_language(): TextCat Support not available" ); +} + +#endif diff -Nru ucto-0.9.6/src/setting.cxx ucto-0.14/src/setting.cxx --- ucto-0.9.6/src/setting.cxx 2017-01-23 12:03:47.000000000 +0000 +++ ucto-0.14/src/setting.cxx 2018-11-15 16:38:29.000000000 +0000 @@ -1,5 +1,5 @@ /* - Copyright (c) 2006 - 2017 + Copyright (c) 2006 - 2018 CLST - Radboud University ILK - Tilburg University @@ -37,17 +37,23 @@ #include "ticcutils/FileUtils.h" #include "ticcutils/PrettyPrint.h" #include "ticcutils/LogStream.h" +#include "ticcutils/Unicode.h" #include "libfolia/folia.h" -#include "ucto/unicode.h" #include "ucto/setting.h" using namespace std; -using namespace TiCC; -#define LOG *Log(theErrLog) +#define LOG *TiCC::Log(theErrLog) + +#ifndef UCTODATA_DIR +#define UCTODATA_DIR string(SYSCONF_PATH) + "/ucto/" +#endif namespace Tokenizer { - string defaultConfigDir = string(SYSCONF_PATH) + "/ucto/"; + + using namespace icu; + + string defaultConfigDir = UCTODATA_DIR; enum ConfigMode { NONE, RULES, ABBREVIATIONS, ATTACHEDPREFIXES, ATTACHEDSUFFIXES, PREFIXES, SUFFIXES, TOKENS, UNITS, @@ -112,12 +118,12 @@ uConfigError( const string& s, const string& f ): invalid_argument( "ucto: " + s + " (" + f + ")" ){}; uConfigError( const UnicodeString& us, const string& f ): - uConfigError( folia::UnicodeToUTF8(us), f ){}; + uConfigError( TiCC::UnicodeToUTF8(us), f ){}; }; class uLogicError: public std::logic_error { public: - uLogicError( const string& s ): logic_error( "ucto: logic error:" + s ){}; + explicit uLogicError( const string& s ): logic_error( "ucto: logic error:" + s ){}; }; ostream& operator<<( ostream& os, const Quoting& q ){ @@ -188,7 +194,7 @@ Rule::Rule( const UnicodeString& _id, const UnicodeString& _pattern): id(_id), pattern(_pattern) { - regexp = new UnicodeRegexMatcher( pattern, id ); + regexp = new TiCC::UnicodeRegexMatcher( pattern, id ); } ostream& operator<< (std::ostream& os, 
const Rule& r ){ @@ -230,12 +236,27 @@ delete rule; } rulesmap.clear(); - delete theErrLog; + } + + set Setting::installed_languages() { + // we only return 'languages' which are installed as 'tokconfig-*' + // + vector files = TiCC::searchFilesMatch( defaultConfigDir, "tokconfig-*" ); + set result; + for ( auto const& f : files ){ + string base = TiCC::basename(f); + size_t pos = base.find("tokconfig-"); + if ( pos == 0 ){ + string lang = base.substr( 10 ); + result.insert( lang ); + } + } + return result; } bool Setting::readrules( const string& fname ){ if ( tokDebug > 0 ){ - *theErrLog << "%include " << fname << endl; + LOG << "%include " << fname << endl; } ifstream f( fname ); if ( !f ){ @@ -243,12 +264,12 @@ } else { string rawline; - while ( getline(f,rawline) ){ - UnicodeString line = folia::UTF8ToUnicode(rawline); + while ( getline( f, rawline ) ){ + UnicodeString line = TiCC::UnicodeFromUTF8(rawline); line.trim(); if ((line.length() > 0) && (line[0] != '#')) { if ( tokDebug >= 5 ){ - *theErrLog << "include line = " << rawline << endl; + LOG << "include line = " << rawline << endl; } const int splitpoint = line.indexOf("="); if ( splitpoint < 0 ){ @@ -266,14 +287,14 @@ bool Setting::readfilters( const string& fname ){ if ( tokDebug > 0 ){ - *theErrLog << "%include " << fname << endl; + LOG << "%include " << fname << endl; } return filter.fill( fname ); } bool Setting::readquotes( const string& fname ){ if ( tokDebug > 0 ){ - *theErrLog << "%include " << fname << endl; + LOG << "%include " << fname << endl; } ifstream f( fname ); if ( !f ){ @@ -281,12 +302,12 @@ } else { string rawline; - while ( getline(f,rawline) ){ - UnicodeString line = folia::UTF8ToUnicode(rawline); + while ( getline( f, rawline ) ){ + UnicodeString line = TiCC::UnicodeFromUTF8(rawline); line.trim(); if ((line.length() > 0) && (line[0] != '#')) { if ( tokDebug >= 5 ){ - *theErrLog << "include line = " << rawline << endl; + LOG << "include line = " << rawline << endl; } int splitpoint = line.indexOf(" "); if ( splitpoint == -1 ) @@ -314,7 +335,7 @@ bool Setting::readeosmarkers( const string& fname ){ if ( tokDebug > 0 ){ - *theErrLog << "%include " << fname << endl; + LOG << "%include " << fname << endl; } ifstream f( fname ); if ( !f ){ @@ -322,12 +343,12 @@ } else { string rawline; - while ( getline(f,rawline) ){ - UnicodeString line = folia::UTF8ToUnicode(rawline); + while ( getline( f, rawline ) ){ + UnicodeString line = TiCC::UnicodeFromUTF8(rawline); line.trim(); if ((line.length() > 0) && (line[0] != '#')) { if ( tokDebug >= 5 ){ - *theErrLog << "include line = " << rawline << endl; + LOG << "include line = " << rawline << endl; } if ( ( line.startsWith("\\u") && line.length() == 6 ) || ( line.startsWith("\\U") && line.length() == 10 ) ){ @@ -343,10 +364,40 @@ return true; } + UnicodeString escape_regex( const UnicodeString& entry ){ + UnicodeString result; + for ( int i=0; i < entry.length(); ++i ){ + switch ( entry[i] ){ + case '?': + case '^': + case '$': + case '[': + case ']': + case '(': + case ')': + case '{': + case '}': + case '*': + case '.': + case '+': + case '|': + case '-': + if ( i == 0 || entry[i-1] != '\\' ){ + // not escaped + result += "\\"; + } + // fallthrough + default: + result += entry[i]; + } + } + return result; + } + bool Setting::readabbreviations( const string& fname, UnicodeString& abbreviations ){ if ( tokDebug > 0 ){ - *theErrLog << "%include " << fname << endl; + LOG << "%include " << fname << endl; } ifstream f( fname ); if ( !f ){ @@ -354,15 +405,17 @@ } else { string 
rawline; - while ( getline(f,rawline) ){ - UnicodeString line = folia::UTF8ToUnicode(rawline); + while ( getline( f, rawline ) ){ + UnicodeString line = TiCC::UnicodeFromUTF8(rawline); line.trim(); if ((line.length() > 0) && (line[0] != '#')) { if ( tokDebug >= 5 ){ - *theErrLog << "include line = " << rawline << endl; + LOG << "include line = " << rawline << endl; } - if ( !abbreviations.isEmpty()) + line = escape_regex( line ); + if ( !abbreviations.isEmpty()){ abbreviations += '|'; + } abbreviations += line; } } @@ -438,7 +491,7 @@ UnicodeString &line, const string& fn ){ try { - UnicodeRegexMatcher m( "\\s+" ); + TiCC::UnicodeRegexMatcher m( "\\s+" ); vector usv; m.split( line, usv ); for ( const auto& us : usv ){ @@ -459,7 +512,7 @@ void split( const string& version, int& major, int& minor, string& sub ){ vector parts; - size_t num = split_at( version, parts, "." ); + size_t num = TiCC::split_at( version, parts, "." ); major = 0; minor = 0; sub.clear(); @@ -497,7 +550,8 @@ } bool Setting::read( const string& settings_name, - int dbg, LogStream* ls ) { + const string& add_tokens, + int dbg, TiCC::LogStream* ls ) { tokDebug = dbg; theErrLog = ls; map pattern = { { ABBREVIATIONS, "" }, @@ -512,6 +566,14 @@ vector meta_rules; string conffile = get_filename( settings_name ); + if ( !TiCC::isFile( conffile ) ){ + LOG << "Unable to open configfile: " << conffile << endl; + return false; + } + if ( !add_tokens.empty() && !TiCC::isFile( add_tokens ) ){ + LOG << "Unable to open additional tokens file: " << add_tokens << endl; + return false; + } ifstream f( conffile ); if ( f ){ ConfigMode mode = NONE; @@ -521,7 +583,7 @@ } int rule_count = 0; string rawline; - while ( getline(f,rawline) ){ + while ( getline( f, rawline ) ){ if ( rawline.find( "%include" ) != string::npos ){ string file = rawline.substr( 9 ); switch ( mode ){ @@ -572,7 +634,7 @@ continue; } - UnicodeString line = folia::UTF8ToUnicode(rawline); + UnicodeString line = TiCC::UnicodeFromUTF8(rawline); line.trim(); if ((line.length() > 0) && (line[0] != '#')) { if (line[0] == '[') { @@ -599,7 +661,7 @@ rule_count, line, set_file ); break; case METARULES: - meta_rules.push_back( folia::UnicodeToUTF8(line) ); + meta_rules.push_back( TiCC::UnicodeToUTF8(line) ); break; case ABBREVIATIONS: case ATTACHEDPREFIXES: @@ -652,7 +714,7 @@ break; case NONE: { vector parts; - split_at( rawline, parts, "=" ); + TiCC::split_at( rawline, parts, "=" ); if ( parts.size() == 2 ) { if ( parts[0] == "version" ){ version = parts[1]; @@ -661,7 +723,7 @@ } break; default: - throw uLogicError("unhandled case in switch"); + throw uLogicError( "unhandled case in switch" ); } } } @@ -678,7 +740,20 @@ quotes.add( "“„‟", "”" ); } - string split = "%"; + if ( !add_tokens.empty() ){ + ifstream adt( add_tokens ); + string line; + while ( getline( adt, line ) ){ + UnicodeString entry = TiCC::UnicodeFromUTF8(line); + entry = escape_regex( entry ); + if ( !entry.isEmpty() ){ + if ( !pattern[TOKENS].isEmpty() ){ + pattern[TOKENS] += '|'; + } + pattern[TOKENS] += entry; + } + } + } // Create Rules for every pattern that is set // first the meta rules... 
for ( const auto& mr : meta_rules ){ @@ -688,6 +763,7 @@ set_file ); } string nam = TiCC::trim( mr.substr( 0, pos ) ); + string split = "%"; if ( nam == "SPLITTER" ){ split = mr.substr( pos+1 ); if ( split.empty() ) { @@ -702,7 +778,7 @@ } continue; } - UnicodeString name = folia::UTF8ToUnicode( nam ); + UnicodeString name = TiCC::UnicodeFromUTF8( nam ); string rule = mr.substr( pos+1 ); if ( tokDebug > 5 ){ LOG << "SPLIT using: '" << split << "'" << endl; @@ -716,7 +792,7 @@ vector undef_parts; bool skip_rule = false; for ( const auto& part : parts ){ - UnicodeString meta = folia::UTF8ToUnicode( part ); + UnicodeString meta = TiCC::UnicodeFromUTF8( part ); ConfigMode mode = getMode( "[" + meta + "]" ); switch ( mode ){ case ORDINALS: @@ -738,7 +814,7 @@ break; case NONE: default: - new_parts.push_back( folia::UTF8ToUnicode(part) ); + new_parts.push_back( TiCC::UnicodeFromUTF8(part) ); break; } } @@ -784,7 +860,11 @@ } LOG << "EOS markers: " << eosmarkers << endl; LOG << "Quotations: " << quotes << endl; - LOG << "Filter: " << filter << endl; + try { + LOG << "Filter: " << filter << endl; + } + catch (...){ + } } return true; } diff -Nru ucto-0.9.6/src/textcat.cxx ucto-0.14/src/textcat.cxx --- ucto-0.9.6/src/textcat.cxx 2017-01-12 13:39:38.000000000 +0000 +++ ucto-0.14/src/textcat.cxx 1970-01-01 00:00:00.000000000 +0000 @@ -1,90 +0,0 @@ -/* - Copyright (c) 2017 - CLST - Radboud University - - This file is part of Ucto - - Ucto is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - Ucto is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
- - For questions and suggestions, see: - https://github.com/LanguageMachines/ucto/issues - or send mail to: - lamasoftware (at ) science.ru.nl - -*/ -#include -#include -#include "ticcutils/StringOps.h" -#include "config.h" -#include "ucto/textcat.h" - -using namespace std; - -#ifdef ENABLE_TEXTCAT -TextCat::~TextCat() { textcat_Done( TC ); } - -TextCat::TextCat( const std::string& cf ) { - TC = textcat_Init( cf.c_str() ); - if ( TC == 0 ){ - throw runtime_error( "TextCat init failed: " + cf ); - } - cfName = cf; - -} - -TextCat::TextCat( const TextCat& in ) { - TC = textcat_Init( in.cfName.c_str() ); - cfName = in.cfName; -} - -vector TextCat::get_languages( const string& in ) const { - vector vals; - char *res = textcat_Classify( TC, in.c_str(), in.size() ); - if ( res && strlen(res) > 0 && strcmp( res, "SHORT" ) != 0 ){ - string val = res; - TiCC::split_at_first_of( val, vals, "[]" ); - } - return vals; -} - -string TextCat::get_language( const string& in ) const { - vector vals = get_languages( in ); - if ( vals.size() > 0 ){ - return vals[0]; - } - else { - return ""; - } -} - -#else -TextCat::~TextCat() {} - -TextCat::TextCat( const std::string& cf ) { - throw runtime_error( "TextCat::TextCat(" + cf + "): TextCat Support not available" ); -} - -TextCat::TextCat( const TextCat& in ) { - throw runtime_error( "TextCat::TextCat(): TextCat Support not available" ); -} - -vector TextCat::get_languages( const string& in ) const { - throw runtime_error( "TextCat::get_languages(): TextCat Support not available" ); -} - -string TextCat::get_language( const string& in ) const { - throw runtime_error( "TextCat::get_language(): TextCat Support not available" ); -} - -#endif diff -Nru ucto-0.9.6/src/tokenize.cxx ucto-0.14/src/tokenize.cxx --- ucto-0.9.6/src/tokenize.cxx 2017-01-23 12:03:47.000000000 +0000 +++ ucto-0.14/src/tokenize.cxx 2018-11-28 15:23:08.000000000 +0000 @@ -1,5 +1,5 @@ /* - Copyright (c) 2006 - 2017 + Copyright (c) 2006 - 2018 CLST - Radboud University ILK - Tilburg University @@ -25,19 +25,19 @@ */ +#include "ucto/tokenize.h" + #include #include #include #include #include "config.h" #include "unicode/schriter.h" +#include "unicode/ucnv.h" #include "ticcutils/StringOps.h" #include "ticcutils/PrettyPrint.h" -#include "libfolia/folia.h" -#include "ucto/unicode.h" -#include "ucto/textcat.h" -#include "ucto/setting.h" -#include "ucto/tokenize.h" +#include "ticcutils/Unicode.h" +#include "ucto/my_textcat.h" #define DO_READLINE #ifdef HAVE_LIBREADLINE @@ -61,12 +61,13 @@ #endif /* HAVE_READLINE_HISTORY */ using namespace std; -using namespace TiCC; -#define LOG *Log(theErrLog) +#define LOG *TiCC::Log(theErrLog) namespace Tokenizer { + using namespace icu; + const string ISO_SET = "http://raw.github.com/proycon/folia/master/setdefinitions/iso639_3.foliaset"; std::string Version() { return VERSION; } @@ -74,17 +75,17 @@ class uRangeError: public std::out_of_range { public: - uRangeError( const string& s ): out_of_range( "ucto: out of range:" + s ){}; + explicit uRangeError( const string& s ): out_of_range( "ucto: out of range:" + s ){}; }; class uLogicError: public std::logic_error { public: - uLogicError( const string& s ): logic_error( "ucto: logic error:" + s ){}; + explicit uLogicError( const string& s ): logic_error( "ucto: logic error:" + s ){}; }; class uCodingError: public std::runtime_error { public: - uCodingError( const string& s ): runtime_error( "ucto: coding problem:" + s ){}; + explicit uCodingError( const string& s ): runtime_error( "ucto: coding problem:" + s ){}; }; @@ 
-111,6 +112,7 @@ const UnicodeString type_space = "SPACE"; const UnicodeString type_currency = "CURRENCY"; const UnicodeString type_emoticon = "EMOTICON"; + const UnicodeString type_picto = "PICTOGRAM"; const UnicodeString type_word = "WORD"; const UnicodeString type_symbol = "SYMBOL"; const UnicodeString type_punctuation = "PUNCTUATION"; @@ -120,11 +122,13 @@ Token::Token( const UnicodeString& _type, const UnicodeString& _s, TokenRole _role, const string& _lc ): - type(_type), us(_s), role(_role), lc(_lc) {} + type(_type), us(_s), role(_role), lc(_lc) { + // cerr << "Created " << *this << endl; + } - std::string Token::texttostring() { return folia::UnicodeToUTF8(us); } - std::string Token::typetostring() { return folia::UnicodeToUTF8(type); } + std::string Token::texttostring() { return TiCC::UnicodeToUTF8(us); } + std::string Token::typetostring() { return TiCC::UnicodeToUTF8(type); } ostream& operator<< (std::ostream& os, const Token& t ){ os << t.type << " : " << t.role << ":" << t.us; @@ -147,33 +151,49 @@ eosmark(""), tokDebug(0), verbose(false), - detectBounds(true), detectQuotes(false), doFilter(true), doPunctFilter(false), + splitOnly( false ), detectPar(true), paragraphsignal(true), + doDetectLang(false), + text_redundancy("minimal"), sentenceperlineoutput(false), sentenceperlineinput(false), lowercase(false), uppercase(false), xmlout(false), + xmlin(false), passthru(false), inputclass("current"), outputclass("current"), tc( 0 ) { - theErrLog = new TiCC::LogStream(cerr); - theErrLog->setstamp( NoStamp ); -#ifdef ENABLE_TEXTCAT + theErrLog = new TiCC::LogStream(cerr, "ucto" ); + theErrLog->setstamp( StampMessage ); +#ifdef HAVE_TEXTCAT string textcat_cfg = string(SYSCONF_PATH) + "/ucto/textcat.cfg"; tc = new TextCat( textcat_cfg ); #endif } TokenizerClass::~TokenizerClass(){ - // delete setting; + Setting *d = 0; + for ( const auto& s : settings ){ + if ( s.first == "default" ){ + // the 'default' may also return as a real 'language' + // avoud delettng it twice + d = s.second; + delete d; + } + if ( s.second != d ){ + delete s.second; + } + + } delete theErrLog; + delete tc; } bool TokenizerClass::reset( const string& lang ){ @@ -183,10 +203,9 @@ } bool TokenizerClass::setNormSet( const std::string& values ){ - vector parts; - TiCC::split_at( values, parts, "," ); + vector parts = TiCC::split_at( values, "," ); for ( const auto& val : parts ){ - norm_set.insert( folia::UTF8ToUnicode( val ) ); + norm_set.insert( TiCC::UnicodeFromUTF8( val ) ); } return true; } @@ -204,6 +223,18 @@ return old; } + string TokenizerClass::setTextRedundancy( const std::string& tr ){ + if ( tr == "none" || tr == "minimal" || tr == "full" ){ + string s = text_redundancy; + text_redundancy = tr; + return s; + } + else { + throw runtime_error( "illegal value '" + tr + "' for textredundancy. " + "expected 'full', 'minimal' or 'none'." ); + } + } + void stripCR( string& s ){ string::size_type pos = s.rfind( '\r' ); if ( pos != string::npos ){ @@ -211,6 +242,63 @@ } } + void TokenizerClass::extractSentencesAndFlush( int numS, + vector& outputTokens, + const string& lang ){ + int count = 0; + const int size = tokens.size(); + short quotelevel = 0; + size_t begin = 0; + size_t end = 0; + for ( int i = 0; i < size; ++i ) { + if (tokens[i].role & NEWPARAGRAPH) { + quotelevel = 0; + } + else if (tokens[i].role & ENDQUOTE) { + --quotelevel; + } + if ( (tokens[i].role & BEGINOFSENTENCE) + && (quotelevel == 0)) { + begin = i; + } + //FBK: QUOTELEVEL GOES UP BEFORE begin IS UPDATED... 
RESULTS IN DUPLICATE OUTPUT + if (tokens[i].role & BEGINQUOTE) { + ++quotelevel; + } + if ((tokens[i].role & ENDOFSENTENCE) && (quotelevel == 0)) { + end = i+1; + tokens[begin].role |= BEGINOFSENTENCE; //sanity check + if (tokDebug >= 1){ + LOG << "[tokenize] extracted sentence " << count << ", begin="<quotes.emptyStack() ) { + settings[lang]->quotes.flushStack( end ); + } + } + //After flushing, the first token still in buffer (if any) is always a BEGINOFSENTENCE: + if (!tokens.empty()) { + tokens[0].role |= BEGINOFSENTENCE; + } + return; + } + } + } + if ( count < numS ){ + throw uRangeError( "Not enough sentences exists in the buffer: (" + + toString( count ) + " found. " + toString( numS) + + " wanted)" ); + } + } + vector TokenizerClass::tokenizeStream( istream& IN, const string& lang ) { vector outputTokens; @@ -267,7 +355,7 @@ passthruLine( input_line, bos ); } else { - string language; + string language = "default"; if ( tc ){ if ( tokDebug > 3 ){ LOG << "use textCat to guess language from: " @@ -275,7 +363,7 @@ } UnicodeString temp = input_line; temp.toLower(); - string lan = tc->get_language( folia::UnicodeToUTF8(temp) ); + string lan = tc->get_language( TiCC::UnicodeToUTF8(temp) ); if ( settings.find( lan ) != settings.end() ){ if ( tokDebug > 3 ){ LOG << "found a supported language: " << lan << endl; @@ -289,7 +377,7 @@ } language = lan; } - tokenizeLine( input_line, language ); + tokenizeLine( input_line, language, "" ); } numS = countSentences(); //count full sentences in token buffer } @@ -297,15 +385,7 @@ if ( tokDebug > 0 ){ LOG << "[tokenize] " << numS << " sentence(s) in buffer, processing..." << endl; } - for (int i = 0; i < numS; i++) { - vector v = getSentence( i ); - outputTokens.insert( outputTokens.end(), v.begin(), v.end() ); - } - // clear processed sentences from buffer - if ( tokDebug > 0 ){ - LOG << "[tokenize] flushing " << numS << " sentence(s) from buffer..." << endl; - } - flushSentences(numS, lang ); + extractSentencesAndFlush( numS, outputTokens, lang ); return outputTokens; } else { @@ -346,7 +426,7 @@ } stripCR( line ); if ( sentenceperlineinput ) - line += string(" ") + folia::UnicodeToUTF8(eosmark); + line += string(" ") + TiCC::UnicodeToUTF8(eosmark); if ( (done) || (line.empty()) ){ signalParagraph(); numS = countSentences(true); //count full sentences in token buffer, force buffer to empty! 
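[editor's note, not part of the patch] The tokenizeStream() hunk above lower-cases each input line and asks the TextCat wrapper (the new src/my_textcat.cxx introduced earlier in this diff) for a language guess before picking a settings object. The standalone sketch below illustrates that call pattern; it assumes ucto was built with libexttextcat support (HAVE_TEXTCAT), and the install path is only illustrative, since ucto itself derives it from SYSCONF_PATH ($(datadir)) plus "/ucto/textcat.cfg":

    // Standalone sketch (not from the ucto sources) of driving the TextCat wrapper.
    #include <iostream>
    #include <stdexcept>
    #include <string>
    #include "ucto/my_textcat.h"

    int main() {
      try {
        // Illustrative path only; ucto builds it as SYSCONF_PATH + "/ucto/textcat.cfg".
        TextCat tc( "/usr/local/share/ucto/textcat.cfg" );
        // tokenizeStream() lower-cases the line before classification.
        std::string guess = tc.get_language( "dit is een korte nederlandse zin." );
        std::cout << ( guess.empty() ? "no guess" : guess ) << std::endl;
      }
      catch ( const std::runtime_error& e ) {
        // Thrown when the configuration cannot be loaded, or when ucto was
        // compiled without libexttextcat (the stub constructor always throws).
        std::cerr << e.what() << std::endl;
      }
      return 0;
    }

The fallback branch matters in practice: the TokenizerClass constructor only allocates a TextCat inside #ifdef HAVE_TEXTCAT (tc stays 0 otherwise), so per-line language guessing is silently skipped when the library is absent.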
@@ -355,7 +435,7 @@ if ( passthru ) passthruLine( line, bos ); else - tokenizeLine( line ); + tokenizeLine( line, lang ); numS = countSentences(); //count full sentences in token buffer } if ( numS > 0 ) { @@ -385,7 +465,7 @@ folia::Document *TokenizerClass::tokenize( istream& IN ) { inputEncoding = checkBOM( IN ); folia::Document *doc = new folia::Document( "id='" + docid + "'" ); - if ( default_language != "none" ){ + if ( /*doDetectLang &&*/ default_language != "none" ){ if ( tokDebug > 0 ){ LOG << "[tokenize](stream): SET document language=" << default_language << endl; } @@ -396,16 +476,23 @@ int parCount = 0; vector buffer; do { - vector v = tokenizeStream( IN ); - for ( auto const& token : v ) { - if ( token.role & NEWPARAGRAPH) { - //process the buffer - parCount = outputTokensXML( root, buffer, parCount ); - buffer.clear(); - } - buffer.push_back( token ); + if ( tokDebug > 0 ){ + LOG << "[tokenize] looping on stream" << endl; + } + vector v = tokenizeStream( IN ); + for ( auto const& token : v ) { + if ( token.role & NEWPARAGRAPH) { + //process the buffer + parCount = outputTokensXML( root, buffer, parCount ); + buffer.clear(); } - } while ( IN ); + buffer.push_back( token ); + } + } + while ( IN ); + if ( tokDebug > 0 ){ + LOG << "[tokenize] end of stream reached" << endl; + } if (!buffer.empty()){ outputTokensXML( root, buffer, parCount); } @@ -427,8 +514,8 @@ else { IN = new ifstream( ifile ); if ( !IN || !IN->good() ){ - cerr << "Error: problems opening inputfile " << ifile << endl; - cerr << "Courageously refusing to start..." << endl; + cerr << "ucto: problems opening inputfile " << ifile << endl; + cerr << "ucto: Courageously refusing to start..." << endl; throw runtime_error( "unable to find or read file: '" + ifile + "'" ); } } @@ -437,6 +524,11 @@ else { folia::Document doc; doc.readFromFile(ifile); + if ( xmlin && inputclass == outputclass ){ + LOG << "ucto: --filter=NO is automatically set. inputclass equals outputclass!" + << endl; + setFiltering(false); + } this->tokenize(doc); *OUT << doc << endl; } @@ -490,30 +582,50 @@ int i = 0; inputEncoding = checkBOM( IN ); do { + if ( tokDebug > 0 ){ + LOG << "[tokenize] looping on stream" << endl; + } vector v = tokenizeStream( IN ); if ( !v.empty() ) { outputTokens( OUT, v , (i>0) ); } ++i; } while ( IN ); + if ( tokDebug > 0 ){ + LOG << "[tokenize] end_of_stream" << endl; + } OUT << endl; } } bool TokenizerClass::tokenize( folia::Document& doc ) { + xmlin = true; // tautology if ( tokDebug >= 2 ){ LOG << "tokenize doc " << doc << endl; } - string lan = doc.doc()->language(); - if ( lan.empty() && default_language != "none" ){ - if ( tokDebug > 1 ){ - LOG << "[tokenize](FoLiA) SET document language=" << default_language << endl; - } - doc.set_metadata( "language", default_language ); + if ( inputclass == outputclass ){ + LOG << "ucto: --filter=NO is automatically set. inputclass equals outputclass!" 
+ << endl; + setFiltering(false); } - else { - if ( tokDebug >= 2 ){ - LOG << "[tokenize](FoLiA) Document has language " << lan << endl; + if ( true /*doDetectLang*/ ){ + string lan = doc.doc()->language(); + if ( lan.empty() && default_language != "none" ){ + if ( tokDebug > 1 ){ + LOG << "[tokenize](FoLiA) SET document language=" << default_language << endl; + } + if ( doc.metadatatype() == "native" ){ + doc.set_metadata( "language", default_language ); + } + else { + LOG << "[WARNING] cannot set the language on FoLiA documents of type " + << doc.metadatatype() << endl; + } + } + else { + if ( tokDebug >= 2 ){ + LOG << "[tokenize](FoLiA) Document has language " << lan << endl; + } } } for ( size_t i = 0; i < doc.doc()->size(); i++) { @@ -527,25 +639,59 @@ void appendText( folia::FoliaElement *root, const string& outputclass ){ - // cerr << endl << "appendText:" << root->id() << endl; + // set the textcontent of root to that of it's children if ( root->hastext( outputclass ) ){ + // there is already text, bail out. + return; + } + if ( root->isSubClass( folia::Linebreak_t ) ){ + // exception return; } UnicodeString utxt = root->text( outputclass, false, false ); - // cerr << "untok: '" << utxt << "'" << endl; - // UnicodeString txt = root->text( outputclass, true ); - // cerr << " tok: '" << txt << "'" << endl; - root->settext( folia::UnicodeToUTF8(utxt), outputclass ); + // so get Untokenized text from the children, and set it + root->settext( TiCC::UnicodeToUTF8(utxt), outputclass ); + } + + void removeText( folia::FoliaElement *root, + const string& outputclass ){ + // remove the textcontent in outputclass of root + root->cleartextcontent( outputclass ); + } + + const string get_language( folia::FoliaElement *f ) { + // get the language of this element, if any, don't look up. + // we search in ALL possible sets! + string st = ""; + std::set exclude; + vector v + = f->select( st, exclude, false ); + string result; + if ( v.size() > 0 ){ + result = v[0]->cls(); + } + return result; } + void set_language( folia::FoliaElement* e, const string& lan ){ + // set or reset the language: append a LangAnnotation child of class 'lan' + folia::KWargs args; + args["class"] = lan; + args["set"] = ISO_SET; + folia::LangAnnotation *node = new folia::LangAnnotation( e->doc() ); + node->setAttributes( args ); + e->replace( node ); + } - void TokenizerClass::tokenizeElement(folia::FoliaElement * element) { + void TokenizerClass::tokenizeElement( folia::FoliaElement * element) { if ( element->isinstance(folia::Word_t) || element->isinstance(folia::TextContent_t)) // shortcut return; if ( tokDebug >= 2 ){ - LOG << "[tokenizeElement] Processing FoLiA element " << element->id() << endl; + LOG << "[tokenizeElement] Processing FoLiA element " << element->xmltag() + << "(" << element->id() << ")" << endl; + LOG << "[tokenizeElement] inputclass=" << inputclass << " outputclass=" << outputclass << endl; } if ( element->hastext( inputclass ) ) { // We have an element which contains text. That's nice @@ -597,15 +743,36 @@ } } // now let's check our language - string lan = element->language(); // remember thus recurses upward - // to get a language from the node, it's parents OR the doc - if ( lan.empty() || default_language == "none" ){ - lan = "default"; + string lan; + if ( doDetectLang ){ + lan = get_language( element ); // is there a local element language? + if ( lan.empty() ){ + // no, so try to detect it! 
+ UnicodeString temp = element->text( inputclass ); + temp.toLower(); + lan = tc->get_language( TiCC::UnicodeToUTF8(temp) ); + if ( lan.empty() ){ + // too bad + lan = "default"; + } + else { + if ( tokDebug >= 2 ){ + LOG << "[tokenizeElement] textcat found a supported language: " << lan << endl; + } + } + } + } + else { + lan = element->language(); // remember thus recurses upward + // to get a language from the node, it's parents OR the doc + if ( lan.empty() || default_language == "none" ){ + lan = "default"; + } } auto const it = settings.find(lan); if ( it != settings.end() ){ if ( tokDebug >= 2 ){ - LOG << "[tokenizeElement] Found a supported language! " << lan << endl; + LOG << "[tokenizeElement] Found a supported language: " << lan << endl; } } else if ( !default_language.empty() ){ @@ -630,12 +797,7 @@ if ( tokDebug >= 2 ){ LOG << "[tokenizeElement] set language to " << lan << endl; } - folia::KWargs args; - args["class"] = lan; - args["set"] = ISO_SET; - folia::LangAnnotation *node = new folia::LangAnnotation( element->doc() ); - node->setAttributes( args ); - element->append( node ); + set_language( element, lan ); } tokenizeSentenceElement( element, lan ); return; @@ -647,9 +809,27 @@ for ( size_t i = 0; i < element->size(); i++) { tokenizeElement( element->index(i)); } + if ( text_redundancy == "full" ){ + if ( tokDebug > 0 ) { + LOG << "[tokenizeElement] Creating text on " << element->id() << endl; + } + appendText( element, outputclass ); + } + else if ( text_redundancy == "none" ){ + if ( tokDebug > 0 ) { + LOG << "[tokenizeElement] Removing text from: " << element->id() << endl; + } + removeText( element, outputclass ); + } return; } + int split_nl( const UnicodeString& line, + vector& parts ){ + static TiCC::UnicodeRegexMatcher nl_split( "\\n", "newline_splitter" ); + return nl_split.split( line, parts ); + } + void TokenizerClass::tokenizeSentenceElement( folia::FoliaElement *element, const string& lang ){ folia::Document *doc = element->doc(); @@ -662,7 +842,7 @@ "annotator='ucto', annotatortype='auto', datetime='now()'" ); } if ( tokDebug > 0 ){ - cerr << "tokenize sentence element: " << element->id() << endl; + LOG << "[tokenizeSentenceElement] " << element->id() << endl; } UnicodeString line = element->stricttext( inputclass ); if ( line.isEmpty() ){ @@ -679,17 +859,32 @@ passthruLine( line, bos ); } else { - tokenizeLine( line, lang ); + // folia may encode newlines. These should be converted to
nodes + // but Linebreak and newline handling is very dangerous and complicated + // so for now is is disabled! + vector parts; + parts.push_back( line ); // just one part + //split_nl( line, parts ); // disabled multipart + for ( auto const& l : parts ){ + if ( tokDebug >= 1 ){ + LOG << "[tokenizeSentenceElement] tokenize part: " << l << endl; + } + tokenizeLine( l, lang, element->id() ); + if ( &l != &parts.back() ){ + // append ' + Token T( "type_linebreak", "\n", LINEBREAK, "" ); + if ( tokDebug >= 1 ){ + LOG << "[tokenizeSentenceElement] added LINEBREAK token " << endl; + } + tokens.push_back( T ); + } + } } //ignore EOL data, we have by definition only one sentence: int numS = countSentences(true); //force buffer to empty vector outputTokens; - for (int i = 0; i < numS; i++) { - vector v = getSentence( i ); - outputTokens.insert( outputTokens.end(), v.begin(), v.end() ); - } + extractSentencesAndFlush( numS, outputTokens, lang ); outputTokensXML( element, outputTokens, 0 ); - flushSentences( numS, lang ); } void TokenizerClass::outputTokensDoc_init( folia::Document& doc ) const { @@ -707,25 +902,6 @@ doc.append( text ); } - void TokenizerClass::outputTokensDoc( folia::Document& doc, - const vector& tv ) const { - folia::FoliaElement *root = doc.doc()->index(0); - string lan = doc.doc()->language(); - if ( lan.empty() ){ - if ( tokDebug >= 1 ){ - LOG << "[outputTokensDoc] SET docuemnt language=" - << default_language << endl; - } - doc.set_metadata( "language", default_language ); - } - else { - if ( tokDebug >= 2 ){ - LOG << "[outputTokensDoc] Document has language " << lan << endl; - } - } - outputTokensXML(root, tv ); - } - int TokenizerClass::outputTokensXML( folia::FoliaElement *root, const vector& tv, int parCount ) const { @@ -741,28 +917,41 @@ if ( root->isinstance( folia::Sentence_t ) ){ root_is_sentence = true; } - else if ( root->isinstance( folia::Paragraph_t ) + else if ( root->isinstance( folia::Paragraph_t ) //TODO: can't we do this smarter? || root->isinstance( folia::Head_t ) || root->isinstance( folia::Note_t ) || root->isinstance( folia::ListItem_t ) || root->isinstance( folia::Part_t ) + || root->isinstance( folia::Utterance_t ) || root->isinstance( folia::Caption_t ) + || root->isinstance( folia::Cell_t ) || root->isinstance( folia::Event_t ) ){ root_is_structure_element = true; } bool in_paragraph = false; for ( const auto& token : tv ) { - if ( ( !root_is_structure_element && !root_is_sentence ) + if ( ( !root_is_structure_element && !root_is_sentence ) //TODO: instead of !root_is_structurel check if is_structure and accepts paragraphs? && ( (token.role & NEWPARAGRAPH) || !in_paragraph ) ) { - if ( in_paragraph ){ - appendText( root, outputclass ); - root = root->parent(); - } if ( tokDebug > 0 ) { LOG << "[outputTokensXML] Creating paragraph" << endl; } + if ( in_paragraph ){ + if ( text_redundancy == "full" ){ + if ( tokDebug > 0 ) { + LOG << "[outputTokensXML] Creating text on root: " << root->id() << endl; + } + appendText( root, outputclass ); + } + else if ( text_redundancy == "none" ){ + if ( tokDebug > 0 ) { + LOG << "[outputTokensXML] Removing text from root: " << root->id() << endl; + } + removeText( root, outputclass ); + } + root = root->parent(); + } folia::KWargs args; args["id"] = root->doc()->id() + ".p." 
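// [Illustrative sketch, not part of the patch] split_nl() above hides a regex
// split behind TiCC::UnicodeRegexMatcher; the removed src/unicode.cxx further
// below shows that such a wrapper delegates to ICU's RegexMatcher::split().
// A self-contained ICU-only version; the function name and the 256-part cap
// are assumptions made for this example.
#include <vector>
#include "unicode/regex.h"

std::vector<icu::UnicodeString> split_on_newlines( const icu::UnicodeString& line ){
  UErrorCode status = U_ZERO_ERROR;
  icu::RegexMatcher nl( icu::UnicodeString( "\\n" ), 0, status ); // compile the newline pattern
  const int32_t maxParts = 256;
  icu::UnicodeString parts[maxParts];
  int32_t n = nl.split( line, parts, maxParts, status );          // fills parts[0..n-1]
  std::vector<icu::UnicodeString> result;
  if ( U_SUCCESS( status ) ){
    result.assign( parts, parts + n );
  }
  return result;
}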
+ toString(++parCount); folia::FoliaElement *p = new folia::Paragraph( args, root->doc() ); @@ -782,12 +971,27 @@ LOG << "[outputTokensXML] back to " << root->classname() << endl; } } - if (( token.role & BEGINOFSENTENCE) && (!root_is_sentence)) { + if ( ( token.role & LINEBREAK) ){ + if (tokDebug > 0) { + LOG << "[outputTokensXML] LINEBREAK!" << endl; + } + folia::FoliaElement *lb = new folia::Linebreak(); + root->append( lb ); + if (tokDebug > 0){ + LOG << "[outputTokensXML] back to " << root->classname() << endl; + } + } + if ( ( token.role & BEGINOFSENTENCE) + && !root_is_sentence + && !root->isinstance( folia::Utterance_t ) ) { folia::KWargs args; - if ( root->id().empty() ) - args["generate_id"] = root->parent()->id(); - else - args["generate_id"] = root->id(); + string id = root->id(); + if ( id.empty() ){ + id = root->parent()->id(); + } + if ( !id.empty() ){ + args["generate_id"] = id; + } if ( tokDebug > 0 ) { LOG << "[outputTokensXML] Creating sentence in '" << args["generate_id"] << "'" << endl; @@ -807,62 +1011,86 @@ } s->doc()->declare( folia::AnnotationType::LANG, ISO_SET, "annotator='ucto'" ); - folia::KWargs args; - args["class"] = tok_lan; - args["set"] = ISO_SET; - folia::LangAnnotation *node = new folia::LangAnnotation( s->doc() ); - node->setAttributes( args ); - s->append( node ); + set_language( s, tok_lan ); } root = s; lastS = root; } - if (tokDebug > 0) { - LOG << "[outputTokensXML] Creating word element for " << token.us << endl; - } - folia::KWargs args; - args["generate_id"] = lastS->id(); - args["class"] = folia::UnicodeToUTF8( token.type ); - if ( passthru ){ - args["set"] = "passthru"; - } - else { - auto it = settings.find(token.lc); - if ( it == settings.end() ){ - it = settings.find("default"); + if ( !(token.role & LINEBREAK) ){ + if (tokDebug > 0) { + LOG << "[outputTokensXML] Creating word element for " << token.us << endl; + } + folia::KWargs args; + string id = lastS->id(); + if ( id.empty() ){ + id = lastS->parent()->id(); + } + if ( !id.empty() ){ + args["generate_id"] = id; + } + args["class"] = TiCC::UnicodeToUTF8( token.type ); + if ( passthru ){ + args["set"] = "passthru"; + } + else { + auto it = settings.find(token.lc); + if ( it == settings.end() ){ + it = settings.find("default"); + } + args["set"] = it->second->set_file; + } + if ( token.role & NOSPACE) { + args["space"]= "no"; + } + if ( outputclass != inputclass || outputclass != "current" ){ + args["textclass"] = outputclass; + } + folia::FoliaElement *w = new folia::Word( args, root->doc() ); + root->append( w ); + UnicodeString out = token.us; + if (lowercase) { + out.toLower(); + } + else if (uppercase) { + out.toUpper(); + } + w->settext( TiCC::UnicodeToUTF8( out ), outputclass ); + if ( tokDebug > 1 ) { + LOG << "created " << w << " text= " << token.us << "(" << outputclass << ")" << endl; } - args["set"] = it->second->set_file; - } - if ( token.role & NOSPACE) { - args["space"]= "no"; - } - folia::FoliaElement *w = new folia::Word( args, root->doc() ); - UnicodeString out = token.us; - if (lowercase) { - out.toLower(); - } - else if (uppercase) { - out.toUpper(); } - w->settext( folia::UnicodeToUTF8( out ), outputclass ); - // LOG << "created " << w << " text= " << token.us << endl; - root->append( w ); if ( token.role & BEGINQUOTE) { if (tokDebug > 0) { LOG << "[outputTokensXML] Creating quote element" << endl; } - folia::FoliaElement *q = new folia::Quote( folia::getArgs( "generate_id='" + root->id() + "'"), - root->doc() ); + folia::KWargs args; + string id = root->id(); + if ( 
id.empty() ){ + id = root->parent()->id(); + } + if ( !id.empty() ){ + args["generate_id"] = id; + } + folia::FoliaElement *q = new folia::Quote( args, root->doc() ); // LOG << "created " << q << endl; root->append( q ); root = q; quotelevel++; } - if ( ( token.role & ENDOFSENTENCE) && (!root_is_sentence) ) { + if ( ( token.role & ENDOFSENTENCE ) && (!root_is_sentence) && (!root->isinstance(folia::Utterance_t))) { if (tokDebug > 0) { LOG << "[outputTokensXML] End of sentence" << endl; } - appendText( root, outputclass ); + if ( text_redundancy == "full" ){ + appendText( root, outputclass ); + } + else if ( text_redundancy == "none" ){ + removeText( root, outputclass ); + } + if ( token.role & LINEBREAK ){ + folia::FoliaElement *lb = new folia::Linebreak(); + root->append( lb ); + } root = root->parent(); lastS = root; if (tokDebug > 0){ @@ -872,7 +1100,21 @@ in_paragraph = true; } if ( tv.size() > 0 ){ - appendText( root, outputclass ); + if ( text_redundancy == "full" ){ + if ( tokDebug > 0 ) { + LOG << "[outputTokensXML] Creating text on root: " << root->id() << endl; + } + appendText( root, outputclass ); + } + else if ( text_redundancy == "none" ){ + if ( tokDebug > 0 ) { + LOG << "[outputTokensXML] Removing text from root: " << root->id() << endl; + } + removeText( root, outputclass ); + } + } + if ( tokDebug > 0 ) { + LOG << "[outputTokensXML] Done. parCount= " << parCount << endl; } return parCount; } @@ -886,6 +1128,9 @@ short quotelevel = 0; bool first = true; for ( const auto token : tokens ) { + if (tokDebug >= 5){ + LOG << "outputTokens: token=" << token << endl; + } if ( detectPar && (token.role & NEWPARAGRAPH) && !verbose @@ -925,31 +1170,50 @@ OUT << endl; } } - else if ( quotelevel == 0 ) { - if (sentenceperlineoutput) { - OUT << endl; + else { + if ( quotelevel == 0 ) { + if (sentenceperlineoutput) { + OUT << endl; + } + else { + OUT << " " + eosmark + " "; + } + if ( splitOnly ){ + OUT << endl; + } } - else { - OUT << " " + eosmark; + else { //inside quotation + if ( splitOnly + && !(token.role & NOSPACE ) ){ + OUT << " "; + } } } } if ( ( &token != &(*tokens.rbegin()) ) && !verbose ) { if ( !( (token.role & ENDOFSENTENCE) - && sentenceperlineoutput ) ) { - OUT << " "; - //FBK: ADD SPACE WITHIN QUOTE CONTEXT IN ANY CASE + && sentenceperlineoutput + && !splitOnly ) ){ + if ( !(token.role & ENDOFSENTENCE) ){ + if ( splitOnly + && (token.role & NOSPACE) ){ + } + else { + OUT << " "; + } + } } else if ( (quotelevel > 0) && sentenceperlineoutput ) { + //FBK: ADD SPACE WITHIN QUOTE CONTEXT IN ANY CASE OUT << " "; } } } } - int TokenizerClass::countSentences(bool forceentirebuffer) { + int TokenizerClass::countSentences( bool forceentirebuffer ) { //Return the number of *completed* sentences in the token buffer //Performs extra sanity checks at the same time! Making sure @@ -1053,14 +1317,21 @@ short quotelevel = 0; size_t begin = 0; size_t end = 0; - for ( int i = 0; i < size; i++) { - if (tokens[i].role & NEWPARAGRAPH) quotelevel = 0; - if (tokens[i].role & ENDQUOTE) quotelevel--; - if ((tokens[i].role & BEGINOFSENTENCE) && (quotelevel == 0)) { + for ( int i = 0; i < size; ++i ) { + if (tokens[i].role & NEWPARAGRAPH) { + quotelevel = 0; + } + else if (tokens[i].role & ENDQUOTE) { + --quotelevel; + } + if ( (tokens[i].role & BEGINOFSENTENCE) + && (quotelevel == 0)) { begin = i; } //FBK: QUOTELEVEL GOES UP BEFORE begin IS UPDATED... 
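// [Editorial aside, not part of the patch] The "use the element's id, or fall
// back to the parent's id" dance above now occurs three times in
// outputTokensXML() (sentence, word and quote creation). A tiny helper along
// these lines (hypothetical, not in ucto) would remove the repetition; it only
// uses FoliaElement::id() and parent(), both already used in this file.
#include <string>

static std::string generate_id_base( folia::FoliaElement *e ){
  std::string id = e->id();
  if ( id.empty() ){
    id = e->parent()->id();   // fall back to the enclosing element's id
  }
  return id;                  // may still be empty; callers must check before use
}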
RESULTS IN DUPLICATE OUTPUT - if (tokens[i].role & BEGINQUOTE) quotelevel++; + if (tokens[i].role & BEGINQUOTE) { + ++quotelevel; + } if ((tokens[i].role & ENDOFSENTENCE) && (quotelevel == 0)) { if (count == index) { @@ -1074,7 +1345,7 @@ } return outToks; } - count++; + ++count; } } throw uRangeError( "No sentence exists with the specified index: " @@ -1529,7 +1800,7 @@ } if ( doPunctFilter && ( type == type_punctuation || type == type_currency || - type == type_emoticon ) ) { + type == type_emoticon || type == type_picto ) ) { if (tokDebug >= 2 ){ LOG << " [passThruLine] skipped PUNCTUATION [" << input << "]" << endl; @@ -1592,7 +1863,7 @@ } if ( doPunctFilter && ( type == type_punctuation || type == type_currency || - type == type_emoticon ) ) { + type == type_emoticon || type == type_picto ) ) { if (tokDebug >= 2 ){ LOG << " [passThruLine] skipped PUNCTUATION [" << input << "]" << endl; @@ -1654,7 +1925,13 @@ int TokenizerClass::tokenizeLine( const string& s, const string& lang ){ UnicodeString uinputstring = convert( s, inputEncoding ); - return tokenizeLine( uinputstring, lang ); + return tokenizeLine( uinputstring, lang, "" ); + } + + // UnicodeString wrapper + int TokenizerClass::tokenizeLine( const UnicodeString& u, + const string& lang ){ + return tokenizeLine( u, lang, "" ); } bool u_isemo( UChar32 c ){ @@ -1662,6 +1939,11 @@ return s == UBLOCK_EMOTICONS; } + bool u_ispicto( UChar32 c ){ + UBlockCode s = ublock_getCode(c); + return s == UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS ; + } + bool u_iscurrency( UChar32 c ){ return u_charType( c ) == U_CURRENCY_SYMBOL; } @@ -1686,6 +1968,9 @@ else if ( u_isemo( c ) ) { return type_emoticon; } + else if ( u_ispicto( c ) ) { + return type_picto; + } else if ( u_isalpha(c)) { return type_word; } @@ -1769,7 +2054,8 @@ } int TokenizerClass::tokenizeLine( const UnicodeString& originput, - const string& _lang ){ + const string& _lang, + const string& id ){ string lang = _lang; if ( lang.empty() ){ lang = "default"; @@ -1791,7 +2077,14 @@ input = settings[lang]->filter.filter( input ); } if ( input.isBogus() ){ //only tokenize valid input - *theErrLog << "ERROR: Invalid UTF-8 in line!:" << input << endl; + if ( id.empty() ){ + LOG << "ERROR: Invalid UTF-8 in line:" << linenum << endl + << " '" << input << "'" << endl; + } + else { + LOG << "ERROR: Invalid UTF-8 in element:" << id << endl + << " '" << input << "'" << endl; + } return 0; } int32_t len = input.countChar32(); @@ -1811,16 +2104,18 @@ UnicodeString word; StringCharacterIterator sit(input); long int i = 0; + long int tok_size = 0; while ( sit.hasNext() ){ UChar32 c = sit.current32(); if ( tokDebug > 8 ){ UnicodeString s = c; int8_t charT = u_charType( c ); LOG << "examine character: " << s << " type= " - << toString( charT ) << endl; + << toString( charT ) << endl; } if (reset) { //reset values for new word reset = false; + tok_size = 0; if (!u_isspace(c)) word = c; else @@ -1912,6 +2207,22 @@ } sit.next32(); ++i; + ++tok_size; + if ( tok_size > 2500 ){ + if ( id.empty() ){ + LOG << "Ridiculously long word/token (over 2500 characters) detected " + << "in line: " << linenum << ". Skipped ..." << endl; + LOG << "The line starts with " << UnicodeString( word, 0, 75 ) + << "..." << endl; + } + else { + LOG << "Ridiculously long word/token (over 2500 characters) detected " + << "in element: " << id << ". Skipped ..." << endl; + LOG << "The text starts with " << UnicodeString( word, 0, 75 ) + << "..." 
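// [Illustrative sketch, not part of the patch] The new u_ispicto() and the
// existing u_isemo() above both key on ICU's Unicode-block lookup, while
// u_iscurrency() keys on the general category. A self-contained version of
// that classification chain; detected_class() and the sample code points are
// assumptions for illustration only.
#include "unicode/uchar.h"

const char *detected_class( UChar32 c ){
  UBlockCode b = ublock_getCode( c );
  if ( b == UBLOCK_EMOTICONS )
    return "emoticon";                 // block U+1F600..U+1F64F
  if ( b == UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS )
    return "picto";                    // block U+1F300..U+1F5FF
  if ( u_charType( c ) == U_CURRENCY_SYMBOL )
    return "currency";                 // e.g. U+20AC EURO SIGN
  if ( u_isalpha( c ) )
    return "word";
  return "other";
}
// e.g. detected_class( 0x1F600 ) yields "emoticon", detected_class( 0x1F30D ) yields "picto"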
<< endl; + } + return 0; + } } int numNewTokens = tokens.size() - begintokencount; if ( numNewTokens > 0 ){ @@ -1919,22 +2230,20 @@ tokens[begintokencount].role |= NEWPARAGRAPH | BEGINOFSENTENCE; paragraphsignal = false; } - if ( detectBounds ){ - //find sentence boundaries - if (sentenceperlineinput) { - tokens[begintokencount].role |= BEGINOFSENTENCE; - tokens.back().role |= ENDOFSENTENCE; - if ( detectQuotes ){ - detectQuotedSentenceBounds( begintokencount ); - } + //find sentence boundaries + if (sentenceperlineinput) { + tokens[begintokencount].role |= BEGINOFSENTENCE; + tokens.back().role |= ENDOFSENTENCE; + if ( detectQuotes ){ + detectQuotedSentenceBounds( begintokencount ); + } + } + else { + if ( detectQuotes ){ + detectQuotedSentenceBounds( begintokencount ); } else { - if ( detectQuotes ){ - detectQuotedSentenceBounds( begintokencount ); - } - else { - detectSentenceBounds( begintokencount ); - } + detectSentenceBounds( begintokencount ); } } } @@ -1951,12 +2260,13 @@ if ( tokDebug > 2 ){ if ( recurse ){ LOG << " [tokenizeWord] Recurse Input: (" << inpLen << ") " - << "word=[" << input << "], type=" << assigned_type << endl; + << "word=[" << input << "], type=" << assigned_type + << " Space=" << (space?"TRUE":"FALSE") << endl; } else { LOG << " [tokenizeWord] Input: (" << inpLen << ") " - << "word=[" << input << "]" << endl; - } + << "word=[" << input << "]" + << " Space=" << (space?"TRUE":"FALSE") << endl; } } if ( input == eosmark ) { if (tokDebug >= 2){ @@ -1983,7 +2293,7 @@ } if ( doPunctFilter && ( type == type_punctuation || type == type_currency || - type == type_emoticon ) ) { + type == type_emoticon || type == type_picto ) ) { if (tokDebug >= 2 ){ LOG << " [tokenizeWord] skipped PUNCTUATION [" << input << "]" << endl; @@ -2064,8 +2374,8 @@ } for ( int m=0; m < max; ++m ){ if ( tokDebug >= 4 ){ - LOG << "\tTOKEN match[" << m << "] = " - << matches[m] << endl; + LOG << "\tTOKEN match[" << m << "] = " << matches[m] + << " Space=" << (space?"TRUE":"FALSE") << endl; } if ( doPunctFilter && (&rule->id)->startsWith("PUNCTUATION") ){ @@ -2083,6 +2393,9 @@ if ( post.length() > 0 ) { internal_space = false; } + else if ( m < max-1 ){ + internal_space = false; + } UnicodeString word = matches[m]; if ( norm_set.find( type ) != norm_set.end() ){ word = "{{" + type + "}}"; @@ -2113,7 +2426,7 @@ break; } } - if ( ! a_rule_matched ){ + if ( !a_rule_matched ){ // no rule matched if ( tokDebug >=4 ){ LOG << "\tthere's no match at all" << endl; @@ -2123,10 +2436,10 @@ } } - bool TokenizerClass::init( const string& fname ){ + bool TokenizerClass::init( const string& fname, const string& tname ){ LOG << "Initiating tokeniser..." << endl; Setting *set = new Setting(); - if ( !set->read( fname, tokDebug, theErrLog ) ){ + if ( !set->read( fname, tname, tokDebug, theErrLog ) ){ LOG << "Cannot read Tokeniser settingsfile " << fname << endl; LOG << "Unsupported language? (Did you install the uctodata package?)" << endl; @@ -2143,30 +2456,39 @@ } LOG << "EOS markers: " << set->eosmarkers << endl; LOG << "Quotations: " << set->quotes << endl; - LOG << "Filter: " << set->filter << endl; + try { + LOG << "Filter: " << set->filter << endl; + } + catch (...){ + } } return true; } - bool TokenizerClass::init( const vector& languages ){ + bool TokenizerClass::init( const vector& languages, + const string& tname ){ if ( tokDebug > 0 ){ LOG << "Initiating tokeniser from language list..." 
<< endl; } - Setting *defalt = 0; + Setting *default_set = 0; for ( const auto& lang : languages ){ if ( tokDebug > 0 ){ LOG << "init language=" << lang << endl; } string fname = "tokconfig-" + lang; Setting *set = new Setting(); - if ( !set->read( fname, tokDebug, theErrLog ) ){ + string add; + if ( default_set == 0 ){ + add = tname; + } + if ( !set->read( fname, add, tokDebug, theErrLog ) ){ LOG << "problem reading datafile for language: " << lang << endl; LOG << "Unsupported language (Did you install the uctodata package?)" << endl; } else { - if ( defalt == 0 ){ - defalt = set; + if ( default_set == 0 ){ + default_set = set; settings["default"] = set; default_language = lang; } @@ -2174,7 +2496,7 @@ } } if ( settings.empty() ){ - cerr << "No useful settingsfile(s) could be found." << endl; + cerr << "ucto: No useful settingsfile(s) could be found." << endl; return false; } return true; diff -Nru ucto-0.9.6/src/ucto.cxx ucto-0.14/src/ucto.cxx --- ucto-0.9.6/src/ucto.cxx 2017-01-23 12:03:47.000000000 +0000 +++ ucto-0.14/src/ucto.cxx 2018-11-20 09:07:46.000000000 +0000 @@ -1,5 +1,5 @@ /* - Copyright (c) 2006 - 2017 + Copyright (c) 2006 - 2018 CLST - Radboud University ILK - Tilburg University @@ -35,8 +35,8 @@ #include "libfolia/folia.h" #include "ticcutils/CommandLine.h" #include "ticcutils/PrettyPrint.h" -#include "ucto/unicode.h" -#include "ucto/textcat.h" +#include "ticcutils/Unicode.h" +#include "ucto/my_textcat.h" #include "ucto/setting.h" #include "ucto/tokenize.h" #include @@ -45,39 +45,55 @@ using namespace Tokenizer; void usage(){ + set languages = Setting::installed_languages(); cerr << "Usage: " << endl; cerr << "\tucto [[options]] [input-file] [[output-file]]" << endl << "Options:" << endl - << "\t-c - Explicitly specify a configuration file" << endl - << "\t-d - set debug level" << endl - << "\t-e - set input encoding (default UTF8)" << endl - << "\t-N - set output normalization (default NFC)" << endl - << "\t-f - Disable filtering of special characters" << endl - << "\t-h or --help - this message" << endl - << "\t-L - Automatically selects a configuration file by language code. (default 'generic')" << endl - << "\t-l - Convert to all lowercase" << endl - << "\t-u - Convert to all uppercase" << endl - << "\t-n - One sentence per line (output)" << endl - << "\t-m - One sentence per line (input)" << endl - << "\t-v - Verbose mode" << endl - << "\t-s - End-of-Sentence marker (default: )" << endl - << "\t--passthru - Don't tokenize, but perform input decoding and simple token role detection" << endl + << "\t-c - Explicitly specify a configuration file" << endl + << "\t-d - set debug level" << endl + << "\t-e - set input encoding (default UTF8)" << endl + << "\t-N - set output normalization (default NFC)" << endl + << "\t--filter=[YES|NO] - Disable filtering of special characters" << endl + << "\t-f - OBSOLETE. use --filter=NO" << endl + << "\t-h or --help - this message" << endl + << "\t-L - Automatically selects a configuration file by language code." 
<< endl + << "\t - Available Languages:" << endl + << "\t "; + for( const auto l : languages ){ + cerr << l << ","; + } + cerr << endl; + cerr << "\t-l - Convert to all lowercase" << endl + << "\t-u - Convert to all uppercase" << endl + << "\t-n - One sentence per line (output)" << endl + << "\t-m - One sentence per line (input)" << endl + << "\t-v - Verbose mode" << endl + << "\t-s - End-of-Sentence marker (default: )" << endl + << "\t--passthru - Don't tokenize, but perform input decoding and simple token role detection" << endl << "\t--normalize=,class2>,... " << endl - << "\t - For class1, class2, etc. output the class tokens instead of the tokens itself." << endl - << "\t--filterpunct - remove all punctuation from the output" << endl - << "\t--detectlanguages= - try to detect languages. Default = 'lang1'" << endl - << "\t-P - Disable paragraph detection" << endl - << "\t-S - Disable sentence detection!" << endl - << "\t-Q - Enable quote detection (experimental)" << endl - << "\t-V or --version - Show version information" << endl - << "\t-x - Output FoLiA XML, use the specified Document ID (obsolete)" << endl - << "\t-F - Input file is in FoLiA XML. All untokenised sentences will be tokenised." << endl - << "\t-X - Output FoLiA XML, use the Document ID specified with --id=" << endl - << "\t--id - use the specified Document ID to label the FoLia doc." << endl - << "\t--textclass - use the specified class to search text in the FoLia doc. (deprecated. use --inputclass)" << endl - << "\t--inputclass - use the specified class to search text in the FoLia doc." << endl - << "\t--outputclass - use the specified class to output text in the FoLia doc. (default is 'current'. changing this is dangerous!)" << endl - << "\t (-x and -F disable usage of most other options: -nPQVsS)" << endl; + << "\t - For class1, class2, etc. output the class tokens instead of the tokens itself." << endl + << "\t-T or --textredundancy=[full|minimal|none] - set text redundancy level for text nodes in FoLiA output: " << endl + << "\t 'full' - add text to all levels:
<p> <s> <w>
etc." << endl + << "\t 'minimal' - don't introduce text on higher levels, but retain what is already there." << endl + << "\t 'none' - only introduce text on , AND remove all text from higher levels" << endl + << "\t--filterpunct - remove all punctuation from the output" << endl + << "\t--uselanguages= - Using FoLiA input, only tokenize strings in these languages. Default = 'lang1'" << endl + << "\t--detectlanguages= - try to assign a language to each line of text input. Default = 'lang1'" << endl + << "\t--add-tokens='file' - add additional tokens to the [TOKENS] of the" << endl + << "\t default language. TOKENS are always kept intact." << endl + << "\t-P - Disable paragraph detection" << endl + << "\t-Q - Enable quote detection (experimental)" << endl + << "\t-V or --version - Show version information" << endl + << "\t-x - Output FoLiA XML, use the specified Document ID (obsolete)" << endl + << "\t-F - Input file is in FoLiA XML. All untokenised sentences will be tokenised." << endl + << "\t -F is automatically set when inputfile has extension '.xml'" << endl + << "\t-X - Output FoLiA XML, use the Document ID specified with --id=" << endl + << "\t--id - use the specified Document ID to label the FoLia doc." << endl + << " -X is automatically set when inputfile has extension '.xml'" << endl + << "\t--inputclass - use the specified class to search text in the FoLia doc.(default is 'current')" << endl + << "\t--outputclass - use the specified class to output text in the FoLia doc. (default is 'current')" << endl + << "\t--textclass - use the specified class for both input and output of text in the FoLia doc. (default is 'current'). Implies --filter=NO." << endl + << "\t (-x and -F disable usage of most other options: -nPQVs)" << endl; } int main( int argc, char *argv[] ){ @@ -88,29 +104,32 @@ bool sentenceperlineinput = false; bool paragraphdetection = true; bool quotedetection = false; + bool do_language_detect = false; bool dofiltering = true; bool dopunctfilter = false; - bool splitsentences = true; bool xmlin = false; bool xmlout = false; bool verbose = false; + string redundancy = "minimal"; string eosmarker = ""; string docid = "untitleddoc"; - string inputclass = "current"; - string outputclass = "current"; string normalization = "NFC"; string inputEncoding = "UTF-8"; + string inputclass = "current"; + string outputclass = "current"; vector language_list; string cfile; string ifile; string ofile; string c_file; bool passThru = false; + bool sentencesplit = false; string norm_set_string; + string add_tokens; try { - TiCC::CL_Options Opts( "d:e:fhlPQunmN:vVSL:c:s:x:FX", - "filterpunct,passthru,textclass:,inputclass:,outputclass:,normalize:,id:,version,help,detectlanguages:"); + TiCC::CL_Options Opts( "d:e:fhlPQunmN:vVL:c:s:x:FXT:", + "filter:,filterpunct,passthru,textclass:,inputclass:,outputclass:,normalize:,id:,version,help,detectlanguages:,uselanguages:,textredundancy:,add-tokens:,split"); Opts.init(argc, argv ); if ( Opts.extract( 'h' ) || Opts.extract( "help" ) ){ @@ -120,23 +139,30 @@ if ( Opts.extract( 'V' ) || Opts.extract( "version" ) ){ cout << "Ucto - Unicode Tokenizer - version " << Version() << endl - << "(c) ILK 2009 - 2014, Induction of Linguistic Knowledge Research Group, Tilburg University" << endl + << "(c) CLST 2015 - 2018, Centre for Language and Speech Technology, Radboud University Nijmegen" << endl + << "(c) ILK 2009 - 2015, Induction of Linguistic Knowledge Research Group, Tilburg University" << endl << "Licensed under the GNU General Public License v3" << endl; 
cout << "based on [" << folia::VersionName() << "]" << endl; return EXIT_SUCCESS; } Opts.extract('e', inputEncoding ); - dofiltering = !Opts.extract( 'f' ); dopunctfilter = Opts.extract( "filterpunct" ); paragraphdetection = !Opts.extract( 'P' ); - splitsentences = !Opts.extract( 'S' ); xmlin = Opts.extract( 'F' ); quotedetection = Opts.extract( 'Q' ); Opts.extract( 's', eosmarker ); touppercase = Opts.extract( 'u' ); tolowercase = Opts.extract( 'l' ); + sentencesplit = Opts.extract( "split" ); sentenceperlineoutput = Opts.extract( 'n' ); sentenceperlineinput = Opts.extract( 'm' ); + Opts.extract( 'T', redundancy ); + Opts.extract( "textredundancy", redundancy ); + if ( redundancy != "full" + && redundancy != "minimal" + && redundancy != "none" ){ + throw TiCC::OptionError( "unknown textredundancy level: " + redundancy ); + } Opts.extract( 'N', normalization ); verbose = Opts.extract( 'v' ); if ( Opts.extract( 'x', docid ) ){ @@ -152,10 +178,46 @@ xmlout = Opts.extract( 'X' ); Opts.extract( "id", docid ); } + if ( sentencesplit ){ + if ( xmlout ){ + throw TiCC::OptionError( "conflicting options --split and -x or -X" ); + } + // sentenceperlineoutput = true; + } passThru = Opts.extract( "passthru" ); - Opts.extract( "textclass", inputclass ); + string textclass; + Opts.extract( "textclass", textclass ); Opts.extract( "inputclass", inputclass ); Opts.extract( "outputclass", outputclass ); + if ( !textclass.empty() ){ + if ( inputclass != "current" ){ + throw TiCC::OptionError( "--textclass conflicts with --inputclass" ); + } + if ( outputclass != "current" ){ + throw TiCC::OptionError( "--textclass conflicts with --outputclass"); + } + inputclass = textclass; + outputclass = textclass; + } + if ( Opts.extract( 'f' ) ){ + cerr << "ucto: The -f option is used. Please consider using --filter=NO" << endl; + dofiltering = false; + } + Opts.extract( "add-tokens", add_tokens ); + string value; + if ( Opts.extract( "filter", value ) ){ + bool result; + if ( !TiCC::stringTo( value, result ) ){ + throw TiCC::OptionError( "illegal value for '--filter' option. (boolean expected)" ); + } + dofiltering = result; + } + if ( dofiltering && xmlin && outputclass == inputclass ){ + // we cannot mangle the original inputclass, so disable filtering + cerr << "ucto: --filter=NO is automatically set. inputclass equals outputclass!" + << endl; + dofiltering = false; + } if ( xmlin && outputclass.empty() ){ if ( dopunctfilter ){ throw TiCC::OptionError( "--outputclass required for --filterpunct on FoLiA input "); @@ -167,38 +229,52 @@ throw TiCC::OptionError( "--outputclass required for -l on FoLiA input "); } } - string value; if ( Opts.extract('d', value ) ){ if ( !TiCC::stringTo(value,debug) ){ throw TiCC::OptionError( "invalid value for -d: " + value ); } } + bool use_lang = Opts.is_present( "uselanguages" ); + bool detect_lang = Opts.is_present( "detectlanguages" ); if ( Opts.is_present('L') ) { if ( Opts.is_present('c') ){ - cerr << "Error: -L and -c options conflict. Use only one of them." << endl; - return EXIT_FAILURE; + throw TiCC::OptionError( "-L and -c options conflict. Use only one of these." ); } - else if ( Opts.is_present( "detectlanguages" ) ){ - cerr << "Error: -L and --detectlanguages options conflict. Use only one of them." << endl; - return EXIT_FAILURE; + else if ( detect_lang ){ + throw TiCC::OptionError( "-L and --detectlanguages options conflict. Use only one of these." ); + } + else if ( use_lang ) { + throw TiCC::OptionError( "-L and --uselanguages options conflict. 
Use only one of these." ); } } - else if ( Opts.is_present( 'c' ) - && Opts.is_present( "detectlanguages" ) ){ - cerr << "Error: -c and --detectlanguages options conflict. Use only one of them." << endl; - return EXIT_FAILURE; + else if ( Opts.is_present( 'c' ) ){ + if ( detect_lang ){ + throw TiCC::OptionError( "-c and --detectlanguages options conflict. Use only one of these" ); + } + else if ( use_lang ){ + throw TiCC::OptionError( "-c and --uselanguages options conflict. Use only one of these." ); + } + } + if ( detect_lang && use_lang ){ + throw TiCC::OptionError( "--detectlanguages and --uselanguages options conflict. Use only one of these." ); } - Opts.extract( 'c', c_file ); + string languages; Opts.extract( "detectlanguages", languages ); - bool do_language_detect = !languages.empty(); - if ( do_language_detect ){ + if ( languages.empty() ){ + Opts.extract( "uselanguages", languages ); + } + else { + do_language_detect = true; + } + if ( !languages.empty() ){ if ( TiCC::split_at( languages, language_list, "," ) < 1 ){ throw TiCC::OptionError( "invalid language list: " + languages ); } } else { + // so --detectlanguages or --uselanguages string language; if ( Opts.extract('L', language ) ){ // support some backward compatability to old ISO 639-1 codes @@ -248,9 +324,22 @@ vector files = Opts.getMassOpts(); if ( files.size() > 0 ){ ifile = files[0]; + if ( TiCC::match_back( ifile, ".xml" ) ){ + xmlin = true; + } } - if ( files.size() > 1 ){ + if ( use_lang && !xmlin ){ + throw TiCC::OptionError( "--uselanguages is only valid for FoLiA input" ); + } + if ( files.size() == 2 ){ ofile = files[1]; + if ( TiCC::match_back( ofile, ".xml" ) ){ + xmlout = true; + } + } + if ( files.size() > 2 ){ + cerr << "found additional arguments on the commandline: " << files[2] + << "...." << endl; } } catch( const TiCC::OptionError& e ){ @@ -258,46 +347,92 @@ usage(); return EXIT_FAILURE; } - if ( !passThru ){ + set available_languages = Setting::installed_languages(); if ( !c_file.empty() ){ cfile = c_file; } else if ( language_list.empty() ){ - cfile = "tokconfig-generic"; + cerr << "ucto: missing a language specification (-L or --detectlanguages or --uselanguages option)" << endl; + if ( available_languages.size() == 1 + && *available_languages.begin() == "generic" ){ + cerr << "ucto: The uctodata package seems not to be installed." << endl; + cerr << "ucto: You can use '-L generic' to run a simple default tokenizer." + << endl; + cerr << "ucto: Installing uctodata is highly recommended." << endl; + } + else { + cerr << "ucto: Available Languages: "; + for( const auto& l : available_languages ){ + cerr << l << ","; + } + cerr << endl; + } + return EXIT_FAILURE; + } + else { + for ( const auto& l : language_list ){ + if ( available_languages.find(l) == available_languages.end() ){ + cerr << "ucto: unsupported language '" << l << "'" << endl; + if ( available_languages.size() == 1 + && *available_languages.begin() == "generic" ){ + cerr << "ucto: The uctodata package seems not to be installed." << endl; + cerr << "ucto: You can use '-L generic' to run a simple default tokenizer." + << endl; + cerr << "ucto: Installing uctodata is highly recommended." << endl; + } + else { + cerr << "ucto: Available Languages: "; + for( const auto& l : available_languages ){ + cerr << l << ","; + } + cerr << endl; + } + return EXIT_FAILURE; + } + } } } if ((!ifile.empty()) && (ifile == ofile)) { - cerr << "Error: Output file equals input file! Courageously refusing to start..." 
<< endl; + cerr << "ucto: Output file equals input file! Courageously refusing to start..." << endl; return EXIT_FAILURE; } - if ( !passThru ){ - cerr << "configfile = " << cfile << endl; - } - cerr << "inputfile = " << ifile << endl; - cerr << "outputfile = " << ofile << endl; + cerr << "ucto: inputfile = " << ifile << endl; + cerr << "ucto: outputfile = " << ofile << endl; istream *IN = 0; if (!xmlin) { - if ( ifile.empty() ) + if ( ifile.empty() ){ IN = &cin; + } else { IN = new ifstream( ifile ); if ( !IN || !IN->good() ){ - cerr << "Error: problems opening inputfile " << ifile << endl; - cerr << "Courageously refusing to start..." << endl; + cerr << "ucto: problems opening inputfile " << ifile << endl; + cerr << "ucto: Courageously refusing to start..." << endl; + delete IN; return EXIT_FAILURE; } } } ostream *OUT = 0; - if ( ofile.empty() ) + if ( ofile.empty() ){ OUT = &cout; + } else { OUT = new ofstream( ofile ); + if ( !OUT || !OUT->good() ){ + cerr << "ucto: problems opening outputfile " << ofile << endl; + cerr << "ucto: Courageously refusing to start..." << endl; + delete OUT; + if ( IN != &cin ){ + delete IN; + } + return EXIT_FAILURE; + } } try { @@ -309,21 +444,30 @@ } else { // init exept for passthru mode - if ( !cfile.empty() ){ - if ( !tokenizer.init( cfile ) ){ - return EXIT_FAILURE; + if ( !cfile.empty() + && !tokenizer.init( cfile, add_tokens ) ){ + if ( IN != &cin ){ + delete IN; } + if ( OUT != &cout ){ + delete OUT; + } + return EXIT_FAILURE; } - else { - if ( !tokenizer.init( language_list ) ){ - return EXIT_FAILURE; + else if ( !tokenizer.init( language_list, add_tokens ) ){ + if ( IN != &cin ){ + delete IN; } + if ( OUT != &cout ){ + delete OUT; + } + return EXIT_FAILURE; } } tokenizer.setEosMarker( eosmarker ); tokenizer.setVerbose( verbose ); - tokenizer.setSentenceDetection( splitsentences ); //detection of sentences + tokenizer.setSentenceSplit(sentencesplit); tokenizer.setSentencePerLineOutput(sentenceperlineoutput); tokenizer.setSentencePerLineInput(sentenceperlineinput); tokenizer.setLowercase(tolowercase); @@ -334,11 +478,13 @@ tokenizer.setNormalization( normalization ); tokenizer.setInputEncoding( inputEncoding ); tokenizer.setFiltering(dofiltering); + tokenizer.setLangDetection(do_language_detect); tokenizer.setPunctFilter(dopunctfilter); tokenizer.setInputClass(inputclass); tokenizer.setOutputClass(outputclass); tokenizer.setXMLOutput(xmlout, docid); tokenizer.setXMLInput(xmlin); + tokenizer.setTextRedundancy(redundancy); if (xmlin) { folia::Document doc; @@ -354,7 +500,7 @@ } } catch ( exception &e ){ - cerr << e.what() << endl; + cerr << "ucto: " << e.what() << endl; return EXIT_FAILURE; } diff -Nru ucto-0.9.6/src/unicode.cxx ucto-0.14/src/unicode.cxx --- ucto-0.9.6/src/unicode.cxx 2017-01-12 13:39:38.000000000 +0000 +++ ucto-0.14/src/unicode.cxx 1970-01-01 00:00:00.000000000 +0000 @@ -1,393 +0,0 @@ -/* - Copyright (c) 2006 - 2017 - CLST - Radboud University - ILK - Tilburg University - - This file is part of Ucto - - Ucto is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - Ucto is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
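// [Editorial aside, not part of the patch] Every error path above has to
// remember "delete IN unless it is &cin" and "delete OUT unless it is &cout".
// A smart pointer with a no-op deleter for the standard streams would express
// that once (hypothetical refactoring, not something this patch does):
#include <fstream>
#include <iostream>
#include <memory>
#include <string>

static std::shared_ptr<std::istream> open_input( const std::string& ifile ){
  if ( ifile.empty() ){
    // hand out std::cin, but make sure it is never deleted
    return std::shared_ptr<std::istream>( &std::cin, []( std::istream* ){} );
  }
  return std::make_shared<std::ifstream>( ifile );   // closed and deleted automatically
}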
- - You should have received a copy of the GNU General Public License - along with this program. If not, see . - - For questions and suggestions, see: - https://github.com/LanguageMachines/ucto/issues - or send mail to: - lamasoftware (at ) science.ru.nl -*/ - -#include -#include -#include -#include -#include "unicode/ustream.h" -#include "unicode/regex.h" -#include "unicode/ucnv.h" -#include "ticcutils/StringOps.h" -#include "ticcutils/PrettyPrint.h" -#include "libfolia/folia.h" -#include "ucto/unicode.h" - -using namespace std; - -namespace Tokenizer { - - UNormalizationMode toNorm( const string& enc ){ - UNormalizationMode mode = UNORM_NFC; - if ( enc == "NONE" ) - mode = UNORM_NONE; - else if ( enc == "NFD" ) - mode = UNORM_NFD; - else if ( enc == "NFC" ) - mode = UNORM_NFC; - else if ( enc == "NFKC" ) - mode = UNORM_NFKC; - else if ( enc == "NFKD" ) - mode = UNORM_NFKD; - else - throw std::logic_error( "invalid normalization mode: " + enc ); - return mode; - } - - inline string toString( UNormalizationMode mode ){ - switch ( mode ){ - case UNORM_NONE: - return "NONE"; - case UNORM_NFD: - return "NFD"; - case UNORM_NFC: - return "NFC"; - case UNORM_NFKC: - return "NFKC"; - case UNORM_NFKD: - return "NFKD"; - default: - throw std::logic_error( "invalid normalization mode in switch" ); - } - } - - std::string UnicodeNormalizer::getMode( ) const { - return toString( mode ); - } - - std::string UnicodeNormalizer::setMode( const std::string& s ) { - string res = getMode(); - mode = toNorm( s ); - return res; - } - - UnicodeString UnicodeNormalizer::normalize( const UnicodeString& us ){ - UnicodeString r; - UErrorCode status=U_ZERO_ERROR; - Normalizer::normalize( us, mode, 0, r, status ); - if (U_FAILURE(status)){ - throw std::invalid_argument("Normalizer"); - } - return r; - } - - ostream& operator<<( ostream& os, const UnicodeFilter& q ){ - if ( q.empty() ){ - os << "none" << endl; - } - else { - auto it=q.the_map.cbegin(); - while ( it != q.the_map.cend() ){ - os << folia::UnicodeToUTF8(UnicodeString(it->first)) << "\t" << it->second << endl; - ++it; - } - } - return os; - } - - UnicodeString UnicodeFilter::filter( const UnicodeString& s ){ - if ( empty() ) - return s; - else { - UnicodeString result; - for ( int i=0; i < s.length(); ++i ){ - auto it=the_map.find(s[i]); - if ( it != the_map.cend() ) - result += it->second; - else - result += s[i]; - } - return result; - } - } - - bool UnicodeFilter::add( const string& s ){ - UnicodeString line = folia::UTF8ToUnicode(s); - return add( line ); - } - - bool UnicodeFilter::add( const UnicodeString& s ){ - UnicodeString line = s; - line.trim(); - if ((line.length() > 0) && (line[0] != '#')) { - UnicodeString open = ""; - UnicodeString close = ""; - int splitpoint = line.indexOf(" "); - if ( splitpoint == -1 ) - splitpoint = line.indexOf("\t"); - if ( splitpoint == -1 ){ - open = line; - } - else { - open = UnicodeString( line, 0,splitpoint); - close = UnicodeString( line, splitpoint+1); - } - open = open.trim().unescape(); - close = close.trim().unescape(); - if ( open.length() != 1 ){ - throw runtime_error( "invalid filter entry: " - + folia::UnicodeToUTF8(line) ); - } - else { - this->add( open[0], close ); - } - } - return true; - } - - bool UnicodeFilter::fill( const string& s ){ - ifstream f ( s ); - if ( !f ){ - throw std::runtime_error("unable to open file: " + s ); - } - else { - string rawline; - while ( getline(f,rawline) ){ - this->add( rawline ); - } - } - return true; - } - - class uConfigError: public std::invalid_argument { - 
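// [Illustrative sketch, not part of the patch] The UnicodeNormalizer being
// removed above is written against ICU's old UNormalizationMode /
// Normalizer::normalize() interface, which ICU has deprecated in favour of
// Normalizer2. A minimal NFC-only equivalent (the function name is an
// assumption, and whether ticcutils/Unicode.h does it this way is not shown
// in this patch; Normalizer2::getNFCInstance needs ICU >= 49):
#include <stdexcept>
#include "unicode/normalizer2.h"

icu::UnicodeString to_nfc( const icu::UnicodeString& us ){
  UErrorCode status = U_ZERO_ERROR;
  const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance( status );
  if ( U_FAILURE( status ) ){
    throw std::invalid_argument( "no NFC normalizer available" );
  }
  icu::UnicodeString result = nfc->normalize( us, status );
  if ( U_FAILURE( status ) ){
    throw std::invalid_argument( "NFC normalization failed" );
  }
  return result;
}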
public: - uConfigError( const string& s ): invalid_argument( "ucto: config file:" + s ){}; - uConfigError( const UnicodeString& us ): invalid_argument( "ucto: config file:" + folia::UnicodeToUTF8(us) ){}; - }; - - - UnicodeString UnicodeRegexMatcher::Pattern() const{ - return pattern->pattern(); - } - - UnicodeRegexMatcher::UnicodeRegexMatcher( const UnicodeString& pat, - const UnicodeString& name ): - _name(name) - { - failString.clear(); - matcher = NULL; - UErrorCode u_stat = U_ZERO_ERROR; - UParseError errorInfo; - pattern = RegexPattern::compile( pat, 0, errorInfo, u_stat ); - if ( U_FAILURE(u_stat) ){ - string spat = folia::UnicodeToUTF8(pat); - failString = folia::UnicodeToUTF8(_name); - if ( errorInfo.offset >0 ){ - failString += " Invalid regular expression at position " + TiCC::toString( errorInfo.offset ) + "\n"; - UnicodeString pat1 = UnicodeString( pat, 0, errorInfo.offset -1 ); - failString += folia::UnicodeToUTF8(pat1) + " <== HERE\n"; - } - else { - failString += " Invalid regular expression '" + spat + "' "; - } - throw uConfigError(failString); - } - else { - matcher = pattern->matcher( u_stat ); - if (U_FAILURE(u_stat)){ - failString = "unable to create PatterMatcher with pattern '" + - folia::UnicodeToUTF8(pat) + "'"; - throw uConfigError(failString); - } - } - } - - UnicodeRegexMatcher::~UnicodeRegexMatcher(){ - delete pattern; - delete matcher; - } - - //#define MATCH_DEBUG 1 - - bool UnicodeRegexMatcher::match_all( const UnicodeString& line, - UnicodeString& pre, - UnicodeString& post ){ - UErrorCode u_stat = U_ZERO_ERROR; - pre = ""; - post = ""; - results.clear(); - if ( matcher ){ -#ifdef MATCH_DEBUG - cerr << "start matcher [" << line << "], pattern = " << Pattern() << endl; -#endif - matcher->reset( line ); - if ( matcher->find() ){ -#ifdef MATCH_DEBUG - cerr << "matched " << folia::UnicodeToUTF8(line) << endl; - for ( int i=0; i <= matcher->groupCount(); ++i ){ - cerr << "group[" << i << "] =" << matcher->group(i,u_stat) << endl; - } -#endif - if ( matcher->groupCount() == 0 ){ - // case 1: a rule without capture groups matches - UnicodeString us = matcher->group(0,u_stat) ; -#ifdef MATCH_DEBUG - cerr << "case 1, result = " << us << endl; -#endif - results.push_back( us ); - int start = matcher->start( 0, u_stat ); - if ( start > 0 ){ - pre = UnicodeString( line, 0, start ); -#ifdef MATCH_DEBUG - cerr << "found pre " << folia::UnicodeToUTF8(pre) << endl; -#endif - } - int end = matcher->end( 0, u_stat ); - if ( end < line.length() ){ - post = UnicodeString( line, end ); -#ifdef MATCH_DEBUG - cerr << "found post " << folia::UnicodeToUTF8(post) << endl; -#endif - } - return true; - } - else if ( matcher->groupCount() == 1 ){ - // case 2: a rule with one capture group matches - int start = matcher->start( 1, u_stat ); - if ( start >= 0 ){ - UnicodeString us = matcher->group(1,u_stat) ; -#ifdef MATCH_DEBUG - cerr << "case 2a , result = " << us << endl; -#endif - results.push_back( us ); - if ( start > 0 ){ - pre = UnicodeString( line, 0, start ); -#ifdef MATCH_DEBUG - cerr << "found pre " << pre << endl; -#endif - } - int end = matcher->end( 1, u_stat ); - if ( end < line.length() ){ - post = UnicodeString( line, end ); -#ifdef MATCH_DEBUG - cerr << "found post " << post << endl; -#endif - } - } - else { - // group 1 is empty, return group 0 - UnicodeString us = matcher->group(0,u_stat) ; -#ifdef MATCH_DEBUG - cerr << "case 2b , result = " << us << endl; -#endif - results.push_back( us ); - start = matcher->start( 0, u_stat ); - if ( start > 0 ){ - pre = 
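// [Illustrative sketch, not part of the patch] The heart of match_all() in the
// "no capture groups" case above is ICU's reset()/find()/start()/end()/group()
// sequence. A stripped-down standalone version (first_match() is an assumed
// name, not ucto API):
#include "unicode/regex.h"

bool first_match( icu::RegexMatcher& matcher, const icu::UnicodeString& line,
                  icu::UnicodeString& pre, icu::UnicodeString& hit,
                  icu::UnicodeString& post ){
  UErrorCode status = U_ZERO_ERROR;
  matcher.reset( line );
  if ( !matcher.find() ){
    return false;                              // no match anywhere in 'line'
  }
  int32_t b = matcher.start( status );         // offset where the match begins
  int32_t e = matcher.end( status );           // offset just past the match
  pre  = icu::UnicodeString( line, 0, b );     // text before the match
  hit  = matcher.group( status );              // the matched text itself
  post = icu::UnicodeString( line, e );        // text after the match
  return U_SUCCESS( status );
}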
UnicodeString( line, 0, start ); -#ifdef MATCH_DEBUG - cerr << "found pre " << pre << endl; -#endif - } - int end = matcher->end( 0, u_stat ); - if ( end < line.length() ){ - post = UnicodeString( line, end ); -#ifdef MATCH_DEBUG - cerr << "found post " << post << endl; -#endif - } - } - return true; - } - else { - // a rule with more then 1 capture group - // this is quite ugly... - int end = 0; - for ( int i=0; i <= matcher->groupCount(); ++i ){ -#ifdef MATCH_DEBUG - cerr << "group " << i << endl; -#endif - u_stat = U_ZERO_ERROR; - int start = matcher->start( i, u_stat ); -#ifdef MATCH_DEBUG - cerr << "start = " << start << endl; -#endif - if (!U_FAILURE(u_stat)){ - if ( start < 0 ){ - continue; - } - } - else - break; - if ( start > end ){ - pre = UnicodeString( line, end, start ); -#ifdef MATCH_DEBUG - cerr << "found pre " << folia::UnicodeToUTF8(pre) << endl; -#endif - } - end = matcher->end( i, u_stat ); -#ifdef MATCH_DEBUG - cerr << "end = " << end << endl; -#endif - if (!U_FAILURE(u_stat)){ - results.push_back( UnicodeString( line, start, end - start ) ); -#ifdef MATCH_DEBUG - cerr << "added result " << folia::UnicodeToUTF8( results.back() ) << endl; -#endif - } - else - break; - } - if ( end < line.length() ){ - post = UnicodeString( line, end ); -#ifdef MATCH_DEBUG - cerr << "found post " << folia::UnicodeToUTF8(post) << endl; -#endif - } - return true; - } - } - } - results.clear(); - return false; - } - - const UnicodeString UnicodeRegexMatcher::get_match( unsigned int n ) const{ - if ( n < results.size() ) - return results[n]; - else - return ""; - } - - int UnicodeRegexMatcher::NumOfMatches() const { - if ( results.size() > 0 ) - return results.size()-1; - else - return 0; - } - - int UnicodeRegexMatcher::split( const UnicodeString& us, - vector& result ){ - result.clear(); - const int maxWords = 256; - UnicodeString words[maxWords]; - UErrorCode status = U_ZERO_ERROR; - int numWords = matcher->split( us, words, maxWords, status ); - for ( int i = 0; i < numWords; ++i ) - result.push_back( words[i] ); - return numWords; - } - -} // namespace Tokenizer diff -Nru ucto-0.9.6/tests/Makefile.in ucto-0.14/tests/Makefile.in --- ucto-0.9.6/tests/Makefile.in 2017-01-23 12:52:24.000000000 +0000 +++ ucto-0.14/tests/Makefile.in 2018-11-20 09:07:54.000000000 +0000 @@ -1,7 +1,7 @@ -# Makefile.in generated by automake 1.15 from Makefile.am. +# Makefile.in generated by automake 1.15.1 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2014 Free Software Foundation, Inc. +# Copyright (C) 1994-2017 Free Software Foundation, Inc. 
# This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -89,7 +89,9 @@ host_triplet = @host@ subdir = tests ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_old_textcat.m4 \ + $(top_srcdir)/m4/ac_osx_pkg.m4 \ + $(top_srcdir)/m4/ac_search_lm.m4 \ $(top_srcdir)/m4/ax_lib_readline.m4 \ $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ @@ -155,13 +157,7 @@ FGREP = @FGREP@ GREP = @GREP@ ICU_CFLAGS = @ICU_CFLAGS@ -ICU_CONFIG = @ICU_CONFIG@ -ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@ -ICU_CXXFLAGS = @ICU_CXXFLAGS@ -ICU_IOLIBS = @ICU_IOLIBS@ -ICU_LIBPATH = @ICU_LIBPATH@ ICU_LIBS = @ICU_LIBS@ -ICU_VERSION = @ICU_VERSION@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ @@ -179,6 +175,7 @@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ +MODULE_PREFIX = @MODULE_PREFIX@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ @@ -202,6 +199,8 @@ SHELL = @SHELL@ STRIP = @STRIP@ SYSCONFDIR = @SYSCONFDIR@ +TEXTCAT_CFLAGS = @TEXTCAT_CFLAGS@ +TEXTCAT_LIBS = @TEXTCAT_LIBS@ VERSION = @VERSION@ XML2_CFLAGS = @XML2_CFLAGS@ XML2_LIBS = @XML2_LIBS@ @@ -252,6 +251,7 @@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ +runstatedir = @runstatedir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ diff -Nru ucto-0.9.6/ucto.pc.in ucto-0.14/ucto.pc.in --- ucto-0.9.6/ucto.pc.in 2016-02-11 13:04:11.000000000 +0000 +++ ucto-0.14/ucto.pc.in 2018-11-13 16:30:47.000000000 +0000 @@ -6,7 +6,6 @@ Name: ucto Version: @VERSION@ Description: Unicode Tokenizer -Requires.private: ucto-icu >= 3.6 folia >= 0.3 Libs: -L${libdir} -lucto Libs.private: @LIBS@ Cflags: -I${includedir}