diff -Nru lttoolbox-3.6.6/CMakeLists.txt lttoolbox-3.7.1/CMakeLists.txt --- lttoolbox-3.6.6/CMakeLists.txt 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/CMakeLists.txt 2022-11-01 08:36:47.000000000 +0000 @@ -1,13 +1,11 @@ cmake_minimum_required(VERSION 3.0 FATAL_ERROR) cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}) project(lttoolbox - VERSION 3.5.2 + VERSION 3.7.0 LANGUAGES CXX C ) set(VERSION ${PROJECT_VERSION}) -set(VERSION_MAJOR ${PROJECT_VERSION_MAJOR}) -set(VERSION_API "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}") -set(SOVERSION 1.0.0) +set(VERSION_ABI 3) set(PACKAGE_BUGREPORT "apertium-stuff@lists.sourceforge.net") add_definitions("-DPACKAGE_VERSION=\"${PROJECT_VERSION}\"") @@ -68,7 +66,7 @@ endforeach() # Require latest possible C++ standard - foreach(flag "-std=c++20" "-std=c++2a" "-std=c++17" "-std=c++1z" "-std=c++14" "-std=c++1y") + foreach(flag "-std=c++23" "-std=c++2b" "-std=c++20" "-std=c++2a" "-std=c++17") string(REGEX REPLACE "[^a-z0-9]" "-" _flag ${flag}) CHECK_CXX_COMPILER_FLAG(${flag} COMPILER_SUPPORTS_${_flag}) if(COMPILER_SUPPORTS_${_flag}) @@ -78,7 +76,7 @@ endif() endforeach() if(NOT _ENABLED_CXX) - message(FATAL_ERROR "Could not enable at least C++14 (C++1y) - upgrade your compiler") + message(FATAL_ERROR "Could not enable at least C++17 - upgrade your compiler") endif() # Generate pkg-config file @@ -106,6 +104,10 @@ add_definitions(-D_POSIX_C_SOURCE=200112 -D_GNU_SOURCE) endif() +if(NOT APPLE) + find_package(Threads REQUIRED) +endif() + # Unlocked I/O functions include(CheckSymbolExists) set(CMAKE_REQUIRED_DEFINITIONS -D_POSIX_C_SOURCE=200112 -D_GNU_SOURCE) @@ -128,6 +130,9 @@ set(GETOPT_LIB) endif() +# ICU +find_package(ICU COMPONENTS i18n io uc REQUIRED) + include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if(BUILD_TESTING) diff -Nru lttoolbox-3.6.6/configure.ac lttoolbox-3.7.1/configure.ac --- lttoolbox-3.6.6/configure.ac 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/configure.ac 2022-11-01 
08:36:47.000000000 +0000 @@ -1,21 +1,20 @@ AC_PREREQ(2.52) m4_define([PKG_VERSION_MAJOR], [3]) -m4_define([PKG_VERSION_MINOR], [6]) -m4_define([PKG_VERSION_PATCH], [6]) +m4_define([PKG_VERSION_MINOR], [7]) +m4_define([PKG_VERSION_PATCH], [1]) + +# Bump if the ABI (not API) changed in a backwards-incompatible manner +m4_define([PKG_VERSION_ABI], [3]) AC_INIT([lttoolbox], [PKG_VERSION_MAJOR.PKG_VERSION_MINOR.PKG_VERSION_PATCH], [apertium-stuff@lists.sourceforge.net], [lttoolbox], [https://wiki.apertium.org/wiki/Lttoolbox]) VERSION=$PACKAGE_VERSION -VERSION_MAJOR=PKG_VERSION_MAJOR -VERSION_API=PKG_VERSION_MAJOR.PKG_VERSION_MINOR -SOVERSION=1:0:0 +VERSION_ABI=PKG_VERSION_ABI AC_SUBST(PACKAGE_NAME) AC_SUBST(PACKAGE_VERSION) -AC_SUBST(VERSION_MAJOR) -AC_SUBST(VERSION_API) -AC_SUBST(SOVERSION) +AC_SUBST(VERSION_ABI) AM_INIT_AUTOMAKE AC_CONFIG_MACRO_DIR([m4]) @@ -51,6 +50,9 @@ AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) ]) +# Checks for POSIX thread support +AX_PTHREAD([], [AC_MSG_ERROR([Can't find libpthread])]) + # Checks for typedefs, structures, and compiler characteristics. 
AC_HEADER_STDBOOL AC_C_CONST @@ -68,12 +70,12 @@ # Require highest supported C++ standard AC_LANG(C++) -for version in 23 2b 20 2a 17 1z 14 1y; do +for version in 23 2b 20 2a 17; do version_flag="-std=c++${version}" AX_CHECK_COMPILE_FLAG([${version_flag}], [break], [version_flag=none]) done AS_IF([test "$version_flag" == none], [ - AC_MSG_ERROR([Could not enable at least C++1y (C++14) - upgrade your compiler]) + AC_MSG_ERROR([Could not enable at least C++17 - upgrade your compiler]) ]) CXXFLAGS="$CXXFLAGS ${version_flag}" @@ -82,9 +84,8 @@ #include #include ]],[[ -using namespace std; -static_assert(!is_same::value, "size_t == uint32_t"); -static_assert(!is_same::value, "size_t == uint64_t"); +static_assert(!std::is_same::value, "size_t == uint32_t"); +static_assert(!std::is_same::value, "size_t == uint64_t"); ]])], [AC_DEFINE([SIZET_NOT_CSTDINT], [1], [size_t != (uint32_t, uint64_t)])]) AM_PATH_PYTHON([3.4], [], [AC_MSG_WARN([Can't generate SWIG wrapper or run tests without Python])]) diff -Nru lttoolbox-3.6.6/debian/changelog lttoolbox-3.7.1/debian/changelog --- lttoolbox-3.6.6/debian/changelog 2022-06-06 11:14:19.000000000 +0000 +++ lttoolbox-3.7.1/debian/changelog 2022-11-01 08:38:15.000000000 +0000 @@ -1,3 +1,20 @@ +lttoolbox (3.7.1-1) unstable; urgency=medium + + * Update to latest upstream + + -- Tino Didriksen Tue, 01 Nov 2022 09:38:15 +0100 + +lttoolbox (3.7.0-1) experimental; urgency=medium + + * Update to latest upstream: + + Updated package names for soname untangling. 
+ + Binary lsx-comp moved from package apertium-separable + * debian/ + + docs: Removed AUTHORS in favour of installing it via rules -A + + copyright: python/* is GPLv3+ + + -- Tino Didriksen Wed, 26 Oct 2022 13:41:14 +0200 + lttoolbox (3.6.6-1) unstable; urgency=medium [ Tino Didriksen ] diff -Nru lttoolbox-3.6.6/debian/control lttoolbox-3.7.1/debian/control --- lttoolbox-3.6.6/debian/control 2022-06-06 11:14:19.000000000 +0000 +++ lttoolbox-3.7.1/debian/control 2022-11-01 08:38:15.000000000 +0000 @@ -15,7 +15,7 @@ python3-dev, python3-setuptools, swig -Standards-Version: 4.6.0.1 +Standards-Version: 4.6.1 Homepage: https://apertium.org/ Vcs-Git: https://salsa.debian.org/science-team/lttoolbox.git Vcs-Browser: https://salsa.debian.org/science-team/lttoolbox @@ -23,7 +23,7 @@ Package: lttoolbox Architecture: any -Depends: liblttoolbox3-3.6-1 (= ${binary:Version}), +Depends: liblttoolbox3 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends} Description: Apertium lexical processing modules and tools @@ -33,17 +33,15 @@ for making morphological analysers and generators for natural language processing applications. 
-Package: liblttoolbox3-3.6-1 +Package: liblttoolbox3 Section: libs Architecture: any Multi-Arch: same Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends} Provides: liblttoolbox -Conflicts: liblttoolbox, liblttoolbox3 -Replaces: liblttoolbox, - liblttoolbox3, - liblttoolbox3-3.3-0v5 (<= 3.3.2~r61000-3.1) +Conflicts: liblttoolbox +Replaces: liblttoolbox, liblttoolbox3-3.3-0v5 (<= 3.3.2~r61000-3.1) Breaks: liblttoolbox3-3.3-0v5 (<= 3.3.2~r61000-3.1) Description: Shared library for lttoolbox The lttoolbox contains the augmented letter transducer tools for natural @@ -62,7 +60,8 @@ ${misc:Depends}, ${shlibs:Depends} Conflicts: liblttoolbox3-dev -Replaces: liblttoolbox3-dev +Replaces: apertium-separable (<< 0.6.1-1), liblttoolbox3-dev +Breaks: apertium-separable (<< 0.6.1-1) Description: Development tools and library for lttoolbox The lttoolbox contains the augmented letter transducer tools for natural language processing used by Apertium, a platform for building rule-based @@ -76,7 +75,7 @@ Architecture: any Section: python Provides: ${python3:Provides} -Depends: liblttoolbox3-3.6-1 (= ${binary:Version}), +Depends: liblttoolbox3 (= ${binary:Version}), ${misc:Depends}, ${python3:Depends}, ${shlibs:Depends} diff -Nru lttoolbox-3.6.6/debian/copyright lttoolbox-3.7.1/debian/copyright --- lttoolbox-3.6.6/debian/copyright 2022-06-06 11:14:19.000000000 +0000 +++ lttoolbox-3.7.1/debian/copyright 2022-11-01 08:38:15.000000000 +0000 @@ -26,3 +26,24 @@ . On Debian systems, the complete text of the GNU General Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". + +Files: python/* +Copyright: 2019-2022, Apertium Project Management Committee +License: GPL-3+ + +License: GPL-3+ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + . 
+ This package is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + . + You should have received a copy of the GNU General Public License + along with this program. If not, see . + . + On Debian systems, the complete text of the GNU General + Public License version 3 can be found in "/usr/share/common-licenses/GPL-3". diff -Nru lttoolbox-3.6.6/debian/docs lttoolbox-3.7.1/debian/docs --- lttoolbox-3.6.6/debian/docs 2022-06-06 11:14:19.000000000 +0000 +++ lttoolbox-3.7.1/debian/docs 2022-11-01 08:38:15.000000000 +0000 @@ -1,3 +1,2 @@ -AUTHORS NEWS README diff -Nru lttoolbox-3.6.6/debian/liblttoolbox3-3.6-1.install lttoolbox-3.7.1/debian/liblttoolbox3-3.6-1.install --- lttoolbox-3.6.6/debian/liblttoolbox3-3.6-1.install 2022-06-06 11:14:19.000000000 +0000 +++ lttoolbox-3.7.1/debian/liblttoolbox3-3.6-1.install 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -debian/tmp/usr/lib/*/liblttoolbox3-3.6.so.* diff -Nru lttoolbox-3.6.6/debian/liblttoolbox3-3.6-1.lintian-overrides lttoolbox-3.7.1/debian/liblttoolbox3-3.6-1.lintian-overrides --- lttoolbox-3.6.6/debian/liblttoolbox3-3.6-1.lintian-overrides 2022-06-06 11:14:19.000000000 +0000 +++ lttoolbox-3.7.1/debian/liblttoolbox3-3.6-1.lintian-overrides 1970-01-01 00:00:00.000000000 +0000 @@ -1,2 +0,0 @@ -# Symbols file only gets in the way for C++ -no-symbols-control-file diff -Nru lttoolbox-3.6.6/debian/liblttoolbox3.install lttoolbox-3.7.1/debian/liblttoolbox3.install --- lttoolbox-3.6.6/debian/liblttoolbox3.install 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/debian/liblttoolbox3.install 2022-11-01 08:38:15.000000000 +0000 @@ -0,0 +1 @@ +debian/tmp/usr/lib/*/*.so.* diff -Nru lttoolbox-3.6.6/debian/liblttoolbox3.lintian-overrides lttoolbox-3.7.1/debian/liblttoolbox3.lintian-overrides --- lttoolbox-3.6.6/debian/liblttoolbox3.lintian-overrides 
1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/debian/liblttoolbox3.lintian-overrides 2022-11-01 08:38:15.000000000 +0000 @@ -0,0 +1,2 @@ +# Symbols file only gets in the way for C++ +no-symbols-control-file diff -Nru lttoolbox-3.6.6/debian/lttoolbox-dev.install lttoolbox-3.7.1/debian/lttoolbox-dev.install --- lttoolbox-3.6.6/debian/lttoolbox-dev.install 2022-06-06 11:14:19.000000000 +0000 +++ lttoolbox-3.7.1/debian/lttoolbox-dev.install 2022-11-01 08:38:15.000000000 +0000 @@ -1,14 +1,23 @@ +debian/tmp/usr/bin/lsx-comp debian/tmp/usr/bin/lt-append +debian/tmp/usr/bin/lt-apply-acx debian/tmp/usr/bin/lt-comp +debian/tmp/usr/bin/lt-compose debian/tmp/usr/bin/lt-expand +debian/tmp/usr/bin/lt-invert +debian/tmp/usr/bin/lt-paradigm debian/tmp/usr/bin/lt-print +debian/tmp/usr/bin/lt-restrict debian/tmp/usr/bin/lt-trim debian/tmp/usr/include debian/tmp/usr/lib/*/*.so debian/tmp/usr/lib/*/pkgconfig debian/tmp/usr/share/lttoolbox +debian/tmp/usr/share/man/man1/lsx-comp.* debian/tmp/usr/share/man/man1/lt-append.* debian/tmp/usr/share/man/man1/lt-comp.* +debian/tmp/usr/share/man/man1/lt-compose.* debian/tmp/usr/share/man/man1/lt-expand.* +debian/tmp/usr/share/man/man1/lt-paradigm.* debian/tmp/usr/share/man/man1/lt-print.* debian/tmp/usr/share/man/man1/lt-trim.* diff -Nru lttoolbox-3.6.6/debian/rules lttoolbox-3.7.1/debian/rules --- lttoolbox-3.6.6/debian/rules 2022-06-06 11:14:19.000000000 +0000 +++ lttoolbox-3.7.1/debian/rules 2022-11-01 08:38:15.000000000 +0000 @@ -5,7 +5,7 @@ export DH_OPTIONS export LC_ALL=C.UTF-8 -export DEB_BUILD_MAINT_OPTIONS = hardening=+all +export "DEB_BUILD_MAINT_OPTIONS=hardening=+all optimize=+lto" DPKG_EXPORT_BUILDFLAGS = 1 include /usr/share/dpkg/buildflags.mk @@ -21,5 +21,6 @@ find $(CURDIR) -type f -name '*.pyo' -exec rm -f '{}' \; find $(CURDIR) -type f -name '*.la' -exec rm -f '{}' \; -override_dh_missing: - dh_missing --fail-missing +override_dh_installdocs: + dh_installdocs + dh_installdocs -A AUTHORS diff -Nru 
lttoolbox-3.6.6/debian/watch lttoolbox-3.7.1/debian/watch --- lttoolbox-3.6.6/debian/watch 2022-06-06 11:14:19.000000000 +0000 +++ lttoolbox-3.7.1/debian/watch 2022-11-01 08:38:15.000000000 +0000 @@ -1,3 +1,4 @@ version=4 -https://github.com/apertium/lttoolbox/releases \ - .*/@PACKAGE@-(\d[\d.]*)\.tar\.(?:xz|bz2) debian uupdate +opts="searchmode=plain" \ + https://api.github.com/repos/apertium/@PACKAGE@/releases \ + https://github.com/apertium/@PACKAGE@/releases/download/v(?:\d[\d.]*)/@PACKAGE@@ANY_VERSION@@ARCHIVE_EXT@ diff -Nru lttoolbox-3.6.6/lttoolbox/acx.cc lttoolbox-3.7.1/lttoolbox/acx.cc --- lttoolbox-3.6.6/lttoolbox/acx.cc 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/acx.cc 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include + +const xmlChar* CHAR_NODE = (const xmlChar*)"char"; +const xmlChar* EQUIV_NODE = (const xmlChar*)"equiv-char"; +const char* VALUE_ATTR = "value"; + +int32_t get_val(xmlNode* node) +{ + UString s = getattr(node, VALUE_ATTR); + if (s.empty()) { + error_and_die(node, "Missing value attribute."); + } + std::vector v; + ustring_to_vec32(s, v); + if (v.size() > 1) { + error_and_die(node, "Expected a single character in value attribute, but found %d.", v.size()); + } + return v[0]; +} + +std::map> readACX(const char* file) +{ + std::map> acx; + xmlNode* top_node = load_xml(file); + for (auto char_node : children(top_node)) { + if (!xmlStrEqual(char_node->name, CHAR_NODE)) { + error_and_die(char_node, "Expected but found <%s>.", + (const char*)char_node->name); + } + int32_t key = get_val(char_node); + sorted_vector vec; + for (auto equiv_node : children(char_node)) { + if (!xmlStrEqual(equiv_node->name, EQUIV_NODE)) { + error_and_die(char_node, "Expected but found <%s>.", + (const char*)equiv_node->name); + } + vec.insert(get_val(equiv_node)); + } + if (!vec.empty()) { + acx.insert({key, vec}); + } + } + return acx; +} diff -Nru lttoolbox-3.6.6/lttoolbox/acx.h lttoolbox-3.7.1/lttoolbox/acx.h --- lttoolbox-3.6.6/lttoolbox/acx.h 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/acx.h 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef _ACXPARSEUTIL_ +#define _ACXPARSEUTIL_ + +#include +#include + +std::map> readACX(const char* file); + +#endif diff -Nru lttoolbox-3.6.6/lttoolbox/acx.rng lttoolbox-3.7.1/lttoolbox/acx.rng --- lttoolbox-3.6.6/lttoolbox/acx.rng 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/acx.rng 2022-11-01 08:36:47.000000000 +0000 @@ -3,14 +3,14 @@ - + 1 - + 1 diff -Nru lttoolbox-3.6.6/lttoolbox/alphabet.cc lttoolbox-3.7.1/lttoolbox/alphabet.cc --- lttoolbox-3.6.6/lttoolbox/alphabet.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/alphabet.cc 2022-11-01 08:36:47.000000000 +0000 @@ -26,13 +26,12 @@ #include -using namespace std; using namespace icu; Alphabet::Alphabet() { - spair[pair(0,0)] = 0; - spairinv.push_back(pair(0,0)); + spair[std::pair(0,0)] = 0; + spairinv.push_back(std::pair(0,0)); } Alphabet::~Alphabet() @@ -71,20 +70,21 @@ } void -Alphabet::includeSymbol(UString const &s) +Alphabet::includeSymbol(UStringView s) { if(slexic.find(s) == slexic.end()) { int32_t slexic_size = slexic.size(); - slexic[s] = -(slexic_size+1); - slexicinv.push_back(s); + UString st{s}; + slexic[st] = -(slexic_size+1); + slexicinv.push_back(st); } } int32_t Alphabet::operator()(int32_t const c1, int32_t const c2) { - auto tmp = make_pair(c1, c2); + auto tmp = std::make_pair(c1, c2); if(spair.find(tmp) == spair.end()) { int32_t spair_size = spair.size(); @@ -96,13 +96,18 @@ } int32_t -Alphabet::operator()(UString const &s) +Alphabet::operator()(UStringView s) { - return slexic[s]; + // While the documentation says this assumes existence, there are clearly code paths that call it with an unknown symbol and thus get 0 back AND create an entry for that 0. Changing it to just return 0 still passes all tests. 
+ auto it = slexic.find(s); + if (it == slexic.end()) { + return 0; + } + return it->second; } int32_t -Alphabet::operator()(UString const &s) const +Alphabet::operator()(UStringView s) const { auto it = slexic.find(s); if (it == slexic.end()) { @@ -112,7 +117,7 @@ } bool -Alphabet::isSymbolDefined(UString const &s) +Alphabet::isSymbolDefined(UStringView s) const { return slexic.find(s) != slexic.end(); } @@ -124,7 +129,7 @@ } void -Alphabet::write(FILE *output) +Alphabet::write(FILE *output) const { // First, we write the taglist Compression::multibyte_write(slexicinv.size(), output); // taglist size @@ -153,7 +158,7 @@ // Reading of taglist int32_t tam = Compression::multibyte_read(input); - map tmp; + std::map tmp; while(tam > 0) { tam--; @@ -172,7 +177,7 @@ tam--; int32_t first = Compression::multibyte_read(input); int32_t second = Compression::multibyte_read(input); - pair tmp(first - bias, second - bias); + std::pair tmp(first - bias, second - bias); int32_t spair_size = a_new.spair.size(); a_new.spair[tmp] = spair_size; a_new.spairinv.push_back(tmp); @@ -184,8 +189,8 @@ void Alphabet::serialise(std::ostream &serialised) const { - Serialiser >::serialise(slexicinv, serialised); - Serialiser > >::serialise(spairinv, serialised); + Serialiser >::serialise(slexicinv, serialised); + Serialiser > >::serialise(spairinv, serialised); } void @@ -195,11 +200,11 @@ slexic.clear(); spairinv.clear(); spair.clear(); - slexicinv = Deserialiser >::deserialise(serialised); + slexicinv = Deserialiser >::deserialise(serialised); for (size_t i = 0; i < slexicinv.size(); i++) { slexic[slexicinv[i]] = -i - 1; // ToDo: This does not turn the result negative due to unsigned semantics } - spairinv = Deserialiser > >::deserialise(serialised); + spairinv = Deserialiser > >::deserialise(serialised); for (size_t i = 0; i < slexicinv.size(); i++) { spair[spairinv[i]] = i; } @@ -220,7 +225,7 @@ } void -Alphabet::getSymbol(UString &result, int32_t const symbol, bool uppercase) const 
+Alphabet::getSymbol(UString &result, int32_t symbol, bool uppercase) const { if (symbol == 0) { return; @@ -234,20 +239,20 @@ } bool -Alphabet::isTag(int32_t const symbol) const +Alphabet::isTag(int32_t symbol) const { return symbol < 0; } -pair const & -Alphabet::decode(int32_t const code) const +std::pair const & +Alphabet::decode(int32_t code) const { return spairinv[code]; } -set +std::set Alphabet::symbolsWhereLeftIs(UChar32 l) const { - set eps; + std::set eps; for(const auto& sp: spair) { // [(l, r) : tag] if(sp.first.first == l) { eps.insert(sp.second); @@ -256,17 +261,17 @@ return eps; } -void Alphabet::setSymbol(int32_t symbol, UString newSymbolString) { +void Alphabet::setSymbol(int32_t symbol, UStringView newSymbolString) { //Should be a special character! if (symbol < 0) slexicinv[-symbol-1] = newSymbolString; } void -Alphabet::createLoopbackSymbols(set &symbols, Alphabet &basis, Side s, bool nonTagsToo) +Alphabet::createLoopbackSymbols(std::set &symbols, const Alphabet &basis, Side s, bool nonTagsToo) { // Non-tag letters get the same int32_t in spairinv across alphabets, // but tags may differ, so do those separately afterwards. 
- set tags; + std::set tags; for(auto& it : basis.spairinv) { if(s == left) { @@ -301,3 +306,49 @@ } } } + +std::vector +Alphabet::tokenize(UStringView str) const +{ + std::vector ret; + size_t end = str.size(); + size_t i = 0; + UChar32 c; + while (i < end) { + U16_NEXT(str.data(), i, end, c); + if (c == '\\') { + } else if (c == '<') { + size_t j = i; + while (c != '>' && j < end) { + U16_NEXT(str.data(), j, end, c); + } + if (c == '>') { + ret.push_back(operator()(str.substr(i-1, j-i+1))); + i = j; + } + } else { + ret.push_back(static_cast(c)); + } + } + return ret; +} + +bool +Alphabet::sameSymbol(int32_t tsym, const Alphabet& other, int32_t osym, + bool allow_anys) const +{ + // if it's a letter, then it's equal across alphabets + if (tsym >= 0 && tsym == osym) return true; + if (tsym < 0 && osym < 0 && + this->slexicinv[-tsym-1] == other.slexicinv[-osym-1]) { + return true; + } + if (allow_anys && + ((tsym < 0 && this->slexicinv[-tsym-1] == u""_uv && osym > 0) || + (tsym < 0 && this->slexicinv[-tsym-1] == u""_uv && osym < 0) || + (osym < 0 && other.slexicinv[-osym-1] == u""_uv && tsym > 0) || + (osym < 0 && other.slexicinv[-osym-1] == u""_uv && tsym < 0))) { + return true; + } + return false; +} diff -Nru lttoolbox-3.6.6/lttoolbox/alphabet.h lttoolbox-3.7.1/lttoolbox/alphabet.h --- lttoolbox-3.6.6/lttoolbox/alphabet.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/alphabet.h 2022-11-01 08:36:47.000000000 +0000 @@ -25,7 +25,6 @@ #include #include -using namespace std; using namespace icu; /** @@ -39,13 +38,13 @@ * Symbol-identifier relationship. Only contains . * @see slexicinv */ - map slexic; + std::map> slexic; /** * Identifier-symbol relationship. Only contains . * @see slexic */ - vector slexicinv; + std::vector slexicinv; /** @@ -53,13 +52,13 @@ * other characters are UChar32's casted to ints. * @see spairinv */ - map, int32_t> spair; + std::map, int32_t> spair; /** * All symbol-pairs (both and letters). 
* @see spair */ - vector > spairinv; + std::vector > spairinv; void copy(Alphabet const &a); @@ -90,7 +89,7 @@ /** * Include a symbol into the alphabet. */ - void includeSymbol(UString const &s); + void includeSymbol(UStringView s); /** * Get an unique code for every symbol pair. This flavour is for @@ -100,7 +99,7 @@ * @return code for (c1, c2). */ int32_t operator()(int32_t const c1, int32_t const c2); - int32_t operator()(UString const &s) const; + int32_t operator()(UStringView s) const; /** * Gets the individual symbol identifier. Assumes it already exists! @@ -108,14 +107,14 @@ * @param s symbol to be identified. * @return symbol identifier. */ - int32_t operator()(UString const &s); + int32_t operator()(UStringView s); /** * Check wether the symbol is defined in the alphabet. * @param s symbol * @return true if defined */ - bool isSymbolDefined(UString const &s); + bool isSymbolDefined(UStringView s) const; /** * Returns the size of the alphabet (number of symbols). @@ -127,7 +126,7 @@ * Write method. * @param output output stream. */ - void write(FILE *output); + void write(FILE *output) const; /** * Read method. @@ -143,7 +142,7 @@ * @param symbol symbol code. * @param output output stream. */ - void writeSymbol(int32_t const symbol, UFILE *output) const; + void writeSymbol(int32_t symbol, UFILE *output) const; /** * Concat a symbol in the string that is passed by reference. @@ -151,7 +150,7 @@ * @param symbol code of the symbol * @param uppercase true if we want an uppercase symbol */ - void getSymbol(UString &result, int32_t const symbol, + void getSymbol(UString &result, int32_t symbol, bool uppercase = false) const; /** @@ -159,14 +158,14 @@ * @param symbol the code of the symbol * @return true if the symbol is a tag */ - bool isTag(int32_t const symbol) const; + bool isTag(int32_t symbol) const; /** * Sets an already existing symbol to represent a new value. 
* @param symbol the code of the symbol to set * @param newSymbolString the new string for this symbol */ - void setSymbol(int32_t symbol, UString newSymbolString); + void setSymbol(int32_t symbol, UStringView newSymbolString); /** * Note: both the symbol int and int-pair are specific to this alphabet instance. @@ -174,12 +173,12 @@ * @param code a symbol * @return the pair which code represents in this alphabet */ - pair const & decode(int32_t const code) const; + std::pair const & decode(int32_t code) const; /** * Get all symbols where the left-hand side of the symbol-pair is l. */ - set symbolsWhereLeftIs(UChar32 l) const; + std::set symbolsWhereLeftIs(UChar32 l) const; enum Side { @@ -196,7 +195,12 @@ * @param s whether to loopback on the left or right side of the symbol-pair * @param nonTagsToo by default only tags are included, but if this is true we include all symbols */ - void createLoopbackSymbols(set &symbols, Alphabet &basis, Side s = right, bool nonTagsToo = false); + void createLoopbackSymbols(std::set &symbols, const Alphabet &basis, Side s = right, bool nonTagsToo = false); + + std::vector tokenize(UStringView str) const; + + bool sameSymbol(int32_t tsym, const Alphabet& other, int32_t osym, + bool allow_anys=false) const; }; #endif diff -Nru lttoolbox-3.6.6/lttoolbox/att_compiler.cc lttoolbox-3.7.1/lttoolbox/att_compiler.cc --- lttoolbox-3.6.6/lttoolbox/att_compiler.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/att_compiler.cc 2022-11-01 08:36:47.000000000 +0000 @@ -28,7 +28,6 @@ #include #include -using namespace std; using namespace icu; AttCompiler::AttCompiler() @@ -99,7 +98,7 @@ } void -AttCompiler::symbol_code(const UString& symbol, vector& split) +AttCompiler::symbol_code(UStringView symbol, std::vector& split) { if (symbol.empty()) { split.push_back(0); @@ -111,7 +110,7 @@ size_t end = symbol.size(); UChar32 c; while (i < end) { - U16_NEXT(symbol.c_str(), i, end, c); + U16_NEXT(symbol.data(), i, end, c); 
update_alphabet(c); split.push_back(c); } @@ -120,11 +119,11 @@ void AttCompiler::add_transition(int from, int to, - const UString& upper, const UString& lower, + UStringView upper, UStringView lower, double weight) { AttNode* src = get_node(from); - vector lsplit, rsplit; + std::vector lsplit, rsplit; symbol_code(upper, lsplit); symbol_code(lower, rsplit); for (size_t i = 0; i < lsplit.size() || i < rsplit.size(); i++) { @@ -143,15 +142,15 @@ } void -AttCompiler::parse(string const &file_name, bool read_rl) +AttCompiler::parse(std::string const &file_name, bool read_rl) { clear(); UFILE* infile = u_fopen(file_name.c_str(), "r", NULL, NULL); if (infile == NULL) { - cerr << "Error: unable to open '" << file_name << "' for reading." << endl; + std::cerr << "Error: unable to open '" << file_name << "' for reading." << std::endl; } - vector tokens; + std::vector tokens; bool first_line_in_fst = true; // First line -- see below bool multiple_transducers = false; int state_id_offset = 1; @@ -186,7 +185,7 @@ if (first_line_in_fst && tokens.size() == 1) { - cerr << "Error: invalid format in file '" << file_name << "' on line " << line_number << "." << endl; + std::cerr << "Error: invalid format in file '" << file_name << "' on line " << line_number << "." << std::endl; exit(EXIT_FAILURE); } @@ -194,7 +193,7 @@ { if (state_id_offset == 1) { // this is the first split we've seen - cerr << "Warning: Multiple fsts in '" << file_name << "' will be disjuncted." << endl; + std::cerr << "Warning: Multiple fsts in '" << file_name << "' will be disjuncted." << std::endl; multiple_transducers = true; } // Update the offset for the new FST @@ -204,7 +203,7 @@ } from = StringUtils::stoi(tokens[0]) + state_id_offset; - largest_seen_state_id = max(largest_seen_state_id, from); + largest_seen_state_id = std::max(largest_seen_state_id, from); get_node(from); /* First line: the initial state is of both types. 
*/ @@ -229,12 +228,12 @@ { weight = default_weight; } - finals.insert(pair (from, weight)); + finals.insert(std::pair (from, weight)); } else { to = StringUtils::stoi(tokens[1]) + state_id_offset; - largest_seen_state_id = max(largest_seen_state_id, to); + largest_seen_state_id = std::max(largest_seen_state_id, to); if(read_rl) { upper = tokens[3]; @@ -269,7 +268,7 @@ /* Classify the nodes of the graph. */ if (splitting) { classify_forwards(); - set path; + std::set path; classify_backwards(starting_state, path); } @@ -282,8 +281,8 @@ { Transducer transducer; /* Correlation between the graph's state ids and those in the transducer. */ - map corr; - set visited; + std::map corr; + std::set visited; corr[starting_state] = transducer.getInitial(); _extract_transducer(type, starting_state, transducer, corr, visited); @@ -302,14 +301,14 @@ /* if(noFinals) { - cerr << "No final states (" << type << ")" << endl; - cerr << " were:" << endl; - cerr << "\t" ; + std::cerr << "No final states (" << type << ")" << std::endl; + std::cerr << " were:" << std::endl; + std::cerr << "\t" ; for (auto& f : finals) { - cerr << f.first << " "; + std::cerr << f.first << " "; } - cerr << endl; + std::cerr << std::endl; } */ return transducer; @@ -321,8 +320,8 @@ */ void AttCompiler::_extract_transducer(TransducerType type, int from, - Transducer& transducer, map& corr, - set& visited) + Transducer& transducer, std::map& corr, + std::set& visited) { if (visited.find(from) != visited.end()) { @@ -391,8 +390,8 @@ void AttCompiler::classify_forwards() { - stack todo; - set done; + std::stack todo; + std::set done; todo.push(starting_state); while(!todo.empty()) { int next = todo.top(); @@ -419,10 +418,10 @@ * @param path the path we took to get here */ TransducerType -AttCompiler::classify_backwards(int state, set& path) +AttCompiler::classify_backwards(int state, std::set& path) { if(finals.find(state) != finals.end()) { - cerr << "ERROR: Transducer contains epsilon transition to a final 
state. Aborting." << endl; + std::cerr << "ERROR: Transducer contains epsilon transition to a final state. Aborting." << std::endl; exit(EXIT_FAILURE); } AttNode* node = get_node(state); @@ -431,7 +430,7 @@ if(t1.type != UNDECIDED) { type |= t1.type; } else if(path.find(t1.to) != path.end()) { - cerr << "ERROR: Transducer contains initial epsilon loop. Aborting." << endl; + std::cerr << "ERROR: Transducer contains initial epsilon loop. Aborting." << std::endl; exit(EXIT_FAILURE); } else { path.insert(t1.to); @@ -450,7 +449,7 @@ void AttCompiler::write(FILE *output) { - map temp; + std::map temp; if (splitting) { temp["main@standard"_u] = extract_transducer(WORD); Transducer punct_fst = extract_transducer(PUNCT); diff -Nru lttoolbox-3.6.6/lttoolbox/att_compiler.h lttoolbox-3.7.1/lttoolbox/att_compiler.h --- lttoolbox-3.6.6/lttoolbox/att_compiler.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/att_compiler.h 2022-11-01 08:36:47.000000000 +0000 @@ -35,7 +35,6 @@ #define PUNCT 2 #define BOTH 3 -using namespace std; using namespace icu; /** Bitmask; 1 = WORD, 2 = PUNCT, 3 = BOTH. */ @@ -70,15 +69,15 @@ * extract_transducer(). */ void _extract_transducer(TransducerType type, int from, - Transducer& transducer, map& corr, - set& visited) ; + Transducer& transducer, std::map& corr, + std::set& visited) ; /** * Reads the AT&T format file @p file_name. The transducer and the alphabet * are both cleared before reading the new file. * If read_rl = true then the second tape is used as the input */ - void parse(string const &file_name, bool read_rl); + void parse(std::string const &file_name, bool read_rl); /** Writes the transducer to @p file_name in lt binary format. */ @@ -93,7 +92,7 @@ bool splitting = true; /** The final state(s). */ - map finals; + std::map finals; /** * Id of the starting state. We assume it is the source state of the first * transduction in the file. @@ -109,7 +108,7 @@ Alphabet alphabet; /** All non-multicharacter symbols. 
*/ - set letters; + std::set letters; /** Used in AttNode. */ struct Transduction @@ -130,13 +129,13 @@ struct AttNode { int id; - vector transductions; + std::vector transductions; AttNode(int id) : id(id) {} }; /** Stores the transducer graph. */ - map graph; + std::map graph; /** Clears the data associated with the current transducer. */ void clear(); @@ -175,7 +174,7 @@ void classify_single_transition(Transduction& t); void classify_forwards(); - TransducerType classify_backwards(int state, set& path); + TransducerType classify_backwards(int state, std::set& path); /** * Converts symbols like @0@ to epsilon, @_SPACE_@ to space, etc. @@ -187,9 +186,9 @@ // if a character should be in the alphabet, add it void update_alphabet(UChar32 c); // convert a string to a symbol code, splitting non-tag multichars - void symbol_code(const UString& symbol, vector& split); + void symbol_code(UStringView symbol, std::vector& split); void add_transition(int from, int to, - const UString& upper, const UString& lower, + UStringView upper, UStringView lower, double weight); }; diff -Nru lttoolbox-3.6.6/lttoolbox/buffer.h lttoolbox-3.7.1/lttoolbox/buffer.h --- lttoolbox-3.6.6/lttoolbox/buffer.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/buffer.h 2022-11-01 08:36:47.000000000 +0000 @@ -21,8 +21,6 @@ #include #include -using namespace std; - /** * Generic circular buffer class */ @@ -75,7 +73,7 @@ { if(buf_size == 0) { - cerr << "Error: Cannot create empty buffer." << endl; + std::cerr << "Error: Cannot create empty buffer." 
<< std::endl; exit(EXIT_FAILURE); } buf = new T[buf_size]; @@ -228,7 +226,7 @@ */ void setPos(unsigned int const newpos) { - currentpos = newpos; + currentpos = newpos % size; } /** diff -Nru lttoolbox-3.6.6/lttoolbox/cli.cc lttoolbox-3.7.1/lttoolbox/cli.cc --- lttoolbox-3.6.6/lttoolbox/cli.cc 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/cli.cc 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,193 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include + +#include +#include +#include +#include +#include + +CLI::CLI(std::string desc, std::string ver) +{ + description = desc; + version = ver; +} + +CLI::CLI(std::string desc) +{ + description = desc; +} + +CLI::~CLI() +{ +} + +void CLI::add_str_arg(char short_flag, std::string long_flag, + std::string desc, std::string arg) +{ + options.push_back({.short_opt=short_flag, .long_opt=long_flag, + .desc=desc, .is_bool=false, .var=arg}); +} + +void CLI::add_bool_arg(char short_flag, std::string long_flag, + std::string desc) +{ + options.push_back({.short_opt=short_flag, .long_opt=long_flag, + .desc=desc, .is_bool=true, .var=""}); +} + +void CLI::add_file_arg(std::string name, bool optional) +{ + file_args.push_back(std::make_pair(name, optional)); + if (!optional) min_file_args++; +} + +void CLI::set_epilog(std::string e) +{ + epilog = e; +} + +void CLI::print_usage() +{ + if (!prog_name.empty()) { + std::cout << prog_name; + if (!version.empty()) { + std::cout << " v" << version; + } + std::cout << ": " << description << std::endl; + std::cout << "USAGE: " << prog_name; + std::string bargs; + std::string sargs; + for (auto& it : options) { + if (it.is_bool) { + bargs += it.short_opt; + } else { + sargs += " [-"; + sargs += it.short_opt; + sargs += ' '; + sargs += it.var; + sargs += ']'; + } + } + if (!bargs.empty()) { + std::cout << " [-" << bargs << "]"; + } + std::cout << sargs; + int depth = 0; + for (auto& it : file_args) { + std::cout << ' '; + if (it.second) { + std::cout << '['; + depth += 1; + } + std::cout << it.first; + } + while (depth-- > 0) std::cout << "]"; + std::cout << std::endl; + for (auto& it : options) { + std::cout << " -" << it.short_opt; +#if HAVE_GETOPT_LONG + std::cout << ", --" << it.long_opt << ':'; + for (size_t i = it.long_opt.size(); i < 20; i++) { + std::cout << ' '; + } +#else + std::cout << ": "; +#endif + std::cout << it.desc << std::endl; + } + if (!epilog.empty()) { + std::cout << epilog << std::endl; + } + } + 
exit(EXIT_FAILURE); +} + +void CLI::parse_args(int argc, char* argv[]) +{ + prog_name = basename(argv[0]); + std::string arg_str; +#if HAVE_GETOPT_LONG + struct option long_options[options.size()]; + int option_index = 0; +#endif + for (size_t i = 0; i < options.size(); i++) { + arg_str += options[i].short_opt; + if (!options[i].is_bool) arg_str += ':'; +#if HAVE_GETOPT_LONG + long_options[i].name = options[i].long_opt.c_str(); + long_options[i].has_arg = (options[i].is_bool ? no_argument : required_argument); + long_options[i].flag = 0; + long_options[i].val = options[i].short_opt; +#endif + } + + while (true) { +#if HAVE_GETOPT_LONG + int cnt = getopt_long(argc, argv, arg_str.c_str(), long_options, &option_index); +#else + int cnt = getopt(argc, argv, arg_str.c_str()); +#endif + if (cnt == -1) break; + + bool found = false; + for (auto& it : options) { + if (it.short_opt == cnt) { + found = true; + if (it.short_opt == 'v' && it.long_opt == "version") { + std::cout << prog_name << " version " << version << std::endl; + exit(EXIT_SUCCESS); + } + if (it.is_bool) { + bools[it.long_opt] = true; + } else { + strs[it.long_opt].push_back(optarg); + } + break; + } + } + if (!found || cnt == 'h') { + print_usage(); + } + } + while (optind < argc) { + files.push_back(argv[optind++]); + } + if (files.size() < min_file_args || files.size() > file_args.size()) { + print_usage(); + } + while (files.size() < file_args.size()) { + files.push_back(""); + } +} + +std::map>& CLI::get_strs() +{ + return strs; +} + +std::map& CLI::get_bools() +{ + return bools; +} + +std::vector& CLI::get_files() +{ + return files; +} diff -Nru lttoolbox-3.6.6/lttoolbox/cli.h lttoolbox-3.7.1/lttoolbox/cli.h --- lttoolbox-3.6.6/lttoolbox/cli.h 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/cli.h 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of 
the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include + +class CLI { +private: + struct CLIOption { + char short_opt; + std::string long_opt; + std::string desc; + bool is_bool; + std::string var; + }; + + std::string description; + std::string version; + std::string epilog; + + std::vector options; + std::vector> file_args; + size_t min_file_args = 0; + + std::map> strs; + std::map bools; + std::vector files; + + std::string prog_name; + +public: + CLI(std::string desc, std::string version); + CLI(std::string desc); + ~CLI(); + void add_str_arg(char short_flag, std::string long_flag, std::string desc, + std::string arg); + void add_bool_arg(char short_flag, std::string long_flag, std::string desc); + void add_file_arg(std::string name, bool optional = true); + void set_epilog(std::string e); + void print_usage(); + void parse_args(int argc, char* argv[]); + std::map>& get_strs(); + std::map& get_bools(); + std::vector& get_files(); +}; diff -Nru lttoolbox-3.6.6/lttoolbox/CMakeLists.txt lttoolbox-3.7.1/lttoolbox/CMakeLists.txt --- lttoolbox-3.6.6/lttoolbox/CMakeLists.txt 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/CMakeLists.txt 2022-11-01 08:36:47.000000000 +0000 @@ -1,16 +1,19 @@ set(LIBLTTOOLBOX_HEADERS + acx.h alphabet.h att_compiler.h buffer.h + cli.h compiler.h compression.h deserialiser.h entry_token.h exception.h expander.h + file_utils.h fst_processor.h + input_file.h lt_locale.h - ltstr.h match_exe.h match_node.h match_state.h @@ 
-20,21 +23,28 @@ regexp_compiler.h serialiser.h sorted_vector.h + sorted_vector.hpp state.h - string_to_wostream.h + string_utils.h tmx_compiler.h - trans_exe.h transducer.h + trans_exe.h + ustring.h xml_parse_util.h + xml_walk_util.h ) set(LIBLTTOOLBOX_SOURCES + acx.cc alphabet.cc att_compiler.cc + cli.cc compiler.cc compression.cc entry_token.cc expander.cc + file_utils.cc fst_processor.cc + input_file.cc lt_locale.cc match_exe.cc match_node.cc @@ -44,10 +54,13 @@ regexp_compiler.cc sorted_vector.cc state.cc + string_utils.cc tmx_compiler.cc - trans_exe.cc transducer.cc + trans_exe.cc + ustring.cc xml_parse_util.cc + xml_walk_util.cc ${LIBLTTOOLBOX_HEADERS} ) if(WIN32) @@ -70,42 +83,41 @@ set(GETOPT) endif() -set(LibLttoolbox "lttoolbox${PROJECT_VERSION_MAJOR}-${VERSION_API}") -add_library(${LibLttoolbox} ${LIBLTTOOLBOX_SOURCES}) -target_compile_definitions(${LibLttoolbox} PRIVATE LTTOOLBOX_EXPORTS) -set_target_properties(${LibLttoolbox} PROPERTIES SOVERSION ${SOVERSION} VERSION ${VERSION}) -target_link_libraries(${LibLttoolbox} ${LIBXML2_LIBRARIES}) +add_library(lttoolbox ${LIBLTTOOLBOX_SOURCES}) +target_compile_definitions(lttoolbox PRIVATE LTTOOLBOX_EXPORTS) +set_target_properties(lttoolbox PROPERTIES SOVERSION ${VERSION_ABI}) +target_link_libraries(lttoolbox ${LIBXML2_LIBRARIES} ${ICU_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) add_executable(lt-print lt_print.cc) -target_link_libraries(lt-print ${LibLttoolbox} ${GETOPT_LIB}) +target_link_libraries(lt-print lttoolbox ${GETOPT_LIB}) add_executable(lt-trim lt_trim.cc) -target_link_libraries(lt-trim ${LibLttoolbox} ${GETOPT_LIB}) +target_link_libraries(lt-trim lttoolbox ${GETOPT_LIB}) add_executable(lt-comp lt_comp.cc) -target_link_libraries(lt-comp ${LibLttoolbox} ${GETOPT_LIB}) +target_link_libraries(lt-comp lttoolbox ${GETOPT_LIB}) add_executable(lt-proc lt_proc.cc) -target_link_libraries(lt-proc ${LibLttoolbox} ${GETOPT_LIB}) +target_link_libraries(lt-proc lttoolbox ${GETOPT_LIB}) add_executable(lt-expand 
lt_expand.cc) -target_link_libraries(lt-expand ${LibLttoolbox} ${GETOPT_LIB}) +target_link_libraries(lt-expand lttoolbox ${GETOPT_LIB}) add_executable(lt-tmxcomp lt_tmxcomp.cc) -target_link_libraries(lt-tmxcomp ${LibLttoolbox} ${GETOPT_LIB}) +target_link_libraries(lt-tmxcomp lttoolbox ${GETOPT_LIB}) add_executable(lt-tmxproc lt_tmxproc.cc) -target_link_libraries(lt-tmxproc ${LibLttoolbox} ${GETOPT_LIB}) +target_link_libraries(lt-tmxproc lttoolbox ${GETOPT_LIB}) if(BUILD_TESTING) add_test(NAME tests COMMAND ${PYTHON_EXECUTABLE} "${CMAKE_SOURCE_DIR}/tests/run_tests.py" $) set_tests_properties(tests PROPERTIES FAIL_REGULAR_EXPRESSION "FAILED") endif() -install(TARGETS ${LibLttoolbox} +install(TARGETS lttoolbox ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) install(FILES ${LIBLTTOOLBOX_HEADERS} - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/lttoolbox-${VERSION_API}/lttoolbox) + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/lttoolbox) install(TARGETS lt-print lt-trim lt-comp lt-proc lt-expand lt-tmxcomp lt-tmxproc RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) diff -Nru lttoolbox-3.6.6/lttoolbox/compiler.cc lttoolbox-3.7.1/lttoolbox/compiler.cc --- lttoolbox-3.6.6/lttoolbox/compiler.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/compiler.cc 2022-11-01 08:36:47.000000000 +0000 @@ -15,64 +15,14 @@ * along with this program; if not, see . 
*/ #include -#include -#include -#include #include #include #include +#include +#include -#include -#include #include -#include - -using namespace std; - -UString const Compiler::COMPILER_DICTIONARY_ELEM = "dictionary"_u; -UString const Compiler::COMPILER_ALPHABET_ELEM = "alphabet"_u; -UString const Compiler::COMPILER_SDEFS_ELEM = "sdefs"_u; -UString const Compiler::COMPILER_SDEF_ELEM = "sdef"_u; -UString const Compiler::COMPILER_N_ATTR = "n"_u; -UString const Compiler::COMPILER_PARDEFS_ELEM = "pardefs"_u; -UString const Compiler::COMPILER_PARDEF_ELEM = "pardef"_u; -UString const Compiler::COMPILER_PAR_ELEM = "par"_u; -UString const Compiler::COMPILER_ENTRY_ELEM = "e"_u; -UString const Compiler::COMPILER_RESTRICTION_ATTR = "r"_u; -UString const Compiler::COMPILER_RESTRICTION_LR_VAL = "LR"_u; -UString const Compiler::COMPILER_RESTRICTION_RL_VAL = "RL"_u; -UString const Compiler::COMPILER_PAIR_ELEM = "p"_u; -UString const Compiler::COMPILER_LEFT_ELEM = "l"_u; -UString const Compiler::COMPILER_RIGHT_ELEM = "r"_u; -UString const Compiler::COMPILER_S_ELEM = "s"_u; -UString const Compiler::COMPILER_M_ELEM = "m"_u; -UString const Compiler::COMPILER_REGEXP_ELEM = "re"_u; -UString const Compiler::COMPILER_SECTION_ELEM = "section"_u; -UString const Compiler::COMPILER_ID_ATTR = "id"_u; -UString const Compiler::COMPILER_TYPE_ATTR = "type"_u; -UString const Compiler::COMPILER_IDENTITY_ELEM = "i"_u; -UString const Compiler::COMPILER_IDENTITYGROUP_ELEM = "ig"_u; -UString const Compiler::COMPILER_JOIN_ELEM = "j"_u; -UString const Compiler::COMPILER_BLANK_ELEM = "b"_u; -UString const Compiler::COMPILER_POSTGENERATOR_ELEM = "a"_u; -UString const Compiler::COMPILER_GROUP_ELEM = "g"_u; -UString const Compiler::COMPILER_LEMMA_ATTR = "lm"_u; -UString const Compiler::COMPILER_IGNORE_ATTR = "i"_u; -UString const Compiler::COMPILER_IGNORE_YES_VAL = "yes"_u; -UString const Compiler::COMPILER_ALT_ATTR = "alt"_u; -UString const Compiler::COMPILER_V_ATTR = "v"_u; -UString const 
Compiler::COMPILER_VL_ATTR = "vl"_u; -UString const Compiler::COMPILER_VR_ATTR = "vr"_u; -UString const Compiler::COMPILER_WEIGHT_ATTR = "w"_u; -UString const Compiler::COMPILER_TEXT_NODE = "#text"_u; -UString const Compiler::COMPILER_COMMENT_NODE = "#comment"_u; -UString const Compiler::COMPILER_ACX_ANALYSIS_ELEM = "analysis-chars"_u; -UString const Compiler::COMPILER_ACX_CHAR_ELEM = "char"_u; -UString const Compiler::COMPILER_ACX_EQUIV_CHAR_ELEM= "equiv-char"_u; -UString const Compiler::COMPILER_ACX_VALUE_ATTR = "value"_u; -UString const Compiler::COMPILER_LSX_WB_ELEM = "d"_u; -UString const Compiler::COMPILER_LSX_CHAR_ELEM = "w"_u; -UString const Compiler::COMPILER_LSX_TAG_ELEM = "t"_u; +#include Compiler::Compiler() { @@ -83,35 +33,24 @@ } void -Compiler::parseACX(string const &file, UString const &dir) +Compiler::parseACX(std::string const &file, UStringView dir) { if(dir == COMPILER_RESTRICTION_LR_VAL) { - reader = xmlReaderForFile(file.c_str(), NULL, 0); - if(reader == NULL) - { - cerr << "Error: cannot open '" << file << "'." << endl; - exit(EXIT_FAILURE); - } - int ret = xmlTextReaderRead(reader); - while(ret == 1) - { - procNodeACX(); - ret = xmlTextReaderRead(reader); - } + acx_map = readACX(file.c_str()); } } void -Compiler::parse(string const &file, UString const &dir) +Compiler::parse(std::string const &file, UStringView dir) { - direction = dir; - reader = xmlReaderForFile(file.c_str(), NULL, 0); - if(reader == NULL) - { - cerr << "Error: Cannot open '" << file << "'." << endl; - exit(EXIT_FAILURE); + if (dir == COMPILER_RESTRICTION_U_VAL) { + direction = COMPILER_RESTRICTION_LR_VAL; + unified_compilation = true; + } else { + direction = dir; } + reader = XMLParseUtil::open_or_exit(file.c_str()); int ret = xmlTextReaderRead(reader); while(ret == 1) @@ -122,7 +61,7 @@ if(ret != 0) { - cerr << "Error: Parse error at the end of input." << endl; + std::cerr << "Error: Parse error at the end of input." 
<< std::endl; } xmlFreeTextReader(reader); @@ -133,7 +72,7 @@ // its own thread. This is the major bottleneck of lt-comp and sections // are completely independent transducers. std::vector minimisations; - for(std::pair& it : sections) + for(auto& it : sections) { if(jobs) { minimisations.push_back( @@ -148,28 +87,42 @@ thr.join(); } + if (is_separable) { + // ensure that all paths end in <$>, in case the user forgot to include + // . This will result in some paths ending with multiple finals + // and multiple finals, but lsx-proc only checks for finals upon reading + // $, so it won't be an issue. + int32_t end = alphabet(word_boundary, word_boundary); + for (auto& it : sections) { + for (auto fin : it.second.getFinals()) { + int end_state = it.second.insertSingleTransduction(end, fin.first); + it.second.setFinal(end_state); + } + } + } + if (!valid(dir)) { exit(EXIT_FAILURE); } } bool -Compiler::valid(UString const& dir) const +Compiler::valid(UStringView dir) const { - const char* side = dir == COMPILER_RESTRICTION_RL_VAL ? "right" : "left"; - const set epsilonSymbols = alphabet.symbolsWhereLeftIs(0); - const set spaceSymbols = alphabet.symbolsWhereLeftIs(' '); + const char* side = (dir == COMPILER_RESTRICTION_RL_VAL ? 
"right" : "left"); + const std::set epsilonSymbols = alphabet.symbolsWhereLeftIs(0); + const std::set spaceSymbols = alphabet.symbolsWhereLeftIs(' '); for (auto §ion : sections) { auto &fst = section.second; auto finals = fst.getFinals(); auto initial = fst.getInitial(); for(const auto i : fst.closure(initial, epsilonSymbols)) { if (finals.count(i)) { - cerr << "Error: Invalid dictionary (hint: the " << side << " side of an entry is empty)" << endl; + std::cerr << "Error: Invalid dictionary (hint: the " << side << " side of an entry is empty)" << std::endl; return false; } if(fst.closure(i, spaceSymbols).size() > 1) { // >1 since closure always includes self - cerr << "Error: Invalid dictionary (hint: entry on the " << side << " beginning with whitespace)" << endl; + std::cerr << "Error: Invalid dictionary (hint: entry on the " << side << " beginning with whitespace)" << std::endl; return false; } } @@ -204,8 +157,8 @@ } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Missing alphabet symbols." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Missing alphabet symbols." 
<< std::endl; exit(EXIT_FAILURE); } } @@ -238,12 +191,12 @@ } int -Compiler::matchTransduction(vector const &pi, - vector const &pd, +Compiler::matchTransduction(std::vector const &pi, + std::vector const &pd, int state, Transducer &t, double const &entry_weight) { - vector::const_iterator left, right, limleft, limright; + std::vector::const_iterator left, right, limleft, limright; if(direction == COMPILER_RESTRICTION_LR_VAL) { @@ -267,7 +220,7 @@ } else { - map >::iterator acx_map_ptr; + std::map >::iterator acx_map_ptr; int rsymbol = 0; while(true) @@ -314,11 +267,26 @@ int new_state = t.insertSingleTransduction(tag, state, weight_value); + if (is_separable) { + // loop-back symbols for and + if (tag == alphabet(0, any_tag) || tag == alphabet(0, any_char)) { + // rl compilation of a badly written rule + // having an epsilon with wildcard output will produce + // garbage output -- see https://github.com/apertium/apertium-separable/issues/8 + std::cerr << "Warning: Cannot insert from empty input. Ignoring. (You probably want to specify exact tags when deleting a word.)" << std::endl; + } else if (tag == alphabet(any_tag, any_tag) || + tag == alphabet(any_char, any_char) || + tag == alphabet(any_tag, 0) || + tag == alphabet(any_char, 0)) { + t.linkStates(new_state, new_state, tag); + } + } + if(acx_map_ptr != acx_map.end()) { for(auto& it : acx_map_ptr->second) { - t.linkStates(state, new_state, alphabet(it ,rsymbol), weight_value); + t.linkStates(state, new_state, alphabet(it, rsymbol), weight_value); } } state = new_state; @@ -330,12 +298,12 @@ void -Compiler::requireEmptyError(UString const &name) +Compiler::requireEmptyError(UStringView name) { if(!xmlTextReaderIsEmptyElement(reader)) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Non-empty element '<" << name << ">' should be empty." 
<< endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Non-empty element '<" << name << ">' should be empty." << std::endl; exit(EXIT_FAILURE); } } @@ -343,19 +311,11 @@ bool Compiler::allBlanks() { - bool flag = true; - UString text = XMLParseUtil::readValue(reader); - - for(auto c : text) - { - flag = flag && u_isspace(c); - } - - return flag; + return XMLParseUtil::allBlanks(reader); } void -Compiler::readString(vector &result, UString const &name) +Compiler::readString(std::vector &result, UStringView name) { if(name == COMPILER_TEXT_NODE) { @@ -399,18 +359,37 @@ if(!alphabet.isSymbolDefined(symbol)) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Undefined symbol '" << symbol << "'." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Undefined symbol '" << symbol << "'." << std::endl; exit(EXIT_FAILURE); } result.push_back(alphabet(symbol)); } + else if (is_separable && name == COMPILER_LSX_TAG_ELEM) { + requireEmptyError(name); + result.push_back(any_tag); + } + else if (is_separable && name == COMPILER_LSX_CHAR_ELEM) { + requireEmptyError(name); + result.push_back(any_char); + } + else if (is_separable && name == COMPILER_LSX_WB_ELEM) { + requireEmptyError(name); + UString mode = attrib(COMPILER_LSX_SPACE_ATTR); + if (mode == COMPILER_LSX_SPACE_YES_VAL) { + result.push_back(word_boundary_s); + } else if (mode == COMPILER_LSX_SPACE_NO_VAL) { + result.push_back(word_boundary_ns); + } else { + result.push_back(word_boundary); + } + } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid specification of element '<" << name; - cerr << ">' in this context." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid specification of element '<" << name; + std::cerr << ">' in this context." 
<< std::endl; exit(EXIT_FAILURE); } } @@ -424,8 +403,8 @@ { if(!allBlanks()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid construction." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid construction." << std::endl; exit(EXIT_FAILURE); } } @@ -436,13 +415,7 @@ } void -Compiler::skip(UString &name, UString const &elem) -{ - skip(name, elem, true); -} - -void -Compiler::skip(UString &name, UString const &elem, bool open) +Compiler::skip(UString &name, UStringView elem, bool open) { xmlTextReaderRead(reader); name = XMLParseUtil::readName(reader); @@ -459,8 +432,8 @@ { if(!allBlanks()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid construction." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid construction." << std::endl; exit(EXIT_FAILURE); } } @@ -470,8 +443,8 @@ if(name != elem) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Expected '<" << slash << elem << ">'." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Expected '<" << slash << elem << ">'." << std::endl; exit(EXIT_FAILURE); } } @@ -479,7 +452,7 @@ EntryToken Compiler::procIdentity(double const entry_weight, bool ig) { - vector both_sides; + std::vector both_sides; if(!xmlTextReaderIsEmptyElement(reader)) { @@ -499,14 +472,14 @@ if(verbose && first_element && (both_sides.front() == (int)' ')) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Entry begins with space." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Entry begins with space." 
<< std::endl; } first_element = false; EntryToken e; if(ig) { - vector right; + std::vector right; right.push_back(static_cast('#')); right.insert(right.end(), both_sides.begin(), both_sides.end()); e.setSingleTransduction(both_sides, right, entry_weight); @@ -521,7 +494,7 @@ EntryToken Compiler::procTransduction(double const entry_weight) { - vector lhs, rhs; + std::vector lhs, rhs; UString name; skip(name, COMPILER_LEFT_ELEM); @@ -543,8 +516,8 @@ if(verbose && first_element && (lhs.front() == (int)' ')) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Entry begins with space." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Entry begins with space." << std::endl; } first_element = false; @@ -573,7 +546,7 @@ } UString -Compiler::attrib(UString const &name) +Compiler::attrib(UStringView name) { return XMLParseUtil::attrib(reader, name); } @@ -587,15 +560,15 @@ if(!current_paradigm.empty() && paradigm_name == current_paradigm) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Paradigm refers to itself '" << paradigm_name << "'." < const &elements) +Compiler::insertEntryTokens(std::vector const &elements) { if(!current_paradigm.empty()) { @@ -631,8 +604,8 @@ } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid entry token." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid entry token." << std::endl; exit(EXIT_FAILURE); } } @@ -702,15 +675,14 @@ void -Compiler::requireAttribute(UString const &value, UString const &attrname, - UString const &elemname) +Compiler::requireAttribute(UStringView value, UStringView attrname, UStringView elemname) { if(value.empty()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): '<" << elemname; - cerr << "' element must specify non-void '"; - cerr << attrname << "' attribute." 
<< endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): '<" << elemname; + std::cerr << "' element must specify non-void '"; + std::cerr << attrname << "' attribute." << std::endl; exit(EXIT_FAILURE); } } @@ -723,8 +695,8 @@ if(type != XML_READER_TYPE_END_ELEMENT) { - UString const &id = attrib(COMPILER_ID_ATTR); - UString const &type = attrib(COMPILER_TYPE_ATTR); + const auto& id = attrib(COMPILER_ID_ATTR); + const auto& type = attrib(COMPILER_TYPE_ATTR); requireAttribute(id, COMPILER_ID_ATTR, COMPILER_SECTION_ELEM); requireAttribute(type, COMPILER_TYPE_ATTR, COMPILER_SECTION_ELEM); @@ -738,24 +710,116 @@ } } +bool +Compiler::filterEntry(UStringView value, UStringView filter, bool keep_on_empty_filter) +{ + if (value.empty()) return true; + else if (keep_on_empty_filter && filter.empty()) return true; + auto ops = StringUtils::split(value, u" "); + for (auto& it : ops) { + if (it == filter) return true; + } + return false; +} + +void +Compiler::symbolFilters(UStringView value, UStringView prefix, std::vector>& symbols) +{ + if (value.empty()) return; + std::vector syms; + for (auto& it : StringUtils::split(value, u" ")) { + if (it.empty()) continue; + UString tag; + tag += '<'; + tag += prefix; + tag += ':'; + tag += it; + tag += '>'; + alphabet.includeSymbol(tag); + syms.push_back(alphabet(tag)); + } + if (!syms.empty()) symbols.push_back(syms); +} + void Compiler::procEntry() { - UString attribute = this->attrib(COMPILER_RESTRICTION_ATTR); - UString ignore = this->attrib(COMPILER_IGNORE_ATTR); - UString altval = this->attrib(COMPILER_ALT_ATTR); - UString varval = this->attrib(COMPILER_V_ATTR); - UString varl = this->attrib(COMPILER_VL_ATTR); - UString varr = this->attrib(COMPILER_VR_ATTR); - UString wsweight = this->attrib(COMPILER_WEIGHT_ATTR); + UString attribute = attrib(COMPILER_RESTRICTION_ATTR); + UString ignore = attrib(COMPILER_IGNORE_ATTR); + UString altval = attrib(COMPILER_ALT_ATTR); + UString varval = 
attrib(COMPILER_V_ATTR); + UString varl = attrib(COMPILER_VL_ATTR); + UString varr = attrib(COMPILER_VR_ATTR); + UString wsweight = attrib(COMPILER_WEIGHT_ATTR); + + std::vector elements; // if entry is masked by a restriction of direction or an ignore mark - if((!attribute.empty() && attribute != direction) + if (unified_compilation && ignore != COMPILER_IGNORE_YES_VAL) { + std::vector> symbols; + symbolFilters(attribute, u"r", symbols); + symbolFilters(altval, u"alt", symbols); + symbolFilters(varval, u"v", symbols); + symbolFilters(varl, u"vl", symbols); + symbolFilters(varr, u"vr", symbols); + if (!symbols.empty()) { + bool multi = false; + for (auto& it : symbols) { + if (it.size() > 1) { + multi = true; + break; + } + } + if (multi) { + UString parname = "--"_u; + parname += attribute; + parname += '-'; + parname += altval; + parname += '-'; + parname += varval; + parname += '-'; + parname += varl; + parname += '-'; + parname += varr; + if (paradigms.find(parname) == paradigms.end()) { + std::vector re; + for (auto& it : symbols) { + if (it.size() == 1) { + re.push_back(it[0]); + } else { + re.push_back(static_cast('[')); + re.insert(re.end(), it.begin(), it.end()); + re.push_back(static_cast(']')); + } + } + EntryToken e; + e.setRegexp(re); + std::vector vec(1, e); + parname.swap(current_paradigm); + insertEntryTokens(vec); + parname.swap(current_paradigm); + } + EntryToken e; + e.setParadigm(parname); + elements.push_back(e); + } + else { + std::vector syms; + for (auto& it : symbols) { + syms.push_back(it[0]); + } + EntryToken e; + e.setSingleTransduction(syms, syms); + elements.push_back(e); + } + } + } + else if((!attribute.empty() && attribute != direction) || ignore == COMPILER_IGNORE_YES_VAL - || (!altval.empty() && altval != alt) - || (direction == COMPILER_RESTRICTION_RL_VAL && !varval.empty() && varval != variant) - || (direction == COMPILER_RESTRICTION_RL_VAL && !varl.empty() && varl != variant_left) - || (direction == COMPILER_RESTRICTION_LR_VAL 
&& !varr.empty() && varr != variant_right)) + || !filterEntry(altval, alt, false) + || !filterEntry(varval, variant, true) + || (direction == COMPILER_RESTRICTION_RL_VAL && !filterEntry(varl, variant_left, false)) + || (direction == COMPILER_RESTRICTION_LR_VAL && !filterEntry(varr, variant_right, false))) { // parse to the end of the entry UString name; @@ -775,15 +839,38 @@ weight = StringUtils::stod(wsweight); } - vector elements; + if (entry_debugging && current_paradigm.empty()) { + UString ln = "Line near "_u; + ln += StringUtils::itoa(xmlTextReaderGetParserLineNumber(reader)); + // Note that this line number will usually be a little bit wrong. + // This function actually returns the current line of the *parser* + // which is probably several lines past the element we're currently + // looking at. + UString c = attrib(u"c"); + if (!c.empty()) { + ln += ' '; + ln += c; + } + std::vector empty; + std::vector debug_syms; + ustring_to_vec32(ln, debug_syms); + if (is_separable) { + debug_syms.push_back(word_boundary_s); + } else { + debug_syms.push_back(static_cast(' ')); + } + EntryToken e; + e.setSingleTransduction(empty, debug_syms); + elements.push_back(e); + } while(true) { int ret = xmlTextReaderRead(reader); if(ret != 1) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Parse error." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Parse error." << std::endl; exit(EXIT_FAILURE); } UString name = XMLParseUtil::readName(reader); @@ -817,16 +904,17 @@ // detection of the use of undefined paradigms - UString const &p = elements.rbegin()->paradigmName(); + const auto& p = elements.rbegin()->paradigmName(); - if(paradigms.find(p) == paradigms.end()) + auto it = paradigms.find(p); + if(it == paradigms.end()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Undefined paradigm '" << p << "'." 
<second.isEmpty()) { while(name != COMPILER_ENTRY_ELEM || type != XML_READER_TYPE_END_ELEMENT) { @@ -848,47 +936,15 @@ } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid inclusion of '<" << name << ">' into '<" << COMPILER_ENTRY_ELEM; - cerr << ">'." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid inclusion of '<" << name << ">' into '<" << COMPILER_ENTRY_ELEM; + std::cerr << ">'." << std::endl; exit(EXIT_FAILURE); } } } void -Compiler::procNodeACX() -{ - UString name = XMLParseUtil::readName(reader); - if(name == COMPILER_TEXT_NODE) - { - /* ignore */ - } - else if(name == COMPILER_ACX_ANALYSIS_ELEM) - { - /* ignore */ - } - else if(name == COMPILER_ACX_CHAR_ELEM) - { - acx_current_char = static_cast(attrib(COMPILER_ACX_VALUE_ATTR)[0]); - } - else if(name == COMPILER_ACX_EQUIV_CHAR_ELEM) - { - acx_map[acx_current_char].insert(static_cast(attrib(COMPILER_ACX_VALUE_ATTR)[0])); - } - else if(name == COMPILER_COMMENT_NODE) - { - /* ignore */ - } - else - { - cerr << "Error in ACX file (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid node '<" << name << ">'." 
<< endl; - exit(EXIT_FAILURE); - } -} - -void Compiler::procNode() { UString name = XMLParseUtil::readName(reader); @@ -901,7 +957,20 @@ } else if(name == COMPILER_DICTIONARY_ELEM) { - /* ignore */ + if (attrib(COMPILER_TYPE_ATTR) == COMPILER_SEPARABLE_VAL || + attrib(COMPILER_TYPE_ATTR) == COMPILER_SEQUENTIAL_VAL) { + is_separable = true; + alphabet.includeSymbol(Transducer::ANY_TAG_SYMBOL); + alphabet.includeSymbol(Transducer::ANY_CHAR_SYMBOL); + alphabet.includeSymbol(Transducer::LSX_BOUNDARY_SYMBOL); + alphabet.includeSymbol(Transducer::LSX_BOUNDARY_SPACE_SYMBOL); + alphabet.includeSymbol(Transducer::LSX_BOUNDARY_NO_SPACE_SYMBOL); + any_tag = alphabet(Transducer::ANY_TAG_SYMBOL); + any_char = alphabet(Transducer::ANY_CHAR_SYMBOL); + word_boundary = alphabet(Transducer::LSX_BOUNDARY_SYMBOL); + word_boundary_s = alphabet(Transducer::LSX_BOUNDARY_SPACE_SYMBOL); + word_boundary_ns = alphabet(Transducer::LSX_BOUNDARY_NO_SPACE_SYMBOL); + } } else if(name == COMPILER_ALPHABET_ELEM) { @@ -944,8 +1013,8 @@ } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid node '<" << name << ">'." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid node '<" << name << ">'." 
<< std::endl; exit(EXIT_FAILURE); } } @@ -967,25 +1036,25 @@ } void -Compiler::setAltValue(UString const &a) +Compiler::setAltValue(UStringView a) { alt = a; } void -Compiler::setVariantValue(UString const &v) +Compiler::setVariantValue(UStringView v) { variant = v; } void -Compiler::setVariantLeftValue(UString const &v) +Compiler::setVariantLeftValue(UStringView v) { variant_left = v; } void -Compiler::setVariantRightValue(UString const &v) +Compiler::setVariantRightValue(UStringView v) { variant_right = v; } @@ -1013,3 +1082,9 @@ { verbose = verbosity; } + +void +Compiler::setEntryDebugging(bool debug) +{ + entry_debugging = debug; +} diff -Nru lttoolbox-3.6.6/lttoolbox/compiler.h lttoolbox-3.7.1/lttoolbox/compiler.h --- lttoolbox-3.6.6/lttoolbox/compiler.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/compiler.h 2022-11-01 08:36:47.000000000 +0000 @@ -18,29 +18,15 @@ #define _MYCOMPILER_ #include -#include #include #include #include +#include -#include #include -#include #include #include -#ifdef _MSC_VER - #if !defined(LTTOOLBOX_EXPORTS) - #define LTTOOLBOX_IMPORTS __declspec(dllimport) - #else - #define LTTOOLBOX_IMPORTS - #endif -#else - #define LTTOOLBOX_IMPORTS -#endif - -using namespace std; - /** * A compiler of dictionaries to letter transducers */ @@ -100,6 +86,13 @@ UString direction; /** + * If this is set to true, attributes v, vl, vr, r, and alt + * insert special symbols to be filtered by lt-restrict rather than + * ignoring entries. + */ + bool unified_compilation = false; + + /** * List of characters to be considered alphabetic */ UString letters; @@ -129,6 +122,16 @@ */ bool jobs = false; + /** + * Are we compiling an LSX dictionary + */ + bool is_separable = false; + + /** + * Should we put output line numbers on each in
? + */ + bool entry_debugging = false; + /** * Identifier of all the symbols during the compilation @@ -138,42 +141,41 @@ /** * List of named transducers-paradigms */ - map paradigms; + std::map> paradigms; /** * List of named dictionary sections */ - map sections; + std::map sections; /** * List of named prefix copy of a paradigm */ - map > prefix_paradigms; + std::map > prefix_paradigms; /** * List of named suffix copy of a paradigm */ - map > suffix_paradigms; + std::map, std::less<>> suffix_paradigms; /** * List of named endings of a suffix copy of a paradgim */ - map > postsuffix_paradigms; + std::map > postsuffix_paradigms; /** * Mapping of aliases of characters specified in ACX files */ - map > acx_map; + std::map > acx_map; /** - * Original char being mapped + * LSX symbols */ - int acx_current_char = 0; - - /* - static string range(char const a, char const b); - string readAlphabet(); - */ + int32_t any_tag = 0; + int32_t any_char = 0; + int32_t word_boundary = 0; + int32_t word_boundary_s = 0; + int32_t word_boundary_ns = 0; /** * Method to parse an XML Node @@ -181,12 +183,6 @@ void procNode(); /** - * Method to parse an XML Node in ACX files - */ - void procNodeACX(); - - - /** * Parse the <alphabet> element */ void procAlphabet(); @@ -207,6 +203,13 @@ void procEntry(); /** + * Return true if the filter (command line) is consistent with + * the value (attribute) and false otherwise + */ + bool filterEntry(UStringView value, UStringView filter, bool keep_on_empty_filter); + void symbolFilters(UStringView value, UStringView prefix, std::vector>& symbols); + + /** * Parse the <re> element * @return a list of tokens from the dictionary's entry */ @@ -222,7 +225,7 @@ * @param name the name of the attribute * @return the value of the attribute */ - UString attrib(UString const &name); + UString attrib(UStringView name); /** * Construct symbol pairs by align left side of both parts and insert @@ -233,7 +236,7 @@ * @param t the transducer * @return the last 
state of the inserted transduction */ - int matchTransduction(vector const &lp, vector const &rp, + int matchTransduction(std::vector const &lp, std::vector const &rp, int state, Transducer &t, double const &entry_weight); /** * Parse the <p> element @@ -257,14 +260,7 @@ * Insert a list of tokens into the paradigm / section being processed * @param elements the list */ - void insertEntryTokens(vector const &elements); - - /** - * Skip all document #text nodes before "elem" - * @param name the name of the node - * @param elem the name of the expected node - */ - void skip(UString &name, UString const &elem); + void insertEntryTokens(std::vector const &elements); /** * Skip all document #text nodes before "elem" @@ -272,7 +268,7 @@ * @param elem the name of the expected node * @param open true for open element, false for closed */ - void skip(UString &name, UString const &elem, bool open); + void skip(UString &name, UStringView elem, bool open = true); /** * Skip all blank #text nodes before "name" @@ -281,13 +277,13 @@ void skipBlanks(UString &name); - void readString(vector &result, UString const &name); + void readString(std::vector &result, UStringView name); /** * Force an element to be empty, and check for it * @param name the element */ - void requireEmptyError(UString const &name); + void requireEmptyError(UStringView name); /** * Force an attribute to be specified, amd check for it @@ -295,8 +291,7 @@ * @param attrname the name of the attribute * @param elemname the parent of the attribute */ - void requireAttribute(UString const &value, UString const &attrname, - UString const &elemname); + void requireAttribute(UStringView value, UStringView attrname, UStringView elemname); /** * True if all the elements in the current node are blanks @@ -304,7 +299,7 @@ */ bool allBlanks(); - bool valid(UString const& dir) const; + bool valid(UStringView dir) const; public: @@ -312,50 +307,56 @@ * Constants to represent the element and the attributes of * dictionaries */ 
- LTTOOLBOX_IMPORTS static UString const COMPILER_DICTIONARY_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_ALPHABET_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_SDEFS_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_SDEF_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_N_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_PARDEFS_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_PARDEF_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_PAR_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_ENTRY_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_LR_VAL; - LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_RL_VAL; - LTTOOLBOX_IMPORTS static UString const COMPILER_PAIR_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_LEFT_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_RIGHT_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_S_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_M_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_REGEXP_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_SECTION_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_ID_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_TYPE_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_IDENTITY_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_IDENTITYGROUP_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_JOIN_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_BLANK_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_POSTGENERATOR_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_GROUP_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_LEMMA_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_IGNORE_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_IGNORE_YES_VAL; - LTTOOLBOX_IMPORTS static UString const COMPILER_ALT_ATTR; - LTTOOLBOX_IMPORTS static UString const 
COMPILER_V_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_VL_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_VR_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_WEIGHT_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_TEXT_NODE; - LTTOOLBOX_IMPORTS static UString const COMPILER_COMMENT_NODE; - LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_ANALYSIS_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_CHAR_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_EQUIV_CHAR_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_VALUE_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_LSX_WB_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_LSX_CHAR_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_LSX_TAG_ELEM; + static constexpr UStringView COMPILER_DICTIONARY_ELEM = u"dictionary"; + static constexpr UStringView COMPILER_ALPHABET_ELEM = u"alphabet"; + static constexpr UStringView COMPILER_SDEFS_ELEM = u"sdefs"; + static constexpr UStringView COMPILER_SDEF_ELEM = u"sdef"; + static constexpr UStringView COMPILER_N_ATTR = u"n"; + static constexpr UStringView COMPILER_PARDEFS_ELEM = u"pardefs"; + static constexpr UStringView COMPILER_PARDEF_ELEM = u"pardef"; + static constexpr UStringView COMPILER_PAR_ELEM = u"par"; + static constexpr UStringView COMPILER_ENTRY_ELEM = u"e"; + static constexpr UStringView COMPILER_RESTRICTION_ATTR = u"r"; + static constexpr UStringView COMPILER_RESTRICTION_LR_VAL = u"LR"; + static constexpr UStringView COMPILER_RESTRICTION_RL_VAL = u"RL"; + static constexpr UStringView COMPILER_RESTRICTION_U_VAL = u"U"; + static constexpr UStringView COMPILER_PAIR_ELEM = u"p"; + static constexpr UStringView COMPILER_LEFT_ELEM = u"l"; + static constexpr UStringView COMPILER_RIGHT_ELEM = u"r"; + static constexpr UStringView COMPILER_S_ELEM = u"s"; + static constexpr UStringView COMPILER_M_ELEM = u"m"; + static constexpr UStringView COMPILER_REGEXP_ELEM = u"re"; + static constexpr 
UStringView COMPILER_SECTION_ELEM = u"section"; + static constexpr UStringView COMPILER_ID_ATTR = u"id"; + static constexpr UStringView COMPILER_TYPE_ATTR = u"type"; + static constexpr UStringView COMPILER_SEQUENTIAL_VAL = u"sequential"; + static constexpr UStringView COMPILER_SEPARABLE_VAL = u"separable"; + static constexpr UStringView COMPILER_IDENTITY_ELEM = u"i"; + static constexpr UStringView COMPILER_IDENTITYGROUP_ELEM = u"ig"; + static constexpr UStringView COMPILER_JOIN_ELEM = u"j"; + static constexpr UStringView COMPILER_BLANK_ELEM = u"b"; + static constexpr UStringView COMPILER_POSTGENERATOR_ELEM = u"a"; + static constexpr UStringView COMPILER_GROUP_ELEM = u"g"; + static constexpr UStringView COMPILER_LEMMA_ATTR = u"lm"; + static constexpr UStringView COMPILER_IGNORE_ATTR = u"i"; + static constexpr UStringView COMPILER_IGNORE_YES_VAL = u"yes"; + static constexpr UStringView COMPILER_ALT_ATTR = u"alt"; + static constexpr UStringView COMPILER_V_ATTR = u"v"; + static constexpr UStringView COMPILER_VL_ATTR = u"vl"; + static constexpr UStringView COMPILER_VR_ATTR = u"vr"; + static constexpr UStringView COMPILER_WEIGHT_ATTR = u"w"; + static constexpr UStringView COMPILER_TEXT_NODE = u"#text"; + static constexpr UStringView COMPILER_COMMENT_NODE = u"#comment"; + static constexpr UStringView COMPILER_ACX_ANALYSIS_ELEM = u"analysis-chars"; + static constexpr UStringView COMPILER_ACX_CHAR_ELEM = u"char"; + static constexpr UStringView COMPILER_ACX_EQUIV_CHAR_ELEM= u"equiv-char"; + static constexpr UStringView COMPILER_ACX_VALUE_ATTR = u"value"; + static constexpr UStringView COMPILER_LSX_WB_ELEM = u"d"; + static constexpr UStringView COMPILER_LSX_CHAR_ELEM = u"w"; + static constexpr UStringView COMPILER_LSX_TAG_ELEM = u"t"; + static constexpr UStringView COMPILER_LSX_SPACE_ATTR = u"space"; + static constexpr UStringView COMPILER_LSX_SPACE_YES_VAL = u"yes"; + static constexpr UStringView COMPILER_LSX_SPACE_NO_VAL = u"no"; /** * Constructor @@ -370,12 +371,12 @@ /** 
* Compile dictionary to letter transducers */ - void parse(string const &file, UString const &dir); + void parse(std::string const &file, UStringView dir); /** * Read ACX file */ - void parseACX(string const &file, UString const &dir); + void parseACX(std::string const &file, UStringView dir); /** @@ -408,25 +409,27 @@ * Set the alt value to use in compilation * @param a the value */ - void setAltValue(UString const &a); + void setAltValue(UStringView a); /** * Set the variant value to use in compilation * @param v the value */ - void setVariantValue(UString const &v); + void setVariantValue(UStringView v); /** * Set the variant_left value to use in compilation * @param v the value */ - void setVariantLeftValue(UString const &v); + void setVariantLeftValue(UStringView v); /** * Set the variant_right value to use in compilation * @param v the value */ - void setVariantRightValue(UString const &v); + void setVariantRightValue(UStringView v); + + void setEntryDebugging(bool b); }; diff -Nru lttoolbox-3.6.6/lttoolbox/compression.cc lttoolbox-3.7.1/lttoolbox/compression.cc --- lttoolbox-3.6.6/lttoolbox/compression.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/compression.cc 2022-11-01 08:36:47.000000000 +0000 @@ -28,7 +28,7 @@ { if(fwrite_unlocked(&byte, 1, 1, output) != 1) { - cerr << "I/O Error writing" << endl; + std::cerr << "I/O Error writing" << std::endl; exit(EXIT_FAILURE); } } @@ -40,7 +40,7 @@ if(fread_unlocked(&value, 1, 1, input) != 1) { // Not uncomment this code since -// cerr << "I/O Error reading" << endl; +// std::cerr << "I/O Error reading" << std::endl; // exit(EXIT_FAILURE); } @@ -87,13 +87,13 @@ } else { - cerr << "Out of range: " << value << endl; + std::cerr << "Out of range: " << value << std::endl; exit(EXIT_FAILURE); } } void -Compression::multibyte_write(unsigned int value, ostream &output) +Compression::multibyte_write(unsigned int value, std::ostream &output) { if(value < 0x00000040) { @@ -134,7 +134,7 @@ } else { - 
cerr << "Out of range: " << value << endl; + std::cerr << "Out of range: " << value << std::endl; exit(EXIT_FAILURE); } } @@ -194,7 +194,7 @@ } unsigned int -Compression::multibyte_read(istream &input) +Compression::multibyte_read(std::istream &input) { unsigned char up; unsigned int result = 0; @@ -255,9 +255,9 @@ void -Compression::string_write(UString const &str, FILE *output) +Compression::string_write(UStringView str, FILE *output) { - vector vec; + std::vector vec; ustring_to_vec32(str, vec); Compression::multibyte_write(vec.size(), output); for(auto c : vec) @@ -289,6 +289,16 @@ unsigned int mantissa = static_cast(static_cast(0x40000000 * frexp(value, &exp))); unsigned int exponent = static_cast(static_cast(exp)); + if (std::isinf(value)) { + mantissa = std::numeric_limits::max(); + if (value < 0) { + exponent = std::numeric_limits::max() - 1; + } + else { + exponent = std::numeric_limits::max(); + } + } + if(mantissa < 0x04000000) { multibyte_write(mantissa, output); @@ -319,13 +329,23 @@ } void -Compression::long_multibyte_write(const double& value, ostream &output) +Compression::long_multibyte_write(const double& value, std::ostream &output) { int exp = 0; unsigned int mantissa = static_cast(static_cast(0x40000000 * frexp(value, &exp))); unsigned int exponent = static_cast(static_cast(exp)); + if (std::isinf(value)) { + mantissa = std::numeric_limits::max(); + if (value < 0) { + exponent = std::numeric_limits::max() - 1; + } + else { + exponent = std::numeric_limits::max(); + } + } + if(mantissa < 0x04000000) { multibyte_write(mantissa, output); @@ -393,13 +413,23 @@ } double value = static_cast(static_cast(mantissa)) / 0x40000000; - result = ldexp(value, static_cast(exponent)); + if (mantissa == std::numeric_limits::max() && exponent >= std::numeric_limits::max() - 1) { + if (exponent == std::numeric_limits::max() - 1) { + result = -1.0*std::numeric_limits::infinity(); + } + else { + result = std::numeric_limits::infinity(); + } + } + else { + result = 
ldexp(value, static_cast(exponent)); + } return result; } double -Compression::long_multibyte_read(istream &input) +Compression::long_multibyte_read(std::istream &input) { double result = 0.0; @@ -436,7 +466,17 @@ } double value = static_cast(static_cast(mantissa)) / 0x40000000; - result = ldexp(value, static_cast(exponent)); + if (mantissa == std::numeric_limits::max() && exponent >= std::numeric_limits::max() - 1) { + if (exponent == std::numeric_limits::max() - 1) { + result = -1.0*std::numeric_limits::infinity(); + } + else { + result = std::numeric_limits::infinity(); + } + } + else { + result = ldexp(value, static_cast(exponent)); + } return result; } diff -Nru lttoolbox-3.6.6/lttoolbox/compression.h lttoolbox-3.7.1/lttoolbox/compression.h --- lttoolbox-3.6.6/lttoolbox/compression.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/compression.h 2022-11-01 08:36:47.000000000 +0000 @@ -24,8 +24,6 @@ #include #include -using namespace std; - // Global lttoolbox features constexpr char HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'}; enum LT_FEATURES : uint64_t { @@ -157,7 +155,7 @@ * @param value integer to write. * @param output output stream. */ - static void multibyte_write(unsigned int value, ostream &os); + static void multibyte_write(unsigned int value, std::ostream &os); /** * Read and decode an integer from the input stream. @@ -173,7 +171,7 @@ * @param input input stream. * @return the integer value readed. */ - static unsigned int multibyte_read(istream &is); + static unsigned int multibyte_read(std::istream &is); /** * This method allows to write a plain string to an output stream @@ -182,7 +180,7 @@ * @param str the string to write. * @param output the output stream. */ - static void string_write(UString const &str, FILE *output); + static void string_write(UStringView str, FILE *output); /** * This method reads a plain string from the input stream. @@ -206,7 +204,7 @@ * @param value double to write. * @param output output stream. 
*/ - static void long_multibyte_write(const double& value, ostream &os); + static void long_multibyte_write(const double& value, std::ostream &os); /** * Read and decode a double from the input stream. @@ -222,7 +220,7 @@ * @param input input stream. * @return the double value read. */ - static double long_multibyte_read(istream &is); + static double long_multibyte_read(std::istream &is); }; #endif diff -Nru lttoolbox-3.6.6/lttoolbox/deserialiser.h lttoolbox-3.7.1/lttoolbox/deserialiser.h --- lttoolbox-3.6.6/lttoolbox/deserialiser.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/deserialiser.h 2022-11-01 08:36:47.000000000 +0000 @@ -119,7 +119,7 @@ std::istream &Stream_) { first_type a = Deserialiser::type>::deserialise(Stream_); second_type b = Deserialiser::type>::deserialise(Stream_); - return std::make_pair(a, b); + return {a, b}; } template diff -Nru lttoolbox-3.6.6/lttoolbox/dix.dtd lttoolbox-3.7.1/lttoolbox/dix.dtd --- lttoolbox-3.6.6/lttoolbox/dix.dtd 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/dix.dtd 2022-11-01 08:36:47.000000000 +0000 @@ -18,7 +18,13 @@ --> - + + + + @@ -99,7 +105,7 @@ > - + @@ -108,13 +114,13 @@ - + - + - + - + + + + + + + + + + + + + diff -Nru lttoolbox-3.6.6/lttoolbox/entry_token.cc lttoolbox-3.7.1/lttoolbox/entry_token.cc --- lttoolbox-3.6.6/lttoolbox/entry_token.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/entry_token.cc 2022-11-01 08:36:47.000000000 +0000 @@ -61,14 +61,14 @@ } void -EntryToken::setParadigm(UString const &np) +EntryToken::setParadigm(UStringView np) { parName = np; type = paradigm; } void -EntryToken::setSingleTransduction(vector const &pi, vector const &pd, double const ew) +EntryToken::setSingleTransduction(std::vector const &pi, std::vector const &pd, double const ew) { weight = ew; leftSide = pi; @@ -77,7 +77,7 @@ } void -EntryToken::setRegexp(UString const &r) +EntryToken::setRegexp(UStringView r) { myregexp.clear(); ustring_to_vec32(r, myregexp); 
@@ -85,6 +85,13 @@ } void +EntryToken::setRegexp(const std::vector& r) +{ + myregexp = r; + type = regexp; +} + +void EntryToken::readRegexp(xmlTextReaderPtr reader) { XMLParseUtil::readValueInto32(reader, myregexp); @@ -115,19 +122,19 @@ return parName; } -vector const & +std::vector const & EntryToken::left() const { return leftSide; } -vector const & +std::vector const & EntryToken::right() const { return rightSide; } -vector const & +std::vector const & EntryToken::regExp() const { return myregexp; diff -Nru lttoolbox-3.6.6/lttoolbox/entry_token.h lttoolbox-3.7.1/lttoolbox/entry_token.h --- lttoolbox-3.6.6/lttoolbox/entry_token.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/entry_token.h 2022-11-01 08:36:47.000000000 +0000 @@ -23,8 +23,6 @@ #include #include -using namespace std; - /** * This is a "Compiler" helper class, to store the parts of each entry * before combining it to build the transducer being "compiled". @@ -54,17 +52,17 @@ /** * Left side of transduction (if 'single_transduction') */ - vector leftSide; + std::vector leftSide; /** * Right side of transduction (if 'single_transduction') */ - vector rightSide; + std::vector rightSide; /** * Regular expression (if 'regexp') */ - vector myregexp; + std::vector myregexp; /** * copy method @@ -101,7 +99,7 @@ * Sets the name of the paradigm. * @param np the paradigm name */ - void setParadigm(UString const &np); + void setParadigm(UStringView np); /** * Set both parts of a single transduction. @@ -109,13 +107,14 @@ * @param pd right part * @param ew entry weight */ - void setSingleTransduction(vector const &pi, vector const &pd, double const ew = 0); + void setSingleTransduction(std::vector const &pi, std::vector const &pd, double const ew = 0); /** * Set regular expression. * @param r the regular expression specification. 
*/ - void setRegexp(UString const &r); + void setRegexp(UStringView r); + void setRegexp(const std::vector& r); /** * More efficient version of setRegexp() @@ -151,19 +150,19 @@ * Retrieve the left part of the paradigm. * @return the left part of the paradigm. */ - vector const & left() const; + std::vector const & left() const; /** * Retrieve the right part of the paradigm. * @return the right part of the paradigm. */ - vector const & right() const; + std::vector const & right() const; /** * Retrieve the regular expression specification. * @return the regular expression specification. */ - vector const & regExp() const; + std::vector const & regExp() const; /** * Retrieve the weight value of the entry. diff -Nru lttoolbox-3.6.6/lttoolbox/expander.cc lttoolbox-3.7.1/lttoolbox/expander.cc --- lttoolbox-3.6.6/lttoolbox/expander.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/expander.cc 2022-11-01 08:36:47.000000000 +0000 @@ -17,21 +17,15 @@ #include #include -#include #include -#include #include #include #include -using namespace std; - -Expander::Expander() : -reader(0) +Expander::Expander() { - LtLocale::tryToSetLocale(); } Expander::~Expander() @@ -39,14 +33,9 @@ } void -Expander::expand(string const &file, UFILE* output) +Expander::expand(std::string const &file, UFILE* output) { - reader = xmlReaderForFile(file.c_str(), NULL, 0); - if(reader == NULL) - { - cerr << "Error: Cannot open '" << file << "'." << endl; - exit(EXIT_FAILURE); - } + reader = XMLParseUtil::open_or_exit(file.c_str()); int ret = xmlTextReaderRead(reader); while(ret == 1) @@ -57,7 +46,7 @@ if(ret != 0) { - cerr << "Error: Parse error at the end of input." << endl; + std::cerr << "Error: Parse error at the end of input." 
<< std::endl; } xmlFreeTextReader(reader); @@ -80,12 +69,12 @@ } void -Expander::requireEmptyError(UString const &name) +Expander::requireEmptyError(UStringView name) { if(!xmlTextReaderIsEmptyElement(reader)) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Non-empty element '<" << name << ">' should be empty." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Non-empty element '<" << name << ">' should be empty." << std::endl; exit(EXIT_FAILURE); } } @@ -93,19 +82,11 @@ bool Expander::allBlanks() { - bool flag = true; - UString text = XMLParseUtil::readValue(reader); - - for(auto c : text) - { - flag = flag && isspace(c); - } - - return flag; + return XMLParseUtil::allBlanks(reader); } void -Expander::readString(UString &result, UString const &name) +Expander::readString(UString &result, UStringView name) { if(name == Compiler::COMPILER_TEXT_NODE) { @@ -168,9 +149,9 @@ } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid specification of element '<" << name; - cerr << ">' in this context." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid specification of element '<" << name; + std::cerr << ">' in this context." << std::endl; exit(EXIT_FAILURE); } } @@ -182,8 +163,8 @@ { if(!allBlanks()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid construction." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid construction." 
<< std::endl; exit(EXIT_FAILURE); } xmlTextReaderRead(reader); @@ -192,7 +173,7 @@ } void -Expander::skip(UString &name, UString const &elem) +Expander::skip(UString &name, UStringView elem) { xmlTextReaderRead(reader); name = XMLParseUtil::readName(reader); @@ -201,8 +182,8 @@ { if(!allBlanks()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid construction." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid construction." << std::endl; exit(EXIT_FAILURE); } xmlTextReaderRead(reader); @@ -211,8 +192,8 @@ if(name != elem) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Expected '<" << elem << ">'." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Expected '<" << elem << ">'." << std::endl; exit(EXIT_FAILURE); } } @@ -240,7 +221,7 @@ return both_sides; } -pair +std::pair Expander::procIdentityGroup() { UString lhs; @@ -265,11 +246,11 @@ lhs += both_sides; rhs += both_sides; - pair e(lhs, rhs); + std::pair e(lhs, rhs); return e; } -pair +std::pair Expander::procTransduction() { UString lhs, rhs; @@ -311,12 +292,12 @@ skip(name, Compiler::COMPILER_PAIR_ELEM); - pair e(lhs, rhs); + std::pair e(lhs, rhs); return e; } UString -Expander::attrib(UString const &name) +Expander::attrib(UStringView name) { return XMLParseUtil::attrib(reader, name); } @@ -330,15 +311,14 @@ } void -Expander::requireAttribute(UString const &value, UString const &attrname, - UString const &elemname) +Expander::requireAttribute(UStringView value, UStringView attrname, UStringView elemname) { if(value.empty()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): '<" << elemname; - cerr << "' element must specify non-void '"; - cerr<< attrname << "' attribute." 
<< endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): '<" << elemname; + std::cerr << "' element must specify non-void '"; + std::cerr<< attrname << "' attribute." << std::endl; exit(EXIT_FAILURE); } } @@ -367,8 +347,8 @@ int ret = xmlTextReaderRead(reader); if(ret != 1) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Parse error." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Parse error." << std::endl; exit(EXIT_FAILURE); } myname = XMLParseUtil::readName(reader); @@ -399,8 +379,8 @@ int ret = xmlTextReaderRead(reader); if(ret != 1) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Parse error." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Parse error." << std::endl; exit(EXIT_FAILURE); } UString name = XMLParseUtil::readName(reader); @@ -409,7 +389,7 @@ int type = xmlTextReaderNodeType(reader); if(name == Compiler::COMPILER_PAIR_ELEM) { - pair p = procTransduction(); + std::pair p = procTransduction(); append(items, p); append(items_lr, p); append(items_rl, p); @@ -423,7 +403,7 @@ } else if(name == Compiler::COMPILER_IDENTITYGROUP_ELEM) { - pair p = procIdentityGroup(); + std::pair p = procIdentityGroup(); append(items, p); append(items_lr, p); append(items_rl, p); @@ -444,8 +424,8 @@ paradigm_lr.find(p) == paradigm_lr.end() && paradigm_rl.find(p) == paradigm_rl.end()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Undefined paradigm '" << p << "'." <' into '<" << Compiler::COMPILER_ENTRY_ELEM; - cerr << ">'." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid inclusion of '<" << name << ">' into '<" << Compiler::COMPILER_ENTRY_ELEM; + std::cerr << ">'." 
<< std::endl; exit(EXIT_FAILURE); } } @@ -539,7 +519,7 @@ { UString name = XMLParseUtil::readName(reader); - // DO: optimize the execution order of this string "ifs" + // TODO: optimize the execution order of this string "ifs" if(name == Compiler::COMPILER_TEXT_NODE) { @@ -583,8 +563,8 @@ } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid node '<" << name << ">'." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid node '<" << name << ">'." << std::endl; exit(EXIT_FAILURE); } } @@ -593,9 +573,25 @@ Expander::procRegexp() { xmlTextReaderRead(reader); - UString re = XMLParseUtil::readValue(reader); + UString val = XMLParseUtil::readValue(reader); + UString escaped = "^$/<>{}*@#+~:"_u; + UString ret; + bool esc = false; + for (auto& c : val) { + if (esc) { + ret += c; + esc = false; + continue; + } + if (escaped.find(c) != UString::npos) { + ret += '\\'; + } else if (c == '\\') { + esc = true; + } + ret += c; + } xmlTextReaderRead(reader); - return re; + return ret; } void @@ -608,7 +604,7 @@ { for(auto& it2 : endings) { - temp.push_back(pair(it.first + it2.first, + temp.push_back(std::pair(it.first + it2.first, it.second + it2.second)); } } @@ -617,7 +613,7 @@ } void -Expander::append(EntList &result, UString const &endings) +Expander::append(EntList &result, UStringView endings) { for(auto& it : result) { @@ -628,7 +624,7 @@ void Expander::append(EntList &result, - pair const &endings) + std::pair const &endings) { for(auto& it : result) { @@ -638,25 +634,25 @@ } void -Expander::setAltValue(UString const &a) +Expander::setAltValue(UStringView a) { alt = a; } void -Expander::setVariantValue(UString const &v) +Expander::setVariantValue(UStringView v) { variant = v; } void -Expander::setVariantLeftValue(UString const &v) +Expander::setVariantLeftValue(UStringView v) { variant_left = v; } void -Expander::setVariantRightValue(UString const &v) 
+Expander::setVariantRightValue(UStringView v) { variant_right = v; } diff -Nru lttoolbox-3.6.6/lttoolbox/expander.h lttoolbox-3.7.1/lttoolbox/expander.h --- lttoolbox-3.6.6/lttoolbox/expander.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/expander.h 2022-11-01 08:36:47.000000000 +0000 @@ -19,14 +19,11 @@ #include -#include #include #include #include -using namespace std; - -typedef list > EntList; +typedef std::vector > EntList; /** * An expander of dictionaries @@ -37,7 +34,7 @@ /** * The libxml2's XML reader */ - xmlTextReaderPtr reader; + xmlTextReaderPtr reader = nullptr; /** * The alt value @@ -78,11 +75,11 @@ /** * Paradigms */ - map paradigm; + std::map paradigm; - map paradigm_lr; + std::map paradigm_lr; - map paradigm_rl; + std::map paradigm_rl; /** * Method to parse an XML Node @@ -110,13 +107,13 @@ * @param name the name of the attribute * @return the value of the attribute */ - UString attrib(UString const &name); + UString attrib(UStringView name); /** * Parse the <p> element * @return a pair of strings, left part and right part of a transduction */ - pair procTransduction(); + std::pair procTransduction(); /** * Parse the <i> element @@ -129,7 +126,7 @@ * @return a pair of strings, whose right part begins with '#' * but are otherwise identical */ - pair procIdentityGroup(); + std::pair procIdentityGroup(); /** * Parse the <par> element @@ -142,7 +139,7 @@ * @param name the name of the node * @param elem the name of the expected node */ - void skip(UString &name, UString const &elem); + void skip(UString &name, UStringView elem); /** * Skip all blank #text nodes before "name" @@ -151,13 +148,13 @@ void skipBlanks(UString &name); - void readString(UString &result, UString const &name); + void readString(UString &result, UStringView name); /** * Force an element to be empty, and check for it * @param name the element */ - void requireEmptyError(UString const &name); + void requireEmptyError(UStringView name); /** * Force an 
attribute to be specified, amd check for it @@ -165,8 +162,7 @@ * @param attrname the name of the attribute * @param elemname the parent of the attribute */ - void requireAttribute(UString const &value, UString const &attrname, - UString const &elemname); + void requireAttribute(UStringView value, UStringView attrname, UStringView elemname); /** * True if all the elements in the current node are blanks @@ -180,8 +176,8 @@ * this method, the result of concatenations. * @param endings the endings to be appended. */ - static void append(list > &result, - list > const &endings); + static void append(EntList &result, + EntList const &endings); /** * Append a list of endings to a list of current transductions. @@ -189,8 +185,7 @@ * this method, the result of concatenations. * @param endings the endings to be appended. */ - static void append(list > &result, - UString const &endings); + static void append(EntList &result, UStringView endings); /** * Append a list of endings to a list of current transductions. @@ -198,8 +193,8 @@ * this method, the result of concatenations. * @param endings the endings to be appended. 
*/ - static void append(list > &result, - pair const &endings); + static void append(EntList &result, + std::pair const &endings); public: /** @@ -215,31 +210,31 @@ /** * Compile dictionary to letter transducers */ - void expand(string const &file, UFILE* output); + void expand(std::string const &file, UFILE* output); /** * Set the alt value to use in compilation * @param a the value */ - void setAltValue(UString const &a); + void setAltValue(UStringView a); /** * Set the variant value to use in expansion * @param v the value */ - void setVariantValue(UString const &v); + void setVariantValue(UStringView v); /** * Set the variant_left value to use in expansion * @param v the value */ - void setVariantLeftValue(UString const &v); + void setVariantLeftValue(UStringView v); /** * Set the variant_right value to use in expansion * @param v the value */ - void setVariantRightValue(UString const &v); + void setVariantRightValue(UStringView v); /** * Set if we are going to keep morpheme boundaries diff -Nru lttoolbox-3.6.6/lttoolbox/file_utils.cc lttoolbox-3.7.1/lttoolbox/file_utils.cc --- lttoolbox-3.6.6/lttoolbox/file_utils.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/file_utils.cc 2022-11-01 08:36:47.000000000 +0000 @@ -21,14 +21,14 @@ #include UFILE* -openOutTextFile(const string& fname) +openOutTextFile(const std::string& fname) { if (fname.empty() || fname == "-") { return u_finit(stdout, NULL, NULL); } else { UFILE* ret = u_fopen(fname.c_str(), "wb", NULL, NULL); if (!ret) { - cerr << "Error: Cannot open file '" << fname << "' for writing." << endl; + std::cerr << "Error: Cannot open file '" << fname << "' for writing." 
<< std::endl; exit(EXIT_FAILURE); } return ret; @@ -36,14 +36,14 @@ } FILE* -openOutBinFile(const string& fname) +openOutBinFile(const std::string& fname) { if (fname.empty() || fname == "-") { return stdout; } else { FILE* ret = fopen(fname.c_str(), "wb"); if (!ret) { - cerr << "Error: Cannot open file '" << fname << "' for writing." << endl; + std::cerr << "Error: Cannot open file '" << fname << "' for writing." << std::endl; exit(EXIT_FAILURE); } return ret; @@ -51,14 +51,14 @@ } FILE* -openInBinFile(const string& fname) +openInBinFile(const std::string& fname) { if (fname.empty() || fname == "-") { return stdin; } else { FILE* ret = fopen(fname.c_str(), "rb"); if (!ret) { - cerr << "Error: Cannot open file '" << fname << "' for reading." << endl; + std::cerr << "Error: Cannot open file '" << fname << "' for reading." << std::endl; exit(EXIT_FAILURE); } return ret; @@ -66,9 +66,9 @@ } void -writeTransducerSet(FILE* output, const UString& letters, +writeTransducerSet(FILE* output, UStringView letters, Alphabet& alpha, - map& trans) + std::map& trans) { fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); uint64_t features = 0; @@ -80,13 +80,21 @@ for (auto& it : trans) { Compression::string_write(it.first, output); it.second.write(output); - cout << it.first << " " << it.second.size(); - cout << " " << it.second.numberOfTransitions() << endl; + std::cout << it.first << " " << it.second.size(); + std::cout << " " << it.second.numberOfTransitions() << std::endl; } } void -readShared(FILE* input, set& letters, Alphabet& alpha) +writeTransducerSet(FILE* output, const std::set& letters, + Alphabet& alpha, + std::map& trans) +{ + writeTransducerSet(output, UString(letters.begin(), letters.end()), alpha, trans); +} + +void +readShared(FILE* input, std::set& letters, Alphabet& alpha) { fpos_t pos; if (fgetpos(input, &pos) == 0) { @@ -111,9 +119,9 @@ } void -readTransducerSet(FILE* input, set& letters, +readTransducerSet(FILE* input, std::set& letters, Alphabet& alpha, - map& 
trans) + std::map& trans) { readShared(input, letters, alpha); @@ -124,9 +132,9 @@ } void -readTransducerSet(FILE* input, set& letters, +readTransducerSet(FILE* input, std::set& letters, Alphabet& alpha, - map& trans) + std::map& trans) { readShared(input, letters, alpha); diff -Nru lttoolbox-3.6.6/lttoolbox/file_utils.h lttoolbox-3.7.1/lttoolbox/file_utils.h --- lttoolbox-3.6.6/lttoolbox/file_utils.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/file_utils.h 2022-11-01 08:36:47.000000000 +0000 @@ -23,18 +23,21 @@ #include -UFILE* openOutTextFile(const string& fname); -FILE* openOutBinFile(const string& fname); -FILE* openInBinFile(const string& fname); +UFILE* openOutTextFile(const std::string& fname); +FILE* openOutBinFile(const std::string& fname); +FILE* openInBinFile(const std::string& fname); -void writeTransducerSet(FILE* output, const UString& letters, +void writeTransducerSet(FILE* output, UStringView letters, Alphabet& alpha, - map& trans); -void readTransducerSet(FILE* input, set& letters, + std::map& trans); +void writeTransducerSet(FILE* output, const std::set& letters, + Alphabet& alpha, + std::map& trans); +void readTransducerSet(FILE* input, std::set& letters, Alphabet& alpha, - map& trans); -void readTransducerSet(FILE* input, set& letters, + std::map& trans); +void readTransducerSet(FILE* input, std::set& letters, Alphabet& alpha, - map& trans); + std::map& trans); #endif // __FILE_UTILS_H__ diff -Nru lttoolbox-3.6.6/lttoolbox/fst_processor.cc lttoolbox-3.7.1/lttoolbox/fst_processor.cc --- lttoolbox-3.6.6/lttoolbox/fst_processor.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/fst_processor.cc 2022-11-01 08:36:47.000000000 +0000 @@ -19,27 +19,13 @@ #include #include #include +#include #include #include #include -using namespace std; - - -UString const FSTProcessor::XML_TEXT_NODE = "#text"_u; -UString const FSTProcessor::XML_COMMENT_NODE = "#comment"_u; -UString const FSTProcessor::XML_IGNORED_CHARS_ELEM = 
"ignored-chars"_u; -UString const FSTProcessor::XML_RESTORE_CHAR_ELEM = "restore-char"_u; -UString const FSTProcessor::XML_RESTORE_CHARS_ELEM = "restore-chars"_u; -UString const FSTProcessor::XML_VALUE_ATTR = "value"_u; -UString const FSTProcessor::XML_CHAR_ELEM = "char"_u; -UString const FSTProcessor::WBLANK_START = "[["_u; -UString const FSTProcessor::WBLANK_END = "]]"_u; -UString const FSTProcessor::WBLANK_FINAL = "[[/]]"_u; - - FSTProcessor::FSTProcessor() { // escaped_chars chars @@ -68,14 +54,14 @@ } void -FSTProcessor::parseICX(string const &file) +FSTProcessor::parseICX(std::string const &file) { if(useIgnoredChars) { reader = xmlReaderForFile(file.c_str(), NULL, 0); if(reader == NULL) { - cerr << "Error: cannot open '" << file << "'." << endl; + std::cerr << "Error: cannot open '" << file << "'." << std::endl; exit(EXIT_FAILURE); } int ret = xmlTextReaderRead(reader); @@ -93,14 +79,14 @@ } void -FSTProcessor::parseRCX(string const &file) +FSTProcessor::parseRCX(std::string const &file) { if(useRestoreChars) { reader = xmlReaderForFile(file.c_str(), NULL, 0); if(reader == NULL) { - cerr << "Error: cannot open '" << file << "'." << endl; + std::cerr << "Error: cannot open '" << file << "'." << std::endl; exit(EXIT_FAILURE); } int ret = xmlTextReaderRead(reader); @@ -134,8 +120,8 @@ } else { - cerr << "Error in ICX file (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid node '<" << name << ">'." << endl; + std::cerr << "Error in ICX file (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid node '<" << name << ">'." << std::endl; exit(EXIT_FAILURE); } } @@ -172,82 +158,18 @@ } else { - cerr << "Error in RCX file (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid node '<" << name << ">'." << endl; + std::cerr << "Error in RCX file (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid node '<" << name << ">'." 
<< std::endl; exit(EXIT_FAILURE); } } -bool -FSTProcessor::wblankPostGen(InputFile& input, UFILE *output) -{ - UString result = WBLANK_START; - UChar32 c = 0; - bool in_content = false; - - while(!input.eof()) - { - c = input.get(); - if(in_content && c == '~') - { - if(result[result.size()-1] == ']') { - // We just saw the end of a wblank, may want to merge - wblankqueue.push(result); - } - else { - // wake-up-mark happened some characters into the wblanked word - write(result, output); - } - return true; - } - else - { - result += c; - } - - if(c == '\\') - { - if (input.eof()) streamError(); - result += input.get(); - } - else if(c == ']') - { - c = input.get(); - result += c; - - if(c == ']') - { - int resultlen = result.size(); - if(result[resultlen-5] == '[' && result[resultlen-4] == '[' && result[resultlen-3] == '/') //ending blank [[/]] - { - write(result, output); - break; - } - else - { - in_content = true; // Assumption: No nested wblanks, always balanced - } - } - } - } - - if(c != ']') - { - streamError(); - } - - return false; -} - int FSTProcessor::readAnalysis(InputFile& input) { if (!input_buffer.isEmpty()) { UChar32 val = input_buffer.next(); - while ((useIgnoredChars || useDefaultIgnoredChars) && ignored_chars.find(val) != ignored_chars.end()) { - val = input_buffer.next(); - } return val; } @@ -263,7 +185,6 @@ while ((useIgnoredChars || useDefaultIgnoredChars) && ignored_chars.find(val) != ignored_chars.end()) { - input_buffer.add(val); val = input.get(); } @@ -373,9 +294,9 @@ val = input.get(); } while(u_isdigit(val)); input.unget(val); - input_buffer.add(alphabet(""_u)); + input_buffer.add(alphabet(u"")); numbers.push_back(ws); - return alphabet(""_u); + return alphabet(u""); } break; @@ -388,68 +309,93 @@ return val; } -int32_t -FSTProcessor::readPostgeneration(InputFile& input, UFILE *output) +bool +FSTProcessor::readTransliterationBlank(InputFile& input) { - if(!input_buffer.isEmpty()) - { - return input_buffer.next(); + UString blank; + 
while (!input.eof()) { + UChar32 c = input.get(); + if (u_isspace(c)) { + blank += c; + } else if (c == '[') { + if (input.peek() == '[') { + break; + } + blank += input.readBlock('[', ']'); + } else { + input.unget(c); + break; + } } - - UChar32 val = input.get(); - int32_t altval = 0; - is_wblank = false; - if(input.eof()) - { - return 0; + if (!blank.empty()) { + blankqueue.push(blank); } + return !blank.empty(); +} - switch(val) - { - case '<': - altval = alphabet(input.readBlock('<', '>')); - input_buffer.add(altval); - return altval; +bool +FSTProcessor::readTransliterationWord(InputFile& input) +{ + if (input.eof() || input.peek() == '\0') { + return false; + } - case '[': - val = input.get(); + if (!readTransliterationBlank(input)) { + blankqueue.push(""_u); + } - if(val == '[') - { - if(collect_wblanks) - { - wblankqueue.push(input.finishWBlank()); - is_wblank = true; - return static_cast(' '); - } - else if(wblankPostGen(input, output)) - { - return static_cast('~'); - } - else - { - is_wblank = true; - return static_cast(' '); + UString wblank; + std::vector word; + if (input.peek() == '[') { + input.get(); + wblank = input.finishWBlank(); + while (!input.eof()) { + if (readTransliterationBlank(input)) { + word.push_back(static_cast(' ')); + if (input.peek() == '[') break; + } else { + UChar32 c = input.get(); + if (c == '[') { + input.unget(c); + break; + } else if (c == '\\') { + word.push_back(static_cast(input.get())); + } else if (c == '<') { + word.push_back(alphabet(input.readBlock('<', '>'))); + } else if (c == '\0') { + input.unget(c); + break; + } else { + word.push_back(static_cast(c)); } } - else - { - input.unget(val); - blankqueue.push(input.readBlock('[', ']')); - - input_buffer.add(static_cast(' ')); - return static_cast(' '); + } + if (input.peek() == '[') { + input.get(); + input.finishWBlank(); + } + } else { + while (!input.eof()) { + UChar32 c = input.get(); + if (u_isspace(c) || c == '[' || c == '\0') { + input.unget(c); + break; + 
} else if (c == '\\') { + word.push_back(static_cast(input.get())); + } else if (c == '<') { + word.push_back(alphabet(input.readBlock('<', '>'))); + } else { + word.push_back(static_cast(c)); } - - case '\\': - val = input.get(); - input_buffer.add(static_cast(val)); - return val; - - default: - input_buffer.add(val); - return val; + } } + if (word.empty()) { + return false; + } + wblankqueue.push_back(wblank); + transliteration_queue.push_back(word); + + return true; } void @@ -583,7 +529,7 @@ return 0x7fffffff; } -pair +std::pair FSTProcessor::readBilingual(InputFile& input, UFILE *output) { UChar32 val = input.get(); @@ -591,7 +537,7 @@ if(input.eof()) { - return pair(symbol, 0x7fffffff); + return std::pair(symbol, 0x7fffffff); } if(outOfWord) @@ -601,7 +547,7 @@ val = input.get(); if(input.eof()) { - return pair(symbol, 0x7fffffff); + return std::pair(symbol, 0x7fffffff); } } else if(val == '\\') @@ -610,14 +556,14 @@ val = input.get(); if(input.eof()) { - return pair(symbol, 0x7fffffff); + return std::pair(symbol, 0x7fffffff); } u_fputc(val,output); skipUntil(input, output, '^'); val = input.get(); if(input.eof()) { - return pair(symbol, 0x7fffffff); + return std::pair(symbol, 0x7fffffff); } } else @@ -627,7 +573,7 @@ val = input.get(); if(input.eof()) { - return pair(symbol, 0x7fffffff); + return std::pair(symbol, 0x7fffffff); } } outOfWord = false; @@ -636,12 +582,12 @@ if(val == '\\') { val = input.get(); - return pair(symbol, val); + return std::pair(symbol, val); } else if(val == '$') { outOfWord = true; - return pair(symbol, static_cast('$')); + return std::pair(symbol, static_cast('$')); } else if(val == '<') { @@ -653,7 +599,7 @@ { symbol = cad; } - return pair(symbol, res); + return std::pair(symbol, res); } else if(val == '[') { @@ -671,7 +617,7 @@ return readBilingual(input, output); } - return pair(symbol, val); + return std::pair(symbol, val); } void @@ -685,65 +631,6 @@ } void -FSTProcessor::flushWblanks(UFILE *output) -{ - 
while(wblankqueue.size() > 0) - { - write(wblankqueue.front(), output); - wblankqueue.pop(); - } -} - -UString -FSTProcessor::combineWblanks() -{ - UString final_wblank; - UString last_wblank; - bool seen_wblank = false; - - while(wblankqueue.size() > 0) - { - if(wblankqueue.front().compare(WBLANK_FINAL) == 0) - { - if(seen_wblank) { - if(final_wblank.empty()) - { - final_wblank += WBLANK_START; - } - else if(final_wblank.size() > 2) - { - final_wblank += "; "_u; - } - - final_wblank += last_wblank.substr(2,last_wblank.size()-4); //add wblank without brackets [[..]] - } - else { - need_end_wblank = true; - } - last_wblank.clear(); - } - else - { - seen_wblank = true; - last_wblank = wblankqueue.front(); - } - wblankqueue.pop(); - } - - if(!last_wblank.empty()) - { - wblankqueue.push(last_wblank); - } - - if(!final_wblank.empty()) - { - final_wblank += WBLANK_END; - need_end_wblank = true; - } - return final_wblank; -} - -void FSTProcessor::calcInitial() { for(auto& it : transducers) { @@ -753,54 +640,41 @@ initial_state.init(&root); } -bool -FSTProcessor::endsWith(UString const &str, UString const &suffix) -{ - if(str.size() < suffix.size()) - { - return false; - } - else - { - return str.substr(str.size()-suffix.size()) == suffix; - } -} - void FSTProcessor::classifyFinals() { for(auto& it : transducers) { - if(endsWith(it.first, "@inconditional"_u)) + if(StringUtils::endswith(it.first, u"@inconditional")) { inconditional.insert(it.second.getFinals().begin(), it.second.getFinals().end()); } - else if(endsWith(it.first, "@standard"_u)) + else if(StringUtils::endswith(it.first, u"@standard")) { standard.insert(it.second.getFinals().begin(), it.second.getFinals().end()); } - else if(endsWith(it.first, "@postblank"_u)) + else if(StringUtils::endswith(it.first, u"@postblank")) { postblank.insert(it.second.getFinals().begin(), it.second.getFinals().end()); } - else if(endsWith(it.first, "@preblank"_u)) + else if(StringUtils::endswith(it.first, u"@preblank")) { 
preblank.insert(it.second.getFinals().begin(), it.second.getFinals().end()); } else { - cerr << "Error: Unsupported transducer type for '"; - cerr << it.first << "'." << endl; + std::cerr << "Error: Unsupported transducer type for '"; + std::cerr << it.first << "'." << std::endl; exit(EXIT_FAILURE); } } } UString -FSTProcessor::filterFinals(const State& state, const UString& casefrom) +FSTProcessor::filterFinals(const State& state, UStringView casefrom) { bool firstupper = false, uppercase = false; if (!dictionaryCase) { @@ -814,7 +688,7 @@ } void -FSTProcessor::writeEscaped(UString const &str, UFILE *output) +FSTProcessor::writeEscaped(UStringView str, UFILE *output) { for(unsigned int i = 0, limit = str.size(); i < limit; i++) { @@ -827,7 +701,7 @@ } size_t -FSTProcessor::writeEscapedPopBlanks(UString const &str, UFILE *output) +FSTProcessor::writeEscapedPopBlanks(UStringView str, UFILE *output) { size_t postpop = 0; for (unsigned int i = 0, limit = str.size(); i < limit; i++) @@ -848,7 +722,7 @@ } void -FSTProcessor::writeEscapedWithTags(UString const &str, UFILE *output) +FSTProcessor::writeEscapedWithTags(UStringView str, UFILE *output) { for(unsigned int i = 0, limit = str.size(); i < limit; i++) { @@ -869,7 +743,7 @@ void -FSTProcessor::printWord(UString const &sf, UString const &lf, UFILE *output) +FSTProcessor::printWord(UStringView sf, UStringView lf, UFILE *output) { u_fputc('^', output); writeEscaped(sf, output); @@ -878,11 +752,11 @@ } void -FSTProcessor::printWordPopBlank(UString const &sf, UString const &lf, UFILE *output) +FSTProcessor::printWordPopBlank(UStringView sf, UStringView lf, UFILE *output) { u_fputc('^', output); size_t postpop = writeEscapedPopBlanks(sf, output); - u_fprintf(output, "%S$", lf.c_str()); + u_fprintf(output, "%.*S$", lf.size(), lf.data()); while (postpop-- && blankqueue.size() > 0) { write(blankqueue.front(), output); @@ -891,13 +765,13 @@ } void -FSTProcessor::printWordBilingual(UString const &sf, UString const &lf, UFILE 
*output) +FSTProcessor::printWordBilingual(UStringView sf, UStringView lf, UFILE *output) { - u_fprintf(output, "^%S%S$", sf.c_str(), lf.c_str()); + u_fprintf(output, "^%.*S%.*S$", sf.size(), sf.data(), lf.size(), lf.data()); } void -FSTProcessor::printUnknownWord(UString const &sf, UFILE *output) +FSTProcessor::printUnknownWord(UStringView sf, UFILE *output) { u_fputc('^', output); writeEscaped(sf, output); @@ -908,7 +782,7 @@ } unsigned int -FSTProcessor::lastBlank(UString const &str) +FSTProcessor::lastBlank(UStringView str) { for(int i = static_cast(str.size())-1; i >= 0; i--) { @@ -922,7 +796,7 @@ } void -FSTProcessor::printSpace(UChar32 const val, UFILE *output) +FSTProcessor::printSpace(UChar32 val, UFILE *output) { if(blankqueue.size() > 0) { @@ -935,7 +809,7 @@ } void -FSTProcessor::printChar(const UChar32 val, UFILE* output) +FSTProcessor::printChar(UChar32 val, UFILE* output) { if (u_isspace(val)) { if (blankqueue.size() > 0) { @@ -955,13 +829,13 @@ } bool -FSTProcessor::isEscaped(UChar32 const c) const +FSTProcessor::isEscaped(UChar32 c) const { return escaped_chars.find(c) != escaped_chars.end(); } bool -FSTProcessor::isAlphabetic(UChar32 const c) const +FSTProcessor::isAlphabetic(UChar32 c) const { return u_isalnum(c) || alphabetic_chars.find(c) != alphabetic_chars.end(); } @@ -1006,7 +880,7 @@ } void -FSTProcessor::initPostgeneration() +FSTProcessor::initTransliteration() { initGeneration(); } @@ -1029,12 +903,12 @@ { UChar val=input_word[i]; - current_state.step_case(val, caseSensitive); + current_state.step_case(val, beCaseSensitive(current_state)); if(current_state.size() > MAX_COMBINATIONS) { - cerr << "Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << "'" << endl; - cerr << " gave up at char " << i << " '" << val << "'." << endl; + std::cerr << "Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << "'" << std::endl; + std::cerr << " gave up at char " << i << " '" << val << "'." 
<< std::endl; UString nullString; return nullString; @@ -1061,30 +935,30 @@ void FSTProcessor::initDecompositionSymbols() { - if((compoundOnlyLSymbol=alphabet("<:co:only-L>"_u)) == 0 - && (compoundOnlyLSymbol=alphabet("<:compound:only-L>"_u)) == 0 - && (compoundOnlyLSymbol=alphabet("<@co:only-L>"_u)) == 0 - && (compoundOnlyLSymbol=alphabet("<@compound:only-L>"_u)) == 0 - && (compoundOnlyLSymbol=alphabet(""_u)) == 0) + if((compoundOnlyLSymbol=alphabet(u"<:co:only-L>")) == 0 + && (compoundOnlyLSymbol=alphabet(u"<:compound:only-L>")) == 0 + && (compoundOnlyLSymbol=alphabet(u"<@co:only-L>")) == 0 + && (compoundOnlyLSymbol=alphabet(u"<@compound:only-L>")) == 0 + && (compoundOnlyLSymbol=alphabet(u"")) == 0) { - cerr << "Warning: Decomposition symbol <:compound:only-L> not found" << endl; + std::cerr << "Warning: Decomposition symbol <:compound:only-L> not found" << std::endl; } else if(!showControlSymbols) { - alphabet.setSymbol(compoundOnlyLSymbol, ""_u); + alphabet.setSymbol(compoundOnlyLSymbol, u""); } - if((compoundRSymbol=alphabet("<:co:R>"_u)) == 0 - && (compoundRSymbol=alphabet("<:compound:R>"_u)) == 0 - && (compoundRSymbol=alphabet("<@co:R>"_u)) == 0 - && (compoundRSymbol=alphabet("<@compound:R>"_u)) == 0 - && (compoundRSymbol=alphabet(""_u)) == 0) + if((compoundRSymbol=alphabet(u"<:co:R>")) == 0 + && (compoundRSymbol=alphabet(u"<:compound:R>")) == 0 + && (compoundRSymbol=alphabet(u"<@co:R>")) == 0 + && (compoundRSymbol=alphabet(u"<@compound:R>")) == 0 + && (compoundRSymbol=alphabet(u"")) == 0) { - cerr << "Warning: Decomposition symbol <:compound:R> not found" << endl; + std::cerr << "Warning: Decomposition symbol <:compound:R> not found" << std::endl; } else if(!showControlSymbols) { - alphabet.setSymbol(compoundRSymbol, ""_u); + alphabet.setSymbol(compoundRSymbol, u""); } } @@ -1113,9 +987,10 @@ UString sf; // surface form UString lf_spcmp; // space compound analysis bool seen_cpL = false; // have we seen a tag so far + size_t last_start = 
input_buffer.getPos(); // position in input_buffer when sf was last cleared size_t last = 0; // position in input_buffer after last analysis size_t last_size = 0; // size of sf at last analysis - map >::iterator rcx_map_ptr; + std::map >::iterator rcx_map_ptr; UChar32 val; do @@ -1192,8 +1067,8 @@ if(useRestoreChars && rcx_map.find(val) != rcx_map.end()) { rcx_map_ptr = rcx_map.find(val); - set tmpset = rcx_map_ptr->second; - if(!u_isupper(val) || caseSensitive) + std::set tmpset = rcx_map_ptr->second; + if(!u_isupper(val) || beCaseSensitive(current_state)) { current_state.step(val, tmpset); } @@ -1212,7 +1087,7 @@ } else { - current_state.step_case(val, caseSensitive); + current_state.step_case(val, beCaseSensitive(current_state)); } if(current_state.size() != 0) @@ -1291,18 +1166,16 @@ } while((val = readAnalysis(input)) && isAlphabetic(val)); - unsigned int limit = firstNotAlpha(sf); - unsigned int size = sf.size(); - limit = (limit == static_cast(UString::npos)?size:limit); - if(limit == 0) + auto limit = firstNotAlpha(sf); + if(limit.i_codepoint == 0) { - input_buffer.back(sf.size()); + input_buffer.setPos(1 + last_start); writeEscaped(sf.substr(0,1), output); } else { - input_buffer.back(1+(size-limit)); - UString unknown_word = sf.substr(0, limit); + input_buffer.setPos(last_start + limit.i_codepoint); + UString unknown_word = sf.substr(0, limit.i_utf16); if(do_decomposition) { UString compound = compoundAnalysis(unknown_word); @@ -1323,18 +1196,16 @@ } else if(lf.empty()) { - unsigned int limit = firstNotAlpha(sf); - unsigned int size = sf.size(); - limit = (limit == static_cast(UString::npos)?size:limit); - if(limit == 0) + auto limit = firstNotAlpha(sf); + if(limit.i_codepoint == 0) { - input_buffer.back(sf.size()); + input_buffer.setPos(1 + last_start); writeEscaped(sf.substr(0,1), output); } else { - input_buffer.back(1+(size-limit)); - UString unknown_word = sf.substr(0, limit); + input_buffer.setPos(last_start + limit.i_codepoint); + UString 
unknown_word = sf.substr(0, limit.i_utf16); if(do_decomposition) { UString compound = compoundAnalysis(unknown_word); @@ -1369,6 +1240,7 @@ current_state = initial_state; lf.clear(); sf.clear(); + last_start = input_buffer.getPos(); last_incond = false; last_postblank = false; last_preblank = false; @@ -1412,42 +1284,6 @@ } void -FSTProcessor::postgeneration_wrapper_null_flush(InputFile& input, UFILE *output) -{ - setNullFlush(false); - while(!input.eof()) - { - postgeneration(input, output); - u_fputc('\0', output); - u_fflush(output); - } -} - -void -FSTProcessor::intergeneration_wrapper_null_flush(InputFile& input, UFILE *output) -{ - setNullFlush(false); - while (!input.eof()) - { - intergeneration(input, output); - u_fputc('\0', output); - u_fflush(output); - } -} - -void -FSTProcessor::transliteration_wrapper_null_flush(InputFile& input, UFILE *output) -{ - setNullFlush(false); - while(!input.eof()) - { - transliteration(input, output); - u_fputc('\0', output); - u_fflush(output); - } -} - -void FSTProcessor::tm_analysis(InputFile& input, UFILE *output) { State current_state = initial_state; @@ -1744,7 +1580,7 @@ alphabet.getSymbol(sf,val); if(current_state.size() > 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !(beCaseSensitive(current_state))) { if(mode == gm_carefulcase) { @@ -1767,426 +1603,211 @@ void FSTProcessor::postgeneration(InputFile& input, UFILE *output) { - if(getNullFlush()) - { - postgeneration_wrapper_null_flush(input, output); - } - - bool skip_mode = true; - collect_wblanks = false; - need_end_wblank = false; - State current_state = initial_state; - UString lf; - UString sf; - int last = 0; - set empty_escaped_chars; - - while(UChar32 val = readPostgeneration(input, output)) - { - if(val == '~') - { - skip_mode = false; - collect_wblanks = true; - } - - if(is_wblank && skip_mode) - { - //do nothing - } - else if(skip_mode) - { - if(u_isspace(val)) - { - if(need_end_wblank) 
- { - write(WBLANK_FINAL, output); - need_end_wblank = false; - } + transliteration_drop_tilde = true; + transliteration(input, output); +} - printSpace(val, output); - } - else - { - if(!need_end_wblank) - { - flushWblanks(output); - } +void +FSTProcessor::intergeneration(InputFile& input, UFILE *output) +{ + transliteration_drop_tilde = false; + transliteration(input, output); +} - if(isEscaped(val)) - { - u_fputc('\\', output); - } - u_fputc(val, output); +void +FSTProcessor::transliteration(InputFile& input, UFILE *output) +{ + size_t start_pos = 0; + size_t cur_word = 0; + size_t cur_pos = 0; + size_t match_pos = 0; + State current_state = initial_state; + UString last_match; + int space_diff = 0; - if(need_end_wblank) - { - write(WBLANK_FINAL, output); - need_end_wblank = false; + bool firstupper = false; + bool uppercase = false; + bool have_first = false; + bool have_second = false; + + while (true) { + if (transliteration_queue.empty()) { + if (!blankqueue.empty()) { + flushBlanks(output); + } + if (!readTransliterationWord(input)) { + flushBlanks(output); + if (input.eof()) { + break; + } else { + u_fputc(input.get(), output); + u_fflush(output); + continue; } } } - else - { - if(is_wblank) - { - continue; - } - - // test for final states - if(current_state.isFinal(all_finals)) - { - bool firstupper = u_isupper(sf[1]); - bool uppercase = sf.size() > 1 && firstupper && u_isupper(sf[2]); - lf = current_state.filterFinals(all_finals, alphabet, - empty_escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - // case of the beggining of the next word - - UString mybuf; - for(size_t i = sf.size(); i > 0; --i) - { - if(!isalpha(sf[i-1])) - { + if (current_state.isFinal(all_finals)) { + last_match = current_state.filterFinals(all_finals, alphabet, + escaped_chars, displayWeightsMode, + 1, maxWeightClasses, + uppercase, firstupper); + while (cur_word > 0) { + if (cur_word == 1) { + if (cur_pos == 0 && 
last_match[last_match.size()-1] == ' ') { + match_pos = transliteration_queue.front().size(); + last_match = last_match.substr(0, last_match.size()-1); break; - } - else - { - mybuf = sf[i-1] + mybuf; - } - } - - if(mybuf.size() > 0) - { - bool myfirstupper = u_isupper(mybuf[0]); - bool myuppercase = mybuf.size() > 1 && u_isupper(mybuf[1]); - - for(size_t i = lf.size(); i > 0; --i) - { - if(!isalpha(lf[i-1])) - { - if(myfirstupper && i != lf.size()) - { - lf[i] = u_toupper(lf[i]); - } - else - { - lf[i] = u_tolower(lf[i]); - } - break; - } - else - { - if(myuppercase) - { - lf[i-1] = u_toupper(lf[i-1]); - } - else - { - lf[i-1] = u_tolower(lf[i-1]); - } - } + } else { + cur_pos += transliteration_queue.front().size() + 1; } } - - last = input_buffer.getPos(); + std::vector word = transliteration_queue.front(); + transliteration_queue.pop_front(); + word.push_back(static_cast(' ')); + word.insert(word.end(), transliteration_queue.front().begin(), + transliteration_queue.front().end()); + transliteration_queue.pop_front(); + transliteration_queue.push_front(word); + UString wblank = wblankqueue.front(); + wblankqueue.pop_front(); + wblank = StringUtils::merge_wblanks(wblank, wblankqueue.front()); + wblankqueue.pop_front(); + wblankqueue.push_front(wblank); + cur_word--; + } + if (cur_word == 0) { + match_pos = cur_pos; } + } - current_state.step_case(val, caseSensitive); - - if(current_state.size() != 0) - { - alphabet.getSymbol(sf, val); + int32_t sym = 0; + bool is_end = false; + if (cur_pos < transliteration_queue[cur_word].size()) { + sym = transliteration_queue[cur_word][cur_pos]; + cur_pos++; + } else { + if (cur_word + 1 == transliteration_queue.size() && + !readTransliterationWord(input)) { + is_end = true; + } else { + sym = static_cast(' '); + cur_word++; + cur_pos = 0; } - else - { - UString final_wblank = combineWblanks(); - write(final_wblank, output); - - if(lf.empty()) - { - unsigned int mark = sf.size(); - unsigned int space_index = sf.size(); - - 
for(unsigned int i = 1, limit = sf.size(); i < limit; i++) - { - if(sf[i] == '~') - { - mark = i; - break; - } - else if(sf[i] == ' ') - { - space_index = i; - } - } - - if(space_index != sf.size()) - { - write(sf.substr(1, space_index-1), output); - - if(need_end_wblank) - { - write(WBLANK_FINAL, output); - need_end_wblank = false; - u_fputc(sf[space_index], output); - flushWblanks(output); - } - else - { - u_fputc(sf[space_index], output); - } - - write(sf.substr(space_index+1, mark-space_index-1), output); - } - else - { - flushWblanks(output); - write(sf.substr(1, mark-1), output); - } + } - if(mark == sf.size()) - { - input_buffer.back(1); - } - else - { - input_buffer.back(sf.size()-mark); - } - } - else - { - write(lf.substr(1,lf.size()-3), output); - input_buffer.setPos(last); - input_buffer.back(2); - val = lf[lf.size()-2]; - if(u_isspace(val)) - { - printSpace(val, output); - } - else - { - if(isEscaped(val)) - { - u_fputc('\\', output); - } - u_fputc(val, output); - } + if (isAlphabetic(sym)) { + if (!have_first) { + have_first = true; + if (u_isupper(sym)) { + firstupper = true; + } else { + firstupper = false; + have_second = true; } - - current_state = initial_state; - lf.clear(); - sf.clear(); - skip_mode = true; - collect_wblanks = false; + } else if (!have_second) { + have_second = true; + uppercase = u_isupper(sym); } } - } - - // print remaining blanks - flushBlanks(output); -} - -void -FSTProcessor::intergeneration(InputFile& input, UFILE *output) -{ - if (getNullFlush()) - { - intergeneration_wrapper_null_flush(input, output); - } - - bool skip_mode = true; - State current_state = initial_state; - UString target; - UString source; - int last = 0; - set empty_escaped_chars; - - while (true) - { - UChar32 val = readPostgeneration(input, output); - if (val == '~') - { - skip_mode = false; - } + current_state.step_case_override(sym, beCaseSensitive(current_state)); - if (skip_mode) - { - if (u_isspace(val)) - { - printSpace(val, output); - } - else 
- { - if(val != '\0') - { - if (isEscaped(val)) - { - u_fputc('\\', output); + if (current_state.size() == 0 || is_end) { + if (last_match.empty()) { + start_pos++; + } else { + std::vector match = alphabet.tokenize(last_match.substr(1)); + last_match.clear(); + std::vector word = transliteration_queue.front(); + transliteration_queue.pop_front(); + size_t i = 0; + for (; i < match.size() && i < match_pos - start_pos; i++) { + if (match[match.size()-i-1] != word[match_pos-i-1]) { + break; } - u_fputc(val, output); } + std::vector new_word; + new_word.insert(new_word.end(), word.begin(), word.begin()+start_pos); + new_word.insert(new_word.end(), match.begin(), match.end()); + new_word.insert(new_word.end(), word.begin()+match_pos, word.end()); + transliteration_queue.push_front(new_word); + int sf_spaces = 0; + int lf_spaces = 0; + for (auto c : word) { + if (c == static_cast(' ')) sf_spaces++; + } + for (auto c : new_word) { + if (c == static_cast(' ')) lf_spaces++; + } + space_diff += (lf_spaces - sf_spaces); + size_t last_start = start_pos; + start_pos = match_pos - i; + if (start_pos == last_start) start_pos++; + cur_pos = start_pos; + cur_word = 0; } - } - else - { - // test for final states - if (current_state.isFinal(all_finals)) - { - bool firstupper = u_isupper(source[1]); - bool uppercase = source.size() > 1 && firstupper && u_isupper(source[2]); - target = current_state.filterFinals(all_finals, alphabet, - empty_escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - - last = input_buffer.getPos(); - } - - if (val != '\0') - { - current_state.step_case(val, caseSensitive); - } - - if (val != '\0' && current_state.size() != 0) - { - alphabet.getSymbol(source, val); - } - else - { - if (target.empty()) // no match - { - if (val == '\0') - { - // flush source - write(source, output); - } - else - { - u_fputc(source[0], output); - - unsigned int mark, limit; - for (mark = 1, limit = source.size(); mark < limit && 
source[mark] != '~' ; mark++) - { - u_fputc(source[mark], output); - } - - if (mark != source.size()) - { - int back = source.size() - mark; - input_buffer.back(back); - } - - if (val == '~') - { - input_buffer.back(1); + if (start_pos >= transliteration_queue.front().size()) { + write(blankqueue.front(), output); + blankqueue.pop(); + bool has_wblank = !wblankqueue.front().empty(); + write(wblankqueue.front(), output); + wblankqueue.pop_front(); + auto word = transliteration_queue.front(); + transliteration_queue.pop_front(); + int space_count = 0; + for (auto c : word) { + if (c == static_cast(' ')) space_count++; + } + int space_out = 0; + UString out; + for (auto c : word) { + if (c == ' ') { + if (space_out + space_diff >= space_count) { + out += ' '; } else { - u_fputc(val, output); - } - } - } - else - { - for(unsigned int i=1; i('~')) { + } else { + if (c > 0 && isEscaped(c)) { + out += '\\'; } - } - - if (val != '\0') - { - input_buffer.setPos(last); - input_buffer.back(1); + alphabet.getSymbol(out, c); } } - - current_state = initial_state; - target.clear(); - source.clear(); - skip_mode = true; - } - } - - if (val == '\0') - { - break; - } - } - - // print remaining blanks - flushBlanks(output); -} - -void -FSTProcessor::transliteration(InputFile& input, UFILE *output) -{ - if(getNullFlush()) - { - transliteration_wrapper_null_flush(input, output); - } - - State current_state = initial_state; - UString lf; - UString sf; - UString last_lf; - int rewind_point = 0; - int last_match = 0; - UChar32 firstchar = 0; - - while(UChar32 val = readPostgeneration(input, output)) { - if (sf.empty()) { - firstchar = val; - rewind_point = input_buffer.getPos(); - } else { - lf = filterFinals(current_state, sf); - if (!lf.empty()) { - last_match = input_buffer.getPos(); - last_lf.swap(lf); - } - } - current_state.step(val); - if (current_state.size() != 0) { - alphabet.getSymbol(sf, val); - } else { - if (last_lf.empty()) { - input_buffer.setPos(rewind_point); - if 
(u_isspace(firstchar)) { - printSpace(firstchar, output); - } else { - if (isEscaped(firstchar)) { - u_fputc('\\', output); + write(out, output); + if (has_wblank) { + write(WBLANK_FINAL, output); + } + while (space_diff < 0) { + if (blankqueue.front() != " "_u) { + write(blankqueue.front(), output); } - u_fputc(firstchar, output); + blankqueue.pop(); + space_diff++; } - } else { - write(last_lf.substr(1), output); - last_lf.clear(); - input_buffer.setPos(last_match); - input_buffer.back(1); + space_diff = 0; + start_pos = 0; } - sf.clear(); + match_pos = 0; + cur_pos = start_pos; + cur_word = 0; + uppercase = false; + firstupper = false; + have_first = false; + have_second = false; current_state = initial_state; } } - // print remaining blanks - flushBlanks(output); } UString -FSTProcessor::biltransfull(UString const &input_word, bool with_delim) +FSTProcessor::biltransfull(UStringView input_word, bool with_delim) { State current_state = initial_state; UString result; @@ -2203,7 +1824,7 @@ if(input_word[start_point] == '*') { - return input_word; + return US(input_word); } if(input_word[start_point] == '=') @@ -2245,7 +1866,7 @@ } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) { current_state.step(val, u_tolower(val)); } @@ -2280,11 +1901,11 @@ // word is not present if(with_delim) { - result = "^@"_u + input_word.substr(1); + result = "^@"_u + US(input_word.substr(1)); } else { - result = "@"_u + input_word; + result = "@"_u + US(input_word); } return result; } @@ -2339,7 +1960,7 @@ UString -FSTProcessor::biltrans(UString const &input_word, bool with_delim) +FSTProcessor::biltrans(UStringView input_word, bool with_delim) { State current_state = initial_state; UString result; @@ -2356,7 +1977,7 @@ if(input_word[start_point] == '*') { - return input_word; + return US(input_word); } if(input_word[start_point] == '=') @@ -2398,7 +2019,7 @@ } 
if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) { current_state.step(val, u_tolower(val)); } @@ -2433,11 +2054,11 @@ // word is not present if(with_delim) { - result = "^@"_u + input_word.substr(1); + result = "^@"_u + US(input_word.substr(1)); } else { - result = "@"_u + input_word; + result = "@"_u + US(input_word); } return result; } @@ -2500,7 +2121,7 @@ } UString -FSTProcessor::compose(UString const &lexforms, UString const &queue) const +FSTProcessor::compose(UStringView lexforms, UStringView queue) const { UString result; result.reserve(lexforms.size() + 2 * queue.size()); @@ -2540,7 +2161,7 @@ outOfWord = false; skipUntil(input, output, '^'); - pair tr; // readBilingual return value, containing: + std::pair tr; // readBilingual return value, containing: int val; // the alphabet value of current symbol, and UString symbol; // the current symbol as a string bool seentags = false; // have we seen any tags at all in the analysis? 
@@ -2656,7 +2277,7 @@ } if(current_state.size() != 0) { - current_state.step_case(val, caseSensitive); + current_state.step_case(val, beCaseSensitive(current_state)); } if(current_state.isFinal(all_finals)) { @@ -2693,8 +2314,8 @@ } } -pair -FSTProcessor::biltransWithQueue(UString const &input_word, bool with_delim) +std::pair +FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim) { State current_state = initial_state; UString result; @@ -2712,7 +2333,7 @@ if(input_word[start_point] == '*') { - return pair(input_word, 0); + return {US(input_word), 0}; } if(input_word[start_point] == '=') @@ -2755,7 +2376,7 @@ } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) { current_state.step(val, u_tolower(val)); } @@ -2790,13 +2411,13 @@ // word is not present if(with_delim) { - result = "^@"_u + input_word.substr(1); + result = "^@"_u + US(input_word.substr(1)); } else { - result = "@"_u + input_word; + result = "@"_u + US(input_word); } - return pair(result, 0); + return std::pair(result, 0); } } } @@ -2809,13 +2430,13 @@ // word is not present if(with_delim) { - result = "^@"_u + input_word.substr(1); + result = "^@"_u + US(input_word.substr(1)); } else { - result = "@"_u + input_word; + result = "@"_u + US(input_word); } - return pair(result, 0); + return {result, 0}; } @@ -2849,7 +2470,7 @@ { result_with_queue += '$'; } - return pair(result_with_queue, queue.size()); + return {result_with_queue, queue.size()}; } else { @@ -2857,12 +2478,12 @@ { result += '$'; } - return pair(result, 0); + return {result, 0}; } } UString -FSTProcessor::biltransWithoutQueue(UString const &input_word, bool with_delim) +FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim) { State current_state = initial_state; UString result; @@ -2878,7 +2499,7 @@ if(input_word[start_point] == '*') { - return input_word; + return 
US(input_word); } if(input_word[start_point] == '=') @@ -2920,7 +2541,7 @@ } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) { current_state.step(val, u_tolower(val)); } @@ -2951,11 +2572,11 @@ // word is not present if(with_delim) { - result = "^@"_u + input_word.substr(1); + result = "^@"_u + US(input_word.substr(1)); } else { - result = "@"_u + input_word; + result = "@"_u + US(input_word); } return result; } @@ -2975,7 +2596,7 @@ { if(initial_state.isFinal(all_finals)) { - cerr << "Error: Invalid dictionary (hint: the left side of an entry is empty)" << endl; + std::cerr << "Error: Invalid dictionary (hint: the left side of an entry is empty)" << std::endl; return false; } else @@ -2984,7 +2605,7 @@ s.step(' '); if(s.size() != 0) { - cerr << "Error: Invalid dictionary (hint: entry beginning with whitespace)" << endl; + std::cerr << "Error: Invalid dictionary (hint: entry beginning with whitespace)" << std::endl; return false; } } @@ -3011,9 +2632,9 @@ if(val == '<') { UString str = input.readBlock('<', '>'); - if(str.substr(0, 9) == ""_u) + while(!StringUtils::endswith(str, u"]]>")) { str.append(input.readBlock('<', '>').substr(1)); } @@ -3047,7 +2668,7 @@ } void -FSTProcessor::printSAOWord(UString const &lf, UFILE *output) +FSTProcessor::printSAOWord(UStringView lf, UFILE *output) { for(unsigned int i = 1, limit = lf.size(); i != limit; i++) { @@ -3123,7 +2744,7 @@ last = input_buffer.getPos(); } - current_state.step_case(val, caseSensitive); + current_state.step_case(val, beCaseSensitive(current_state)); if(current_state.size() != 0) { @@ -3169,18 +2790,16 @@ } while((val = readSAO(input)) && isAlphabetic(val)); - unsigned int limit = firstNotAlpha(sf); - unsigned int size = sf.size(); - limit = (limit == static_cast(UString::npos)?size:limit); - input_buffer.back(1+(size-limit)); + auto limit = firstNotAlpha(sf); + unsigned int size 
= sf.size(); // TODO: change these to character counts + input_buffer.back(1+(size-limit.i_utf16)); u_fprintf(output, "%S", sf.c_str()); } else if(lf.empty()) { - unsigned int limit = firstNotAlpha(sf); - unsigned int size = sf.size(); - limit = (limit == static_cast(UString::npos)?size:limit); - input_buffer.back(1+(size-limit)); + auto limit = firstNotAlpha(sf); + unsigned int size = sf.size(); // TODO: change these to character counts + input_buffer.back(1+(size-limit.i_utf16)); u_fprintf(output, "%S", sf.c_str()); } else @@ -3202,8 +2821,8 @@ flushBlanks(output); } -UString -FSTProcessor::removeTags(UString const &str) +UStringView +FSTProcessor::removeTags(UStringView str) { for(unsigned int i = 0; i < str.size(); i++) { @@ -3218,61 +2837,61 @@ void -FSTProcessor::setBiltransSurfaceForms(bool const value) +FSTProcessor::setBiltransSurfaceForms(bool value) { biltransSurfaceForms = value; } void -FSTProcessor::setCaseSensitiveMode(bool const value) +FSTProcessor::setCaseSensitiveMode(bool value) { caseSensitive = value; } void -FSTProcessor::setDictionaryCaseMode(bool const value) +FSTProcessor::setDictionaryCaseMode(bool value) { dictionaryCase = value; } void -FSTProcessor::setNullFlush(bool const value) +FSTProcessor::setNullFlush(bool value) { nullFlush = value; } void -FSTProcessor::setIgnoredChars(bool const value) +FSTProcessor::setIgnoredChars(bool value) { useIgnoredChars = value; } void -FSTProcessor::setRestoreChars(bool const value) +FSTProcessor::setRestoreChars(bool value) { useRestoreChars = value; } void -FSTProcessor::setUseDefaultIgnoredChars(bool const value) +FSTProcessor::setUseDefaultIgnoredChars(bool value) { useDefaultIgnoredChars = value; } void -FSTProcessor::setDisplayWeightsMode(bool const value) +FSTProcessor::setDisplayWeightsMode(bool value) { displayWeightsMode = value; } void -FSTProcessor::setMaxAnalysesValue(int const value) +FSTProcessor::setMaxAnalysesValue(int value) { maxAnalyses = value; } void 
-FSTProcessor::setMaxWeightClassesValue(int const value) +FSTProcessor::setMaxWeightClassesValue(int value) { maxWeightClasses = value; } @@ -3289,18 +2908,22 @@ return nullFlush; } -size_t -FSTProcessor::firstNotAlpha(UString const &sf) +FSTProcessor::Indices +FSTProcessor::firstNotAlpha(UStringView sf) { - UCharCharacterIterator it = UCharCharacterIterator(sf.c_str(), sf.size()); - size_t i = 0; + FSTProcessor::Indices ix = { 0, 0 }; + UCharCharacterIterator it = UCharCharacterIterator(sf.data(), sf.size()); while (it.hasNext()) { UChar32 c = it.next32PostInc(); if(!isAlphabetic(c)) { - return i; + return ix; + } + ix.i_codepoint++; + ix.i_utf16++; + if(c > UINT16_MAX) { + ix.i_utf16++; } - i += c > UINT16_MAX ? 2 : 1; } - return UString::npos; + return ix; } diff -Nru lttoolbox-3.6.6/lttoolbox/fst_processor.h lttoolbox-3.7.1/lttoolbox/fst_processor.h --- lttoolbox-3.6.6/lttoolbox/fst_processor.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/fst_processor.h 2022-11-01 08:36:47.000000000 +0000 @@ -28,14 +28,13 @@ #include #include +#include #include #include #include #include #include -using namespace std; - /** * Kind of output of the generator module */ @@ -58,12 +57,7 @@ /** * Transducers in FSTP */ - map transducers; - - /** - * Current state of lexical analysis - */ - State current_state; + std::map transducers; /** * Initial state of every token @@ -78,57 +72,59 @@ /** * The final states of inconditional sections in the dictionaries */ - map inconditional; + std::map inconditional; /** * The final states of standard sections in the dictionaries */ - map standard; + std::map standard; /** * The final states of postblank sections in the dictionaries */ - map postblank; + std::map postblank; /** * The final states of preblank sections in the dictionaries */ - map preblank; + std::map preblank; /** * Merge of 'inconditional', 'standard', 'postblank' and 'preblank' sets */ - map all_finals; + std::map all_finals; /** * Queue of blanks, used in 
reading methods */ - queue blankqueue; + std::queue blankqueue; /** * Queue of wordbound blanks, used in reading methods */ - queue wblankqueue; + std::deque wblankqueue; + + std::deque> transliteration_queue; /** * Set of characters being considered alphabetics */ - set alphabetic_chars; + std::set alphabetic_chars; /** * Set of characters to escape with a backslash */ - set escaped_chars; + std::set escaped_chars; /** * Set of characters to ignore */ - set ignored_chars; + std::set ignored_chars; /** * Mapping of characters for simplistic diacritic restoration specified in RCX files */ - map > rcx_map; + std::map > rcx_map; /** * Original char being restored @@ -234,20 +230,7 @@ */ int maxAnalyses = INT_MAX; - /** - * True if a wblank block ([[..]]xyz[[/]]) was just read - */ - bool is_wblank; - - /** - * True if skip_mode is false and need to collect wblanks - */ - bool collect_wblanks; - - /** - * True if a wblank has been processed for postgen and we need an ending wblank - */ - bool need_end_wblank; + bool transliteration_drop_tilde = false; /** * Output no more than 'N' best weight classes @@ -260,14 +243,6 @@ void streamError(); /** - * Reads a wordbound blank (opening blank to closing blank) from the stream input -> [[...]]xyz[[/]] - * @param input the stream being read - * @param output the stream to write on - * @return true if the word enclosed by the wordbound blank has a ~ for postgeneration activation - */ - bool wblankPostGen(InputFile& input, UFILE *output); - - /** * Returns true if the character code is identified as alphabetic * @param c the code provided by the user * @return true if it's alphabetic @@ -296,13 +271,8 @@ */ int readDecomposition(InputFile& input, UFILE *output); - /** - * Read text from stream (postgeneration version) - * @param input the stream to read - * @param output the stream to write on - * @return the next symbol in the stream - */ - int readPostgeneration(InputFile& input, UFILE *output); + bool 
readTransliterationBlank(InputFile& input); + bool readTransliterationWord(InputFile& input); /** * Read text from stream (generation version) @@ -318,7 +288,7 @@ * @param output the stream to write on * @return the queue of 0-symbols, and the next symbol in the stream */ - pair readBilingual(InputFile& input, UFILE *output); + std::pair readBilingual(InputFile& input, UFILE *output); /** * Read text from stream (SAO version) @@ -334,26 +304,6 @@ void flushBlanks(UFILE *output); /** - * Flush all the wordbound blanks remaining in the current process - * @param output stream to write blanks - */ - void flushWblanks(UFILE *output); - - /** - * Combine wordbound blanks in the queue and return them. - * - * May pop from 'wblankqueue' and set 'need_end_wblank' to true. - * - * If 'wblankqueue' (see which) is empty, we get an empty string, - * otherwise we return a semicolon-separated combination of opening - * wblanks in the queue. If there is only a closing wblank, we just - * set need_end_wblank. - * - * @return final wblank string - */ - UString combineWblanks(); - - /** * Calculate the initial state of parsing */ void calcInitial(); @@ -367,14 +317,14 @@ * Shortcut for filtering on all final states with current settings * Assumes that casefrom is non-empty */ - UString filterFinals(const State& state, const UString& casefrom); + UString filterFinals(const State& state, UStringView casefrom); /** * Write a string to an output stream, * @param str the string to write, escaping characters * @param output the stream to write in */ - void writeEscaped(UString const &str, UFILE *output); + void writeEscaped(UStringView str, UFILE *output); /** * Write a string to an output stream. 
@@ -385,7 +335,7 @@ * @param output the stream to write in * @return how many blanks to pop and print after printing lu */ - size_t writeEscapedPopBlanks(UString const &str, UFILE *output); + size_t writeEscapedPopBlanks(UStringView str, UFILE *output); /** * Write a string to an output stream, escaping all escapable characters @@ -393,16 +343,7 @@ * @param str the string to write, escaping characters * @param output the stream to write in */ - void writeEscapedWithTags(UString const &str, UFILE *output); - - - /** - * Checks if an string ends with a particular suffix - * @param str the string to test - * @param the searched suffix - * @returns true if 'str' has the suffix 'suffix' - */ - static bool endsWith(UString const &str, UString const &suffix); + void writeEscapedWithTags(UStringView str, UFILE *output); /** * Prints a word @@ -410,7 +351,7 @@ * @param lf lexical form of the word * @param output stream where the word is written */ - void printWord(UString const &sf, UString const &lf, UFILE *output); + void printWord(UStringView sf, UStringView lf, UFILE *output); /** * Prints a word. 
@@ -420,7 +361,7 @@ * @param lf lexical form of the word * @param output stream where the word is written */ - void printWordPopBlank(UString const &sf, UString const &lf, UFILE *output); + void printWordPopBlank(UStringView sf, UStringView lf, UFILE *output); /** * Prints a word (Bilingual version) @@ -428,7 +369,7 @@ * @param lf lexical form of the word * @param output stream where the word is written */ - void printWordBilingual(UString const &sf, UString const &lf, UFILE *output); + void printWordBilingual(UStringView sf, UStringView lf, UFILE *output); /** @@ -436,21 +377,21 @@ * @param lf lexical form * @param output stream where the word is written */ - void printSAOWord(UString const &lf, UFILE *output); + void printSAOWord(UStringView lf, UFILE *output); /** * Prints an unknown word * @param sf surface form of the word * @param output stream where the word is written */ - void printUnknownWord(UString const &sf, UFILE *output); + void printUnknownWord(UStringView sf, UFILE *output); void initDecompositionSymbols(); - vector numbers; + std::vector numbers; int readTMAnalysis(InputFile& input); - unsigned int lastBlank(UString const &str); + unsigned int lastBlank(UStringView str); /** * Print one blankqueue item if there is one, or a given "space" value. 
@@ -458,28 +399,37 @@ * @param val the space character to use if no blank queue * @param output stream where the word is written */ - void printSpace(UChar32 const val, UFILE *output); + void printSpace(UChar32 val, UFILE *output); /** * Print one possibly escaped character * if it's a space and the blank queue is non-empty, * pop the first blank and print that instead */ - void printChar(const UChar32 val, UFILE* output); + void printChar(UChar32 val, UFILE* output); - void skipUntil(InputFile& input, UFILE *output, UChar32 const character); - static UString removeTags(UString const &str); + void skipUntil(InputFile& input, UFILE *output, UChar32 character); + static UStringView removeTags(UStringView str); UString compoundAnalysis(UString str); - size_t firstNotAlpha(UString const &sf); + + struct Indices { + size_t i_codepoint; + size_t i_utf16; // always >= i_codepoint since some codepoints take up 2 UTF-16's + }; + + /* + * Iterates through unicode characters, returns a Unicode character + * index and UTF-16 string index of first non-alphabetic character, + * or size of the string (in characters, string size) + * + * @return index of first non-alpha char, or string size, as a tuple of number of characters and index in string + */ + Indices firstNotAlpha(UStringView sf); void analysis_wrapper_null_flush(InputFile& input, UFILE *output); void bilingual_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown); void generation_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode); - void postgeneration_wrapper_null_flush(InputFile& input, UFILE *output); - void intergeneration_wrapper_null_flush(InputFile& input, UFILE *output); - void transliteration_wrapper_null_flush(InputFile& input, UFILE *output); - - UString compose(UString const &lexforms, UString const &queue) const; + UString compose(UStringView lexforms, UStringView queue) const; void procNodeICX(); void procNodeRCX(); @@ -488,21 +438,32 @@ bool 
isLastBlankTM = false; xmlTextReaderPtr reader; + + static constexpr size_t max_case_insensitive_state_size = 65536; + /* + * Including lowercased versions for every character can potentially create very large states + * (See https://github.com/apertium/lttoolbox/issues/167 ). As a sanity-check we don't do + * case-insensitive matching if the state size exceeds max_case_insensitive_state_size. + * + * @return running with --case-sensitive or state size exceeds max + */ + bool beCaseSensitive(const State& state) { + return caseSensitive || state.size() >= max_case_insensitive_state_size; + } + public: /* * String constants */ - static UString const XML_TEXT_NODE; - static UString const XML_COMMENT_NODE; - static UString const XML_IGNORED_CHARS_ELEM; - static UString const XML_RESTORE_CHAR_ELEM; - static UString const XML_RESTORE_CHARS_ELEM; - static UString const XML_VALUE_ATTR; - static UString const XML_CHAR_ELEM; - static UString const WBLANK_START; - static UString const WBLANK_END; - static UString const WBLANK_FINAL; + static constexpr UStringView XML_TEXT_NODE = u"#text"; + static constexpr UStringView XML_COMMENT_NODE = u"#comment"; + static constexpr UStringView XML_IGNORED_CHARS_ELEM = u"ignored-chars"; + static constexpr UStringView XML_RESTORE_CHAR_ELEM = u"restore-char"; + static constexpr UStringView XML_RESTORE_CHARS_ELEM = u"restore-chars"; + static constexpr UStringView XML_VALUE_ATTR = u"value"; + static constexpr UStringView XML_CHAR_ELEM = u"char"; + static constexpr UStringView WBLANK_FINAL = u"[[/]]"; FSTProcessor(); @@ -510,7 +471,8 @@ void initTMAnalysis(); void initSAO(){initAnalysis();}; void initGeneration(); - void initPostgeneration(); + void initPostgeneration(){initTransliteration();}; + void initTransliteration(); void initBiltrans(); void initDecomposition(); @@ -520,29 +482,29 @@ void postgeneration(InputFile& input, UFILE *output); void intergeneration(InputFile& input, UFILE *output); void transliteration(InputFile& input, UFILE 
*output); - UString biltrans(UString const &input_word, bool with_delim = true); - UString biltransfull(UString const &input_word, bool with_delim = true); + UString biltrans(UStringView input_word, bool with_delim = true); + UString biltransfull(UStringView input_word, bool with_delim = true); void bilingual(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown); - pair biltransWithQueue(UString const &input_word, bool with_delim = true); - UString biltransWithoutQueue(UString const &input_word, bool with_delim = true); + std::pair biltransWithQueue(UStringView input_word, bool with_delim = true); + UString biltransWithoutQueue(UStringView input_word, bool with_delim = true); void SAO(InputFile& input, UFILE *output); - void parseICX(string const &file); - void parseRCX(string const &file); + void parseICX(std::string const &file); + void parseRCX(std::string const &file); void load(FILE *input); bool valid() const; - void setCaseSensitiveMode(bool const value); - void setDictionaryCaseMode(bool const value); - void setBiltransSurfaceForms(bool const value); - void setIgnoredChars(bool const value); - void setRestoreChars(bool const value); - void setNullFlush(bool const value); - void setUseDefaultIgnoredChars(bool const value); - void setDisplayWeightsMode(bool const value); - void setMaxAnalysesValue(int const value); - void setMaxWeightClassesValue(int const value); + void setCaseSensitiveMode(bool value); + void setDictionaryCaseMode(bool value); + void setBiltransSurfaceForms(bool value); + void setIgnoredChars(bool value); + void setRestoreChars(bool value); + void setNullFlush(bool value); + void setUseDefaultIgnoredChars(bool value); + void setDisplayWeightsMode(bool value); + void setMaxAnalysesValue(int value); + void setMaxWeightClassesValue(int value); bool getNullFlush(); bool getDecompoundingMode(); }; diff -Nru lttoolbox-3.6.6/lttoolbox/lsx-comp.1 lttoolbox-3.7.1/lttoolbox/lsx-comp.1 --- lttoolbox-3.6.6/lttoolbox/lsx-comp.1 1970-01-01 
00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lsx-comp.1 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,101 @@ +.Dd March 8, 2006 +.Dt LT-COMP 1 +.Os Apertium +.Sh NAME +.Nm lt-comp +.Nd augmented letter transducer compiler for Apertium +.Sh SYNOPSIS +.Nm lt-comp +.Op Fl a | v | l | r | m | h +.Cm lr | rl +.Ar dictionary_file +.Ar output_file +.Op Ar acx_file +.Sh DESCRIPTION +.Nm lt-comp +is the application responsible for compiling dictionaries used by +.Xr lt-proc 1 +in Apertium into a compact and efficient representation +(a class of finite-state transducers called augmented letter transducers). +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl a , Fl Fl alt +Sets the value of the +.Sy alt +attribute to use in compilation. +.Pp +Note that if no value is set, all entries containing an \fIalt\fR +attribute are omitted. +.It Fl v , Fl Fl var +Sets the value of the +.Sy v +attribute to use in compilation. +This should only be used with monodixes; for bidixes, see +.Fl l +and +.Fl r . +.Pp +Note that if no value is set, all entries containing a +.Sy v +attribute are considered to be +.Em left-to-right . +.It Fl l , Fl Fl var-left +Sets the value of the +.Sy vl +attribute for use in compilation of bidixes. +.Dq Left +here refers to the side of the dictionary, so this option is only valid in +.Cm rl +mode. +.It Fl r , Fl Fl var-right +Sets the value of the +.Sy vr +attribute for use in compilation of bidixes. +.Dq Right +here refers to the side of the dictionary, so this option is only valid in +.Cm lr +mode. +.It Fl m , Fl Fl keep-boundaries +Keep any morpheme boundaries defined by the '<m/>' symbol +.It Fl H , Fl Fl hfst +expect HFST symbols +.It Fl S , Fl Fl no-split +don't attempt to split into word and punctuation transducers +.It Fl j , Fl Fl jobs +Parallelise minimisation by using one cpu core per section. By +default, this also creates a new section after 50.000 entries.
You can +override this number by setting the environment variable +LT_MAX_SECTION_ENTRIES to some number. If set to 0, sections are never +split (but kept exactly as in the dix file). You can also set the +environment variable LT_JOBS=true if you always want parallel +minimisation even if lt-comp was called without this option. +.It Fl h , Fl Fl help +Prints a short help message. +.It Cm lr +The resulting transducer will process dictionary entries +.Em left-to-right . +.It Cm rl +The resulting transducer will process dictionary entries +.Em right-to-left . +.El +.Sh FILES +.Bl -tag -width Ds +.It Ar dictionary_file +The input dictionary. +.It Ar output_file +The compiled dictionary (a finite state transducer). +.It Ar acx_file +Optional XML file of equivalent characters in monodices. +.El +.Sh SEE ALSO +.Xr apertium 1 , +.Xr apertium-tagger 1 , +.Xr lt-expand 1 , +.Xr lt-proc 1 +.Sh COPYRIGHT +Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. +You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . +.Sh BUGS +Many... lurking in the dark and waiting for you! diff -Nru lttoolbox-3.6.6/lttoolbox/lt_append.cc lttoolbox-3.7.1/lttoolbox/lt_append.cc --- lttoolbox-3.6.6/lttoolbox/lt_append.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_append.cc 2022-11-01 08:36:47.000000000 +0000 @@ -15,121 +15,33 @@ * along with this program; if not, see . 
*/ #include -#include #include - -#include +#include #include - -#include #include -#include -#include -#include -#include - -#ifdef _MSC_VER -#include -#include -#endif - -using namespace std; - -void endProgram(char *name) -{ - if(name != NULL) - { - cout << basename(name) << " v" << PACKAGE_VERSION <<": add sections to a compiled transducer" << endl; - cout << "USAGE: " << basename(name) << " [-ksh] bin_file1 bin_file2 output_file" << endl; - cout << " -k, --keep: in case of section name conflicts, keep the one from the first transducer" << endl; - cout << " -s, --single: treat input transducers as one-sided" << endl; - cout << " -h, --help: print this message and exit" << endl; - } - exit(EXIT_FAILURE); -} - int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); - - bool pairs = true; - bool keep = false; - -#ifdef _MSC_VER - _setmode(_fileno(output), _O_U8TEXT); -#endif - -#if HAVE_GETOPT_LONG - int option_index=0; -#endif - - while (true) { -#if HAVE_GETOPT_LONG - static struct option long_options[] = - { - {"keep", no_argument, 0, 'k'}, - {"single", no_argument, 0, 's'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; - - int cnt=getopt_long(argc, argv, "ksh", long_options, &option_index); -#else - int cnt=getopt(argc, argv, "ksh"); -#endif - if (cnt==-1) - break; - - switch (cnt) - { - case 'k': - keep = true; - break; - - case 's': - pairs = false; - break; - - case 'h': - default: - endProgram(argv[0]); - break; - } - } - - string infile1; - string infile2; - string outfile; - switch(argc - optind) - { - case 1: - infile1 = argv[argc-1]; - break; - - case 2: - infile1 = argv[argc-2]; - infile2 = argv[argc-1]; - break; - - case 3: - infile1 = argv[argc-3]; - infile2 = argv[argc-2]; - outfile = argv[argc-1]; - break; - - default: - endProgram(argv[0]); - break; - } - - FILE* input1 = openInBinFile(infile1); - FILE* input2 = openInBinFile(infile2); - FILE* output = openOutBinFile(outfile); + CLI cli("add sections to a compiled transducer", 
PACKAGE_VERSION); + cli.add_bool_arg('k', "keep", "in case of section name conflicts, keep the one from the first transducer"); + cli.add_bool_arg('s', "single", "treat input transducers as one-sided"); + cli.add_bool_arg('h', "help", "print this message and exit"); + cli.add_file_arg("bin_file1", false); + cli.add_file_arg("bin_file2"); + cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); + + bool pairs = !cli.get_bools()["single"]; + bool keep = cli.get_bools()["keep"]; + + FILE* input1 = openInBinFile(cli.get_files()[0]); + FILE* input2 = openInBinFile(cli.get_files()[1]); + FILE* output = openOutBinFile(cli.get_files()[2]); Alphabet alpha1, alpha2; - set chars1, chars2; - map trans1, trans2; + std::set chars1, chars2; + std::map trans1, trans2; readTransducerSet(input1, chars1, alpha1, trans1); readTransducerSet(input2, chars2, alpha2, trans2); @@ -144,7 +56,7 @@ if (keep) { continue; } else { - cerr << "WARNING: section '" << it.first << "' appears in both transducers and will be overwritten!" << endl; + std::cerr << "WARNING: section '" << it.first << "' appears in both transducers and will be overwritten!" << std::endl; } } it.second.updateAlphabet(alpha2, alpha1, pairs); diff -Nru lttoolbox-3.6.6/lttoolbox/lt_apply_acx.cc lttoolbox-3.7.1/lttoolbox/lt_apply_acx.cc --- lttoolbox-3.6.6/lttoolbox/lt_apply_acx.cc 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_apply_acx.cc 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + LtLocale::tryToSetLocale(); + CLI cli("apply an ACX file to a compiled transducer", PACKAGE_VERSION); + cli.add_file_arg("input_file", false); + cli.add_file_arg("acx_file"); + cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); + + FILE* input = openInBinFile(cli.get_files()[0]); + auto acx = readACX(cli.get_files()[1].c_str()); + FILE* output = openOutBinFile(cli.get_files()[2]); + + Alphabet alpha; + std::set letters; + std::map trans; + readTransducerSet(input, letters, alpha, trans); + + for (auto& it : trans) { + it.second.applyACX(alpha, acx); + } + + writeTransducerSet(output, letters, alpha, trans); + + fclose(input); + fclose(output); + return 0; +} diff -Nru lttoolbox-3.6.6/lttoolbox/lt_comp.cc lttoolbox-3.7.1/lttoolbox/lt_comp.cc --- lttoolbox-3.6.6/lttoolbox/lt_comp.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_comp.cc 2022-11-01 08:36:47.000000000 +0000 @@ -17,14 +17,10 @@ #include #include #include +#include +#include -#include #include -#include -#include -#include - -using namespace std; /* * Error function that does nothing so that when we fallback from @@ -36,160 +32,73 @@ return; } -void endProgram(char *name) -{ - if(name != NULL) - { - cout << basename(name) << " v" << PACKAGE_VERSION <<": build a letter transducer from a dictionary" << endl; - cout << "USAGE: " << basename(name) << " [-hmvalrHSj] lr | rl dictionary_file output_file [acx_file]" << endl; -#if HAVE_GETOPT_LONG - cout << " -m, --keep-boundaries: keep morpheme boundaries" << endl; - cout << " -v, --var: set language variant" << endl; - cout << " -a, --alt: set alternative (monodix)" << endl; - cout << " -l, --var-left: set left language variant (bidix)" << endl; - cout << " -r, 
--var-right: set right language variant (bidix)" << endl; - cout << " -H, --hfst: expect HFST symbols" << endl; - cout << " -S, --no-split: don't attempt to split into word and punctuation transducers" << endl; - cout << " -j, --jobs: use one cpu core per section when minimising, new section after 50k entries" << endl; -#else - cout << " -m: keep morpheme boundaries" << endl; - cout << " -v: set language variant" << endl; - cout << " -a: set alternative (monodix)" << endl; - cout << " -l: set left language variant (bidix)" << endl; - cout << " -r: set right language variant (bidix)" << endl; - cout << " -H: expect HFST symbols" << endl; - cout << " -S: don't attempt to split into word and punctuation transducers" << endl; - cout << " -j: use one cpu core per section when minimising, new section after 50k entries" << endl; -#endif - cout << "Modes:" << endl; - cout << " lr: left-to-right compilation" << endl; - cout << " rl: right-to-left compilation" << endl; - } - exit(EXIT_FAILURE); -} - - int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); + CLI cli("build a letter transducer from a dictionary", PACKAGE_VERSION); + cli.add_bool_arg('d', "debug", "insert line numbers before each entry"); + cli.add_bool_arg('m', "keep-boundaries", "keep morpheme boundaries"); + cli.add_str_arg('v', "var", "set language variant", "VAR"); + cli.add_str_arg('a', "alt", "set alternative (monodix)", "ALT"); + cli.add_str_arg('l', "var-left", "set left language variant (bidix)", "VAR"); + cli.add_str_arg('r', "var-right", "set right language variant (bidix)", "VAR"); + cli.add_bool_arg('H', "hfst", "expect HFST symbols"); + cli.add_bool_arg('S', "no-split", "don't attempt to split into word and punctuation sections"); + cli.add_bool_arg('j', "jobs", "use one cpu core per section when minimising, new section after 50k entries"); + cli.add_bool_arg('V', "verbose", "compile verbosely"); + cli.add_bool_arg('h', "help", "print this message and exit"); + cli.add_file_arg("lr | rl | 
u", false); + cli.add_file_arg("dictionary_file", false); + cli.add_file_arg("output_file", false); + cli.add_file_arg("acx_file", true); + cli.parse_args(argc, argv); char ttype = 'x'; Compiler c; AttCompiler a; - c.setKeepBoundaries(false); - c.setVerbose(false); - -#if HAVE_GETOPT_LONG - int option_index=0; -#endif - - string vl; - string vr; - - while (true) { -#if HAVE_GETOPT_LONG - static struct option long_options[] = - { - {"alt", required_argument, 0, 'a'}, - {"var", required_argument, 0, 'v'}, - {"var-left", required_argument, 0, 'l'}, - {"var-right", required_argument, 0, 'r'}, - {"keep-boundaries", no_argument, 0, 'm'}, - {"hfst", no_argument, 0, 'H'}, - {"no-split", no_argument, 0, 'S'}, - {"help", no_argument, 0, 'h'}, - {"verbose", no_argument, 0, 'V'}, - {"jobs", no_argument, 0, 'j'}, - {0, 0, 0, 0} - }; - - int cnt=getopt_long(argc, argv, "a:v:l:r:mHShVj", long_options, &option_index); -#else - int cnt=getopt(argc, argv, "a:v:l:r:mHShV"); -#endif - if (cnt==-1) - break; - - switch (cnt) - { - case 'a': - c.setAltValue(to_ustring(optarg)); - break; - - case 'v': - c.setVariantValue(to_ustring(optarg)); - break; - - case 'l': - vl = optarg; - c.setVariantLeftValue(to_ustring(optarg)); - break; - - case 'r': - vr = optarg; - c.setVariantRightValue(to_ustring(optarg)); - break; - - case 'm': - c.setKeepBoundaries(true); - break; - - case 'H': - a.setHfstSymbols(true); - break; - - case 'S': - a.setSplitting(false); - break; - case 'j': - c.setJobs(true); - c.setMaxSectionEntries(50000); - break; + bool have_vl = false; + bool have_vr = false; + auto args = cli.get_strs(); + if (args.find("var") != args.end()) { + c.setVariantValue(to_ustring(args["var"][0].c_str())); + } + if (args.find("alt") != args.end()) { + c.setAltValue(to_ustring(args["alt"][0].c_str())); + } + if (args.find("var-left") != args.end()) { + have_vl = true; + c.setVariantLeftValue(to_ustring(args["var-left"][0].c_str())); + } + if (args.find("var-right") != args.end()) { + have_vr 
= true; + c.setVariantRightValue(to_ustring(args["var-right"][0].c_str())); + } - case 'V': - c.setVerbose(true); - break; + c.setEntryDebugging(cli.get_bools()["debug"]); + c.setKeepBoundaries(cli.get_bools()["keep-boundaries"]); + c.setVerbose(cli.get_bools()["verbose"]); - case 'h': - default: - endProgram(argv[0]); - break; - } - } + a.setHfstSymbols(cli.get_bools()["hfst"]); + a.setSplitting(!cli.get_bools()["no-split"]); - if(std::getenv("LT_JOBS")) { + auto LT_JOBS = std::getenv("LT_JOBS"); + if(cli.get_bools()["jobs"] || (LT_JOBS != NULL && LT_JOBS[0] != 'n')) { c.setJobs(true); c.setMaxSectionEntries(50000); } + else { + c.setJobs(false); + c.setMaxSectionEntries(0); + } if(const char* max_section_entries = std::getenv("LT_MAX_SECTION_ENTRIES")) { - c.setMaxSectionEntries(stol(max_section_entries)); + c.setMaxSectionEntries(std::stol(max_section_entries)); } - string opc; - string infile; - string outfile; - string acxfile; - - switch(argc - optind + 1) - { - case 5: - opc = argv[argc-4]; - infile = argv[argc-3]; - outfile = argv[argc-2]; - acxfile = argv[argc-1]; - break; - - case 4: - opc = argv[argc-3]; - infile = argv[argc-2]; - outfile = argv[argc-1]; - break; - - default: - endProgram(argv[0]); - break; - } + std::string opc = cli.get_files()[0]; + std::string infile = cli.get_files()[1]; + std::string outfile = cli.get_files()[2]; + std::string acxfile = cli.get_files()[3]; xmlTextReaderPtr reader; reader = xmlReaderForFile(infile.c_str(), NULL, 0); @@ -207,7 +116,7 @@ } else { - cerr << "Error: Cannot not open file '" << infile << "'." << endl << endl; + std::cerr << "Error: Cannot not open file '" << infile << "'." 
<< std::endl << std::endl; exit(EXIT_FAILURE); } initGenericErrorDefaultFunc(NULL); @@ -215,10 +124,9 @@ if(opc == "lr") { - if(vr == "" && vl != "") - { - cout << "Error: -l specified, but mode is lr" << endl; - endProgram(argv[0]); + if (have_vl) { + std::cout << "Error: -l specified, but mode is lr" << std::endl; + cli.print_usage(); } if(ttype == 'a') { @@ -226,7 +134,7 @@ } else { - if(acxfile != "") + if(!acxfile.empty()) { c.parseACX(acxfile, Compiler::COMPILER_RESTRICTION_LR_VAL); } @@ -235,10 +143,9 @@ } else if(opc == "rl") { - if(vl == "" && vr != "") - { - cout << "Error: -r specified, but mode is rl" << endl; - endProgram(argv[0]); + if (have_vr) { + std::cout << "Error: -r specified, but mode is rl" << std::endl; + cli.print_usage(); } if(ttype == 'a') { @@ -249,17 +156,19 @@ c.parse(infile, Compiler::COMPILER_RESTRICTION_RL_VAL); } } + else if (opc == "u") { + if (ttype == 'a') { + a.parse(infile, false); + } else { + c.parse(infile, Compiler::COMPILER_RESTRICTION_U_VAL); + } + } else { - endProgram(argv[0]); + cli.print_usage(); } - FILE *output = fopen(outfile.c_str(), "wb"); - if(!output) - { - cerr << "Error: Cannot open file '" << outfile << "'." << endl; - exit(EXIT_FAILURE); - } + FILE* output = openOutBinFile(outfile); if(ttype == 'a') { a.write(output); diff -Nru lttoolbox-3.6.6/lttoolbox/lt-compose.1 lttoolbox-3.7.1/lttoolbox/lt-compose.1 --- lttoolbox-3.6.6/lttoolbox/lt-compose.1 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt-compose.1 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,60 @@ +.Dd September 25, 2022 +.Dt LT-COMPOSE 1 +.Os Apertium +.Sh NAME +.Nm lt-compose +.Nd compiled dictionary composition for Apertium +.Sh SYNOPSIS +.Nm lt-compose +.Ar transducer1_binary +.Ar transducer2_binary +.Ar composed_binary +.Sh DESCRIPTION +.Nm lt-compose +is the application responsible for composing two compiled +dictionaries, matching the output-side of transducer1 with the +input-side of transducer2. 
By default, matches are anchored to +initial/final states, so the transducer2 has to match full paths (in +regex terms, transducer2 is implicitly surrounded by ^ and $). But +there is also support for letting transducer2 match sub-paths of +transducer1 (in which matches become optional, making the composition +a superset of transducer1). Matching sub-paths means that transducer2 +can start matching in the midst of paths of transducer2 (in regex +terms, transducer2 is implicitly surrounded in .* on both sides). +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl i , Fl Fl inverted +Apply transducer2 to the input-side (left) of transducer1 instead of +the output-side. You would do this when altering the forms of an +analyser. +.It Fl a , Fl Fl anywhere +Allow transducer2 to match sub-paths instead of requiring matching +initial/final states. Matches then become optional. +.It Fl j , Fl Fl jobs +Parallelise composition by using one cpu core per section of +transducer1. You can also set the environment variable LT_JOBS=true if +you always want parallelisation where available in lttoolbox. +.Sh FILES +.Bl -tag -width Ds +.It Ar transducer1_binary +a finite state transducer +.It Ar transducer2_binary +a finite state transducer +.It Ar composed_binary +a finite state transducer +.El +.Sh SEE ALSO +.Xr apertium 1 , +.Xr apertium-tagger 1 , +.Xr lt-comp 1 , +.Xr lt-expand 1 , +.Xr lt-print 1 , +.Xr lt-trim 1 , +.Xr lt-proc 1 +.Sh AUTHOR +Copyright \(co 2005-2022 Universitat d'Alacant / Universidad de Alicante. +This is free software. +You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . +.Sh BUGS +Many... lurking in the dark and waiting for you! 
diff -Nru lttoolbox-3.6.6/lttoolbox/lt_compose.cc lttoolbox-3.7.1/lttoolbox/lt_compose.cc --- lttoolbox-3.6.6/lttoolbox/lt_compose.cc 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_compose.cc 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include +#include +#include +#include +#include +#include + +void +compose(FILE* file_f, FILE* file_g, FILE* file_gf, bool f_inverted, bool g_anywhere, bool jobs) +{ + Alphabet alph_f; + std::set letters_f; + std::map trans_f; + readTransducerSet(file_f, letters_f, alph_f, trans_f); + Alphabet alph_g; + std::set letters_g; + std::map trans_g; + readTransducerSet(file_g, letters_g, alph_g, trans_g); + + std::map trans_gf; + + Transducer union_g; + for (auto& it : trans_g) { + if (union_g.isEmpty()) { + union_g = it.second; + } else { + union_g.unionWith(alph_g, it.second); + } + } + union_g.minimize(); + + std::vector>> compositions; + for (auto& it : trans_f) { + if (it.second.numberOfTransitions() == 0) { + std::cerr << "Warning: section " << it.first << " is empty! Skipping it..." 
<< std::endl; + continue; + } + if(jobs) { + compositions.push_back(std::async( + [](Transducer &f, Transducer &g, Alphabet &alph_f, Alphabet &alph_g, + bool f_inverted, bool g_anywhere, UString name) { + Transducer gf = f.compose(g, alph_f, alph_g, f_inverted, g_anywhere); + if (gf.hasNoFinals()) { + std::cerr << "Warning: section " << name + << " had no final state after composing! Skipping it..." + << std::endl; + ; + } else { + gf.minimize(); + } + return std::make_pair(name, gf); + }, + std::ref(it.second), std::ref(union_g), std::ref(alph_f), + std::ref(alph_g), f_inverted, g_anywhere, it.first)); + } else { + Transducer gf = it.second.compose(union_g, alph_f, alph_g, f_inverted, g_anywhere); + if (gf.hasNoFinals()) { + std::cerr << "Warning: section " << it.first + << " had no final state after composing! Skipping it..." + << std::endl; + continue; + } + gf.minimize(); + trans_gf[it.first] = gf; + } + } + for (auto &thr : compositions) { + auto it = thr.get(); + if (!it.second.hasNoFinals()) { + trans_gf[it.first] = it.second; + } + } + + if (trans_gf.empty()) { + std::cerr << "Error: Composition gave empty transducer!" 
<< std::endl; + exit(EXIT_FAILURE); + } + + writeTransducerSet(file_gf, letters_f, alph_f, trans_gf); +} + + +int main(int argc, char *argv[]) +{ + LtLocale::tryToSetLocale(); + CLI cli("compose transducer1 with transducer2", PACKAGE_VERSION); + cli.add_bool_arg('i', "inverted", "run composition right-to-left on transducer1"); + cli.add_bool_arg('a', "anywhere", "don't require anchored matches, let transducer2 optionally compose at any sub-path"); + cli.add_file_arg("transducer1_bin_file", false); + cli.add_file_arg("transducer2_bin_file"); + cli.add_file_arg("trimmed_bin_file"); + cli.parse_args(argc, argv); + + FILE* transducer1 = openInBinFile(cli.get_files()[0]); + FILE* transducer2 = openInBinFile(cli.get_files()[1]); + FILE* composition = openOutBinFile(cli.get_files()[2]); + + bool jobs = false; + auto LT_JOBS = std::getenv("LT_JOBS"); + if(cli.get_bools()["jobs"] || (LT_JOBS != NULL && LT_JOBS[0] != 'n')) { + jobs = true; + } + compose(transducer1, transducer2, composition, + cli.get_bools()["inverted"], + cli.get_bools()["anywhere"], + jobs); + + fclose(transducer1); + fclose(transducer2); + fclose(composition); + + return 0; +} diff -Nru lttoolbox-3.6.6/lttoolbox/lt_expand.cc lttoolbox-3.7.1/lttoolbox/lt_expand.cc --- lttoolbox-3.6.6/lttoolbox/lt_expand.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_expand.cc 2022-11-01 08:36:47.000000000 +0000 @@ -18,133 +18,40 @@ #include #include #include - -#include -#include -#include -#include -#include - -#ifdef _MSC_VER -#include -#include -#endif - -using namespace std; - -void endProgram(char *name) -{ - if(name != NULL) - { - cout << basename(name) << " v" << PACKAGE_VERSION <<": expand the contents of a dictionary file" << endl; - cout << "USAGE: " << basename(name) << " [-mavlrh] dictionary_file [output_file]" << endl; -#if HAVE_GETOPT_LONG - cout << " -m, --keep-boundaries: keep morpheme boundaries" << endl; - cout << " -v, --var: set language variant" << endl; - cout << " -a, --alt: 
set alternative (monodix)" << endl; - cout << " -l, --var-left: set left language variant (bidix)" << endl; - cout << " -r, --var-right: set right language variant (bidix)" << endl; -#else - cout << " -m: keep morpheme boundaries" << endl; - cout << " -v: set language variant" << endl; - cout << " -a: set alternative (monodix)" << endl; - cout << " -l: set left language variant (bidix)" << endl; - cout << " -r: set right language variant (bidix)" << endl; -#endif - } - exit(EXIT_FAILURE); -} +#include int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); + CLI cli("expand the contents of a dictionary file", PACKAGE_VERSION); + cli.add_bool_arg('m', "keep-boundaries", "keep morpheme boundaries"); + cli.add_str_arg('v', "var", "set language variant", "VAR"); + cli.add_str_arg('a', "alt", "set alternative (monodix)", "ALT"); + cli.add_str_arg('l', "var-left", "set left language variant (bidix)", "VAR"); + cli.add_str_arg('r', "var-right", "set right language variant (bidix)", "VAR"); + cli.add_file_arg("dictionary_file", false); + cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); - FILE* input = NULL; - UFILE* output = NULL; Expander e; - e.setKeepBoundaries(false); - -#if HAVE_GETOPT_LONG - int option_index=0; -#endif - - while (true) { -#if HAVE_GETOPT_LONG - static struct option long_options[] = - { - {"keep-boundaries", no_argument, 0, 'm'}, - {"alt", required_argument, 0, 'a'}, - {"var", required_argument, 0, 'v'}, - {"var-left", required_argument, 0, 'l'}, - {"var-right", required_argument, 0, 'r'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; - - int cnt=getopt_long(argc, argv, "a:v:l:r:mh", long_options, &option_index); -#else - int cnt=getopt(argc, argv, "a:v:l:r:mh"); -#endif - if (cnt==-1) - break; - - switch (cnt) - { - case 'a': - e.setAltValue(to_ustring(optarg)); - break; - - case 'v': - e.setVariantValue(to_ustring(optarg)); - break; - - case 'l': - e.setVariantLeftValue(to_ustring(optarg)); - break; - - case 'm': - 
e.setKeepBoundaries(true); - break; - - case 'r': - e.setVariantRightValue(to_ustring(optarg)); - break; - - case 'h': - default: - endProgram(argv[0]); - break; - } + e.setKeepBoundaries(cli.get_bools()["keep-boundaries"]); + auto args = cli.get_strs(); + if (args.find("var") != args.end()) { + e.setVariantValue(to_ustring(args["var"][0].c_str())); } - - string infile; - string outfile; - - switch(argc - optind + 1) - { - case 2: - infile = argv[argc-1]; - break; - - case 3: - infile = argv[argc-2]; - outfile = argv[argc-1]; - break; - - default: - endProgram(argv[0]); - break; + if (args.find("alt") != args.end()) { + e.setAltValue(to_ustring(args["alt"][0].c_str())); + } + if (args.find("var-left") != args.end()) { + e.setVariantLeftValue(to_ustring(args["var-left"][0].c_str())); + } + if (args.find("var-right") != args.end()) { + e.setVariantRightValue(to_ustring(args["var-right"][0].c_str())); } -#ifdef _MSC_VER - _setmode(_fileno(output), _O_U8TEXT); -#endif - - input = openInBinFile(infile); - fclose(input); - output = openOutTextFile(outfile); + UFILE* output = openOutTextFile(cli.get_files()[1]); - e.expand(infile, output); + e.expand(cli.get_files()[0], output); u_fclose(output); return EXIT_SUCCESS; diff -Nru lttoolbox-3.6.6/lttoolbox/lt_invert.cc lttoolbox-3.7.1/lttoolbox/lt_invert.cc --- lttoolbox-3.6.6/lttoolbox/lt_invert.cc 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_invert.cc 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + LtLocale::tryToSetLocale(); + + CLI cli("reverse the direction of a compiled transducer", PACKAGE_VERSION); + cli.add_bool_arg('h', "help", "print this message and exit"); + cli.add_file_arg("in_bin"); + cli.add_file_arg("out_bin"); + cli.parse_args(argc, argv); + + FILE* input = openInBinFile(cli.get_files()[0]); + FILE* output = openOutBinFile(cli.get_files()[1]); + + Alphabet alphabet; + std::set alphabetic_chars; + std::map transducers; + readTransducerSet(input, alphabetic_chars, alphabet, transducers); + + for (auto& it : transducers) { + it.second.invert(alphabet); + } + + writeTransducerSet(output, alphabetic_chars, alphabet, transducers); + + fclose(input); + fclose(output); + return EXIT_SUCCESS; +} diff -Nru lttoolbox-3.6.6/lttoolbox/lt_locale.cc lttoolbox-3.7.1/lttoolbox/lt_locale.cc --- lttoolbox-3.6.6/lttoolbox/lt_locale.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_locale.cc 2022-11-01 08:36:47.000000000 +0000 @@ -16,6 +16,7 @@ */ #include #include +#include #include #include @@ -23,14 +24,12 @@ #include #endif -using namespace std; - void LtLocale::tryToSetLocale() { try { - locale::global(locale(locale::classic(), "", locale::ctype)); + std::locale::global(std::locale(std::locale::classic(), "", std::locale::ctype)); } catch (...) 
{ // Nothing @@ -38,6 +37,7 @@ UErrorCode status = U_ZERO_ERROR; uloc_setDefault("en_US_POSIX", &status); + ucnv_setDefaultName("UTF-8"); #if !defined(__CYGWIN__) && !defined (__MINGW32__) if(setlocale(LC_CTYPE, "") != NULL) @@ -45,7 +45,7 @@ return; } - cerr << "Warning: unsupported locale, fallback to \"C\"" << endl; + std::cerr << "Warning: unsupported locale, fallback to \"C\"" << std::endl; setlocale(LC_ALL, "C"); #endif diff -Nru lttoolbox-3.6.6/lttoolbox/lt_locale.h lttoolbox-3.7.1/lttoolbox/lt_locale.h --- lttoolbox-3.6.6/lttoolbox/lt_locale.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_locale.h 2022-11-01 08:36:47.000000000 +0000 @@ -20,8 +20,6 @@ #include -using namespace std; - class LtLocale { public: diff -Nru lttoolbox-3.6.6/lttoolbox/lt-paradigm.1 lttoolbox-3.7.1/lttoolbox/lt-paradigm.1 --- lttoolbox-3.6.6/lttoolbox/lt-paradigm.1 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt-paradigm.1 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,47 @@ +.Dd June 30, 2022 +.Dt LT-PARADIGM 1 +.Os Apertium +.Sh NAME +.Nm lt-paradigm +.Nd generate listings from a compiled transducer +.Sh SYNOPSIS +.Nm lt-paradigm +.Op Fl a | s | z | h +.Op Fl e Ar TAG +.Ar fst_file +.Op Ar input_file Op Ar output_file +.Sh DESCRIPTION +.Nm lt-paradigm +prints paths matching input patterns from a transducer +.Bl -tag -width Ds +.It Ar fst_file +The compiled transducer +.It Ar input_file +A list of patterns to be extracted, separated by newlines or nulls +.It Ar output_file +All paths matching the patterns in input_file. Each path is terminated by a newline and groups are separated by the separator used in the input. +.El +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl a Fl Fl analyser +Match patterns on the right side of the transducer rather than the left. +.It Fl e Ar TAG Fl Fl exclude Ar TAG +When expanding <*>, do use +.Ar TAG +.It Fl s Fl Fl sort +Sort the output for each pattern. 
+.It Fl z Fl Fl null-flush +No-op, included for compatibility. +.It Fl h Fl Fl help +Prints a short help message. +.El +.Sh SEE ALSO +.Xr lt-expand 1 , +.Xr hfst-expand 1 , +.Sh COPYRIGHT +Copyright \(co 2022 Apertium +This is free software. +You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . +.Sh BUGS +Many... lurking in the dark and waiting for you! diff -Nru lttoolbox-3.6.6/lttoolbox/lt_paradigm.cc lttoolbox-3.7.1/lttoolbox/lt_paradigm.cc --- lttoolbox-3.6.6/lttoolbox/lt_paradigm.cc 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_paradigm.cc 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include +#include +#include +#include +#include +#include + +#include + +void expand(Transducer& inter, int state, const std::set& past_states, + const std::vector& syms, const Alphabet& alpha, UFILE* out, + std::set>& outset) +{ + if (inter.isFinal(state)) { + UString l, r; + for (auto& it : syms) { + auto pr = alpha.decode(it); + alpha.getSymbol(l, pr.first); + alpha.getSymbol(r, pr.second); + } + if (!l.empty() && !r.empty()) { + if (out != nullptr) { + u_fprintf(out, "%S:%S\n", r.c_str(), l.c_str()); + } else { + outset.insert({r, l}); + } + } + } + std::set new_states = past_states; + new_states.insert(state); + for (auto& it : inter.getTransitions()[state]) { + if (past_states.find(it.second.first) != past_states.end()) { + continue; + } + std::vector new_syms = syms; + new_syms.push_back(it.first); + expand(inter, it.second.first, new_states, new_syms, alpha, out, outset); + } +} + +void process(UStringView pattern, std::map& trans, + Alphabet& alpha, + const std::set& letters, const std::set& tags, + UFILE* output, bool sort) +{ + int32_t any_char = static_cast('*'); + int32_t any_tag = alpha(u"<*>"); + std::vector pat = alpha.tokenize(pattern); + Transducer other; + int state = other.getInitial(); + for (auto& it : pat) { + if (it == any_char) { + state = other.insertNewSingleTransduction(0, state); + for (auto& sym : letters) { + other.linkStates(state, state, alpha(sym, sym)); + } + } else if (it == any_tag) { + state = other.insertNewSingleTransduction(0, state); + for (auto& sym : tags) { + other.linkStates(state, state, alpha(sym, sym)); + } + } else { + state = other.insertNewSingleTransduction(alpha(it, it), state); + } + } + other.setFinal(state); + std::set> outset; + for (auto& it : trans) { + Transducer inter = it.second.trim(other, alpha, alpha); + if (!inter.getFinals().empty()) { + std::set states; + std::vector syms; + expand(inter, inter.getInitial(), states, syms, alpha, + (sort ? 
nullptr : output), outset); + } + } + if (sort) { + for (auto& it : outset) { + u_fprintf(output, "%S:%S\n", it.first.c_str(), it.second.c_str()); + } + } +} + +int main(int argc, char* argv[]) +{ + LtLocale::tryToSetLocale(); + CLI cli("generate listings from a compiled transducer", PACKAGE_VERSION); + cli.add_bool_arg('a', "analyser", "FST is an analyser (tags on the right)"); + cli.add_str_arg('e', "exclude", "disregard paths containing TAG", "TAG"); + cli.add_bool_arg('s', "sort", "alphabetize the paths for each pattern"); + cli.add_bool_arg('z', "null-flush", "flush output on \\0"); + cli.add_bool_arg('h', "help", "show this help and exit"); + cli.add_file_arg("FST", false); + cli.add_file_arg("input"); + cli.add_file_arg("output"); + cli.parse_args(argc, argv); + + bool should_invert = !cli.get_bools()["analyser"]; + bool sort = cli.get_bools()["sort"]; + std::set skip_tags; + for (auto& it : cli.get_strs()["exclude"]) { + skip_tags.insert(to_ustring(it.c_str())); + } + + FILE* fst = openInBinFile(cli.get_files()[0]); + + std::set letters; + Alphabet alpha; + std::map trans; + readTransducerSet(fst, letters, alpha, trans); + fclose(fst); + + alpha.includeSymbol(u"<*>"); + std::set tags; + for (int32_t i = 1; i <= alpha.size(); i++) { + if (!skip_tags.empty()) { + UString t; + alpha.getSymbol(t, -i); + if (skip_tags.find(t) != skip_tags.end()) continue; + } + tags.insert(-i); + } + + if (should_invert) { + for (auto& it : trans) { + it.second.invert(alpha); + } + } + + InputFile input; + if (!cli.get_files()[1].empty()) { + input.open_or_exit(cli.get_files()[1].c_str()); + } + UFILE* output = openOutTextFile(cli.get_files()[2]); + + UString cur; + do { + UChar32 c = input.get(); + if (c == '\n' || c == '\0' || c == U_EOF) { + process(cur, trans, alpha, letters, tags, output, sort); + if (c != U_EOF) { + u_fputc(c, output); + u_fflush(output); + } + cur.clear(); + } else { + cur += c; + } + } while (!input.eof()); + + u_fclose(output); + return 0; +} diff -Nru 
lttoolbox-3.6.6/lttoolbox/lt_print.cc lttoolbox-3.7.1/lttoolbox/lt_print.cc --- lttoolbox-3.6.6/lttoolbox/lt_print.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_print.cc 2022-11-01 08:36:47.000000000 +0000 @@ -15,113 +15,30 @@ * along with this program; if not, see . */ #include -#include #include - -#include +#include #include -#include -#include -#include -#include -#include -#include - -#ifdef _MSC_VER -#include -#include -#endif - -using namespace std; - -void endProgram(char *name) -{ - if(name != NULL) - { - cout << basename(name) << " v" << PACKAGE_VERSION <<": dump a transducer to text in ATT format" << endl; - cout << "USAGE: " << basename(name) << " [-aHh] bin_file [output_file] " << endl; - cout << " -a, --alpha: print transducer alphabet" << endl; - cout << " -H, --hfst: use HFST-compatible character escapes" << endl; - cout << " -h, --help: print this message and exit" << endl; - } - exit(EXIT_FAILURE); -} - - int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); + CLI cli("dump a transducer to text in ATT format", PACKAGE_VERSION); + cli.add_bool_arg('a', "alpha", "print transducer alphabet"); + cli.add_bool_arg('H', "hfst", "use HFST-compatible character escapes"); + cli.add_bool_arg('h', "help", "print this message and exit"); + cli.add_file_arg("bin_file"); + cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); - bool alpha = false; - bool hfst = false; - -#ifdef _MSC_VER - _setmode(_fileno(output), _O_U8TEXT); -#endif - -#if HAVE_GETOPT_LONG - int option_index=0; -#endif - - while (true) { -#if HAVE_GETOPT_LONG - static struct option long_options[] = - { - {"alpha", no_argument, 0, 'a'}, - {"hfst", no_argument, 0, 'H'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; - - int cnt=getopt_long(argc, argv, "aHh", long_options, &option_index); -#else - int cnt=getopt(argc, argv, "aHh"); -#endif - if (cnt==-1) - break; - - switch (cnt) - { - case 'a': - alpha = true; - break; - - case 'H': - hfst = true; 
- break; - - case 'h': - default: - endProgram(argv[0]); - break; - } - } - - string infile; - string outfile; - switch(argc - optind) - { - case 1: - infile = argv[argc-1]; - break; - - case 2: - infile = argv[argc-2]; - outfile = argv[argc-1]; - break; - - default: - endProgram(argv[0]); - break; - } + bool alpha = cli.get_bools()["alpha"]; + bool hfst = cli.get_bools()["hfst"]; - FILE* input = openInBinFile(infile); - UFILE* output = openOutTextFile(outfile); + FILE* input = openInBinFile(cli.get_files()[0]); + UFILE* output = openOutTextFile(cli.get_files()[1]); Alphabet alphabet; - set alphabetic_chars; - map transducers; + std::set alphabetic_chars; + std::map transducers; readTransducerSet(input, alphabetic_chars, alphabet, transducers); diff -Nru lttoolbox-3.6.6/lttoolbox/lt_proc.cc lttoolbox-3.7.1/lttoolbox/lt_proc.cc --- lttoolbox-3.6.6/lttoolbox/lt_proc.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_proc.cc 2022-11-01 08:36:47.000000000 +0000 @@ -16,82 +16,9 @@ */ #include #include -#include +#include #include -#include -#include -#include -#include - -#ifdef _MSC_VER -#include -#include -#endif - - -using namespace std; - -void endProgram(char *name) -{ - cout << basename(name) << ": process a stream with a letter transducer" << endl; - cout << "USAGE: " << basename(name) << " [ -a | -b | -c | -d | -e | -g | -n | -p | -x | -s | -t | -v | -h | -z | -w ] [-W] [-N N] [-L N] [ -i icx_file ] [ -r rcx_file ] fst_file [input_file [output_file]]" << endl; - cout << "Options:" << endl; -#if HAVE_GETOPT_LONG - cout << " -a, --analysis: morphological analysis (default behavior)" << endl; - cout << " -b, --bilingual: lexical transfer" << endl; - cout << " -c, --case-sensitive: use the literal case of the incoming characters" << endl; - cout << " -d, --debugged-gen morph. 
generation with all the stuff" << endl; - cout << " -e, --decompose-nouns: Try to decompound unknown words" << endl; - cout << " -g, --generation: morphological generation" << endl; - cout << " -i, --ignored-chars: specify file with characters to ignore" << endl; - cout << " -r, --restore-chars: specify file with characters to diacritic restoration" << endl; - cout << " -l, --tagged-gen: morphological generation keeping lexical forms" << endl; - cout << " -m, --tagged-nm-gen: same as -l but without unknown word marks" << endl; - cout << " -n, --non-marked-gen morph. generation without unknown word marks" << endl; - cout << " -o, --surf-bilingual: lexical transfer with surface forms" << endl; - cout << " -p, --post-generation: post-generation" << endl; - cout << " -x, --inter-generation: inter-generation" << endl; - cout << " -s, --sao: SAO annotation system input processing" << endl; - cout << " -t, --transliteration: apply transliteration dictionary" << endl; - cout << " -v, --version: version" << endl; - cout << " -z, --null-flush: flush output on the null character " << endl; - cout << " -w, --dictionary-case: use dictionary case instead of surface case" << endl; - cout << " -C, --careful-case: use dictionary case if present, else surface" << endl; - cout << " -I, --no-default-ignore: skips loading the default ignore characters" << endl; - cout << " -W, --show-weights: Print final analysis weights (if any)" << endl; - cout << " -N, --analyses: Output no more than N analyses (if the transducer is weighted, the N best analyses)" << endl; - cout << " -L, --weight-classes: Output no more than N best weight classes (where analyses with equal weight constitute a class)" << endl; - cout << " -h, --help: show this help" << endl; -#else - cout << " -a: morphological analysis (default behavior)" << endl; - cout << " -b: lexical transfer" << endl; - cout << " -c: use the literal case of the incoming characters" << endl; - cout << " -d: morph. 
generation with all the stuff" << endl; - cout << " -e: try to decompose unknown words as compounds" << endl; - cout << " -g: morphological generation" << endl; - cout << " -i: specify file with characters to ignore" << endl; - cout << " -r: specify file with characters to diacritic restoration" << endl; - cout << " -l: morphological generation keeping lexical forms" << endl; - cout << " -n: morph. generation without unknown word marks" << endl; - cout << " -o: lexical transfer with surface forms" << endl; - cout << " -p: post-generation" << endl; - cout << " -x: inter-generation" << endl; - cout << " -s: SAO annotation system input processing" << endl; - cout << " -t: apply transliteration dictionary" << endl; - cout << " -v: version" << endl; - cout << " -z: flush output on the null character " << endl; - cout << " -C: use dictionary case if present, else surface" << endl; - cout << " -W: Print final analysis weights (if any)" << endl; - cout << " -N: Output no more than N analyses" << endl; - cout << " -L: Output no more than N best weight classes" << endl; - cout << " -I: skips loading the default ignore characters" << endl; - cout << " -w: use dictionary case instead of surface case" << endl; - cout << " -h: show this help" << endl; -#endif - exit(EXIT_FAILURE); -} - void checkValidity(FSTProcessor const &fstp) { if(!fstp.valid()) @@ -104,205 +31,142 @@ { LtLocale::tryToSetLocale(); - int cmd = 0; - int maxAnalyses; - int maxWeightClasses; - FSTProcessor fstp; - -#if HAVE_GETOPT_LONG - static struct option long_options[]= - { - {"analysis", 0, 0, 'a'}, - {"bilingual", 0, 0, 'b'}, - {"surf-bilingual", 0, 0, 'o'}, - {"generation", 0, 0, 'g'}, - {"ignored-chars", 1, 0, 'i'}, - {"restore-chars", 1, 0, 'r'}, - {"non-marked-gen", 0, 0, 'n'}, - {"debugged-gen", 0, 0, 'd'}, - {"tagged-gen", 0, 0, 'l'}, - {"tagged-nm-gen", 0, 0, 'm'}, - {"post-generation", 0, 0, 'p'}, - {"inter-generation", 0, 0, 'x'}, - {"sao", 0, 0, 's'}, - {"transliteration", 0, 0, 't'}, - 
{"null-flush", 0, 0, 'z'}, - {"dictionary-case", 0, 0, 'w'}, - {"version", 0, 0, 'v'}, - {"case-sensitive", 0, 0, 'c'}, - {"careful-case", 0, 0, 'C'}, - {"no-default-ignore", 0, 0, 'I'}, - {"show-weights", 0, 0, 'W'}, - {"analyses", 1, 0, 'N'}, - {"weight-classes", 1, 0, 'L'}, - {"help", 0, 0, 'h'} - }; -#endif + CLI cli("process a stream with a letter transducer", PACKAGE_VERSION); + cli.add_file_arg("fst_file", false); + cli.add_file_arg("input_file"); + cli.add_file_arg("output_file"); + cli.add_bool_arg('a', "analysis", "morphological analysis (default behavior)"); + cli.add_bool_arg('b', "bilingual", "lexical transfer"); + cli.add_bool_arg('c', "case-sensitive", "use the literal case of the incoming characters"); + cli.add_bool_arg('d', "debugged-gen", "morph. generation with all the stuff"); + cli.add_bool_arg('e', "decompose-nouns", "Try to decompound unknown words"); + cli.add_bool_arg('g', "generation", "morphological generation"); + cli.add_str_arg('i', "ignored-chars", "specify file with characters to ignore", "icx_file"); + cli.add_str_arg('r', "restore-chars", "specify file with characters to diacritic restoration", "rcx_file"); + cli.add_bool_arg('l', "tagged-gen", "morphological generation keeping lexical forms"); + cli.add_bool_arg('m', "tagged-nm-gen", "same as -l but without unknown word marks"); + cli.add_bool_arg('n', "non-marked-gen", "morph. 
generation without unknown word marks"); + cli.add_bool_arg('o', "surf-bilingual", "lexical transfer with surface forms"); + cli.add_bool_arg('p', "post-generation", "post-generation"); + cli.add_bool_arg('x', "inter-generation", "inter-generation"); + cli.add_bool_arg('s', "sao", "SAO annotation system input processing"); + cli.add_bool_arg('t', "transliteration", "apply transliteration dictionary"); + cli.add_bool_arg('v', "version", "version"); + cli.add_bool_arg('z', "null-flush", "flush output on the null character"); + cli.add_bool_arg('w', "dictionary-case", "use dictionary case instead of surface"); + cli.add_bool_arg('C', "careful-case", "use dictionary case if present, else surface"); + cli.add_bool_arg('I', "no-default-ignore", "skips loading the default ignore characters"); + cli.add_bool_arg('W', "show-weights", "Print final analysis weights (if any)"); + cli.add_str_arg('N', "analyses", "Output no more than N analyses (if the transducer is weighted, the N best analyses)", "N"); + cli.add_str_arg('L', "weight-classes", "Output no more than N best weight classes (where analyses with equal weight constitute a class)", "N"); + cli.add_bool_arg('h', "help", "show this help"); + cli.parse_args(argc, argv); + FSTProcessor fstp; GenerationMode bilmode = gm_unknown; - // more than one option sets generation mode, but -gb also sets gm_unknown - bool really_g = false; - while(true) - { -#if HAVE_GETOPT_LONG - int option_index; - int c = getopt_long(argc, argv, "abcegi:r:lmndopxstzwvCIWN:L:h", long_options, &option_index); -#else - int c = getopt(argc, argv, "abcegi:r:lmndopxstzwvCIWN:L:h"); -#endif - - if(c == -1) - { - break; - } + char cmd = 0; - switch(c) - { - case 'c': - fstp.setCaseSensitiveMode(true); - break; - - case 'i': - fstp.setIgnoredChars(true); - fstp.parseICX(optarg); - break; - - case 'r': - fstp.setRestoreChars(true); - fstp.parseRCX(optarg); - fstp.setUseDefaultIgnoredChars(false); - break; - - case 'I': - 
fstp.setUseDefaultIgnoredChars(false); - break; - - case 'W': - fstp.setDisplayWeightsMode(true); - break; - - case 'N': - maxAnalyses = atoi(optarg); - if (maxAnalyses < 1) - { - cerr << "Invalid or no argument for analyses count" << endl; - exit(EXIT_FAILURE); - } - fstp.setMaxAnalysesValue(maxAnalyses); - break; - - case 'L': - maxWeightClasses = atoi(optarg); - if (maxWeightClasses < 1) - { - cerr << "Invalid or no argument for weight class count" << endl; - exit(EXIT_FAILURE); - } - fstp.setMaxWeightClassesValue(maxWeightClasses); - break; - - case 'e': - case 'a': - case 'b': - case 'o': - case 'g': - case 'p': - case 'x': - case 't': - case 's': - if(cmd == 0) - { - cmd = c; - if (cmd == 'g') really_g = true; - } - else if(cmd == 'g' && c == 'b') { - // "lt-proc -g -b generador.bin" should run biltrans, keeping unknown-marks - if (really_g) bilmode = gm_unknown; - cmd = 'b'; - } - else - { - endProgram(argv[0]); - } - break; - - case 'd': - if (cmd == 0) cmd = 'g'; - bilmode = gm_all; - break; - - case 'l': - if (cmd == 0) cmd = 'g'; - bilmode = gm_tagged; - break; - - case 'm': - if (cmd == 0) cmd = 'g'; - bilmode = gm_tagged_nm; - break; - - case 'n': - if (cmd == 0) cmd = 'g'; - bilmode = gm_clean; - break; - - case 'C': - if (cmd == 0) cmd = 'g'; - bilmode = gm_carefulcase; - break; - - case 'z': - fstp.setNullFlush(true); - break; - - case 'w': - fstp.setDictionaryCaseMode(true); - break; - - case 'v': - cout << basename(argv[0]) << " version " << PACKAGE_VERSION << endl; - exit(EXIT_SUCCESS); - break; - case 'h': - default: - endProgram(argv[0]); - break; + auto args = cli.get_bools(); + if (args["analysis"]) { + cmd = 'a'; + } + if (args["bilingual"]) { + if (cmd) cli.print_usage(); + cmd = 'b'; + } + if (args["surf-bilingual"]) { + if (cmd && cmd != 'b') cli.print_usage(); + if (!cmd) cmd = 'b'; + fstp.setBiltransSurfaceForms(true); + } + if (args["generation"]) { + if (cmd && cmd != 'b') cli.print_usage(); + if (!cmd) cmd = 'g'; + } + if 
(args["decompose-nouns"]) { + if (cmd) cli.print_usage(); + cmd = 'e'; + } + if (args["post-generation"]) { + if (cmd) cli.print_usage(); + cmd = 'p'; + } + if (args["inter-generation"] || args["transliteration"]) { + if (cmd) cli.print_usage(); + cmd = 't'; + } + if (args["sao"]) { + if (cmd) cli.print_usage(); + cmd = 's'; + } + + if (args["debugged-gen"]) { + if (!cmd) cmd = 'g'; + bilmode = gm_all; + } + if (args["tagged-gen"]) { + if (!cmd) cmd = 'g'; + bilmode = gm_tagged; + } + if (args["tagged-nm-gen"]) { + if (!cmd) cmd = 'g'; + bilmode = gm_tagged_nm; + } + if (args["non-marked-gen"]) { + if (!cmd) cmd = 'g'; + bilmode = gm_clean; + } + if (args["careful-case"]) { + if (!cmd) cmd = 'g'; + bilmode = gm_carefulcase; + } + + fstp.setCaseSensitiveMode(cli.get_bools()["case-sensitive"]); + fstp.setUseDefaultIgnoredChars(!cli.get_bools()["no-default-ignore"]); + fstp.setDisplayWeightsMode(cli.get_bools()["show-weights"]); + fstp.setNullFlush(cli.get_bools()["null-flush"]); + fstp.setDictionaryCaseMode(cli.get_bools()["dictionary-case"]); + + auto strs = cli.get_strs(); + if (strs.find("ignored-chars") != strs.end()) { + fstp.setIgnoredChars(true); + for (auto& it : strs["ignored-chars"]) { + fstp.parseICX(it); } } - - InputFile input; - UFILE* output = u_finit(stdout, NULL, NULL); - - if(optind == (argc - 3)) - { - FILE* in = openInBinFile(argv[optind]); - input.open_or_exit(argv[optind+1]); - output = openOutTextFile(argv[optind+2]); - fstp.load(in); - fclose(in); - } - else if(optind == (argc -2)) - { - FILE* in = openInBinFile(argv[optind]); - input.open_or_exit(argv[optind+1]); - fstp.load(in); - fclose(in); + if (strs.find("restore-chars") != strs.end()) { + fstp.setRestoreChars(true); + fstp.setUseDefaultIgnoredChars(false); + for (auto& it : strs["restore-chars"]) { + fstp.parseRCX(it); + } } - else if(optind == (argc - 1)) - { - FILE* in = openInBinFile(argv[optind]); - fstp.load(in); - fclose(in); + if (strs.find("analyses") != strs.end()) { + int n = 
atoi(strs["analyses"].back().c_str()); + if (n < 1) { + std::cerr << "Invalid or no argument for analyses count" << std::endl; + exit(EXIT_FAILURE); + } + fstp.setMaxAnalysesValue(n); } - else - { - endProgram(argv[0]); + if (strs.find("weight-classes") != strs.end()) { + int n = atoi(strs["weight-classes"].back().c_str()); + if (n < 1) { + std::cerr << "Invalid or no argument for weight class count" << std::endl; + exit(EXIT_FAILURE); + } + fstp.setMaxWeightClassesValue(n); } -#ifdef _MSC_VER - _setmode(_fileno(input), _O_U8TEXT); - _setmode(_fileno(output), _O_U8TEXT); -#endif + FILE* in = openInBinFile(cli.get_files()[0]); + fstp.load(in); + fclose(in); + + InputFile input; + if (!cli.get_files()[1].empty()) { + input.open_or_exit(cli.get_files()[1].c_str()); + } + UFILE* output = openOutTextFile(cli.get_files()[2]); try { @@ -320,12 +184,6 @@ fstp.postgeneration(input, output); break; - case 'x': - fstp.initPostgeneration(); - checkValidity(fstp); - fstp.intergeneration(input, output); - break; - case 's': fstp.initAnalysis(); checkValidity(fstp); @@ -338,13 +196,6 @@ fstp.transliteration(input, output); break; - case 'o': - fstp.initBiltrans(); - checkValidity(fstp); - fstp.setBiltransSurfaceForms(true); - fstp.bilingual(input, output, bilmode); - break; - case 'b': fstp.initBiltrans(); checkValidity(fstp); @@ -365,9 +216,9 @@ break; } } - catch (exception& e) + catch (std::exception& e) { - cerr << e.what(); + std::cerr << e.what(); if (fstp.getNullFlush()) { u_fputc('\0', output); } diff -Nru lttoolbox-3.6.6/lttoolbox/lt_restrict.cc lttoolbox-3.7.1/lttoolbox/lt_restrict.cc --- lttoolbox-3.6.6/lttoolbox/lt_restrict.cc 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_restrict.cc 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software 
Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include + +void get_symbol(const std::string& s, Alphabet& alpha, const char* prefix, + sorted_vector& vec) +{ + UString t; + t += '<'; + t += to_ustring(prefix); + t += ':'; + t += to_ustring(s.c_str()); + t += '>'; + if (alpha.isSymbolDefined(t)) { + vec.insert(alpha(alpha(t), alpha(t))); + } +} + +int main(int argc, char* argv[]) +{ + LtLocale::tryToSetLocale(); + CLI cli("remove paths from a transducer", PACKAGE_VERSION); + cli.add_bool_arg('m', "minimise", "minimise transducers after deleting paths"); + cli.add_str_arg('v', "var", "set language variant", "VAR"); + cli.add_str_arg('a', "alt", "set alternative (monodix)", "ALT"); + cli.add_str_arg('l', "var-left", "set left language variant (bidix)", "VAR"); + cli.add_str_arg('r', "var-right", "set right language variant (bidix)", "VAR"); + cli.add_file_arg("lr | rl", false); + cli.add_file_arg("input_file"); + cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); + + std::string dir = cli.get_files()[0]; + if (dir == "lr") dir = "LR"; + else if (dir == "rl") dir = "RL"; + FILE* input = openInBinFile(cli.get_files()[1]); + FILE* output = openOutBinFile(cli.get_files()[2]); + + Alphabet alpha; + std::set letters; + std::map trans; + readTransducerSet(input, letters, alpha, trans); + + sorted_vector keep; + sorted_vector drop; + bool has_var = false; + get_symbol(dir, alpha, "r", keep); + for (auto& it : cli.get_strs()["var"]) { + get_symbol(it, alpha, "v", keep); + has_var = true; + } + for (auto& it : 
cli.get_strs()["alt"]) { + get_symbol(it, alpha, "alt", keep); + } + for (auto& it : cli.get_strs()["var-left"]) { + get_symbol(it, alpha, "vl", keep); + } + for (auto& it : cli.get_strs()["var-right"]) { + get_symbol(it, alpha, "vr", keep); + } + + for (int32_t i = 1; i <= alpha.size(); i++) { + UString t; + alpha.getSymbol(t, -i); + if (StringUtils::startswith(t, u" #include -using namespace std; void endProgram(char *name) { if(name != NULL) { - cout << basename(name) << " v" << PACKAGE_VERSION <<": build a letter transducer from a TMX translation memory" << endl; - cout << "USAGE: " << basename(name) << " [OPTIONS] lang1-lang2 tmx_file output_file" << endl; - cout << "Modes:" << endl; - cout << " lang1: input language" << endl; - cout << " lang2: output language" << endl; - cout << "Options:" <. */ #include -#include #include - -#include -#include -#include -#include - -using namespace std; - -[[noreturn]] -void endProgram(char *name) -{ - cout << basename(name) << ": process a stream with a letter transducer" << endl; - cout << "USAGE: " << basename(name) << " fst_file [input_file [output_file]]" << endl; - exit(EXIT_FAILURE); -} - -void checkValidity(FSTProcessor const &fstp) -{ - if(!fstp.valid()) - { - exit(EXIT_FAILURE); - } -} +#include +#include int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); + CLI cli("process a stream with a letter transducer"); + cli.add_file_arg("fst_file", false); + cli.add_file_arg("input_file"); + cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); - InputFile input; - UFILE* output = u_finit(stdout, NULL, NULL); FSTProcessor fstp; - FILE *aux; + FILE* aux = openInBinFile(cli.get_files()[0]); + fstp.load(aux); + fclose(aux); + fstp.initTMAnalysis(); + if (!fstp.valid()) { + return EXIT_FAILURE; + } - switch(argc) - { - case 4: - output = u_fopen(argv[3], "wb", NULL, NULL); - if(!output) - { - endProgram(argv[0]); - } - // follow - case 3: - if (!input.open(argv[2])) { - endProgram(argv[0]); - } - // 
follow - case 2: - aux = fopen(argv[1], "rb"); - if(!aux) - { - endProgram(argv[0]); - } - fstp.load(aux); - fclose(aux); - break; - default: - endProgram(argv[0]); - break; + InputFile input; + if (!cli.get_files()[1].empty()) { + input.open_or_exit(cli.get_files()[1].c_str()); } + UFILE* output = openOutTextFile(cli.get_files()[2].c_str()); - fstp.initTMAnalysis(); - checkValidity(fstp); fstp.tm_analysis(input, output); u_fclose(output); diff -Nru lttoolbox-3.6.6/lttoolbox/lt-trim.1 lttoolbox-3.7.1/lttoolbox/lt-trim.1 --- lttoolbox-3.6.6/lttoolbox/lt-trim.1 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt-trim.1 2022-11-01 08:36:47.000000000 +0000 @@ -85,6 +85,24 @@ .Em very simple translator pipeline, since the output of bidix seldom goes unchanged through transfer. +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl s , Fl Fl match-section +A section with this name (id@type) in the analyser will only be +trimmed against a section with the same id in the bidix. (The default +is to trim all sections of the analyser against all sections of the +bidix.) Using this option can some times speed up trimming +considerably. For example, if you have some complicated regular +expressions, try putting them in a + +
+ +in both .dix files and passing +.Dq regex@standard +to \fI--match-section\fP. +.Pp +This argument may be used multiple times to specify multiple sections +that must match by name. .Sh FILES .Bl -tag -width Ds .It Ar analyser_binary diff -Nru lttoolbox-3.6.6/lttoolbox/lt_trim.cc lttoolbox-3.7.1/lttoolbox/lt_trim.cc --- lttoolbox-3.6.6/lttoolbox/lt_trim.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_trim.cc 2022-11-01 08:36:47.000000000 +0000 @@ -16,25 +16,12 @@ */ #include #include - +#include #include - -#include #include -#include - -void endProgram(char *name) -{ - if(name != NULL) - { - cout << basename(name) << " v" << PACKAGE_VERSION <<": trim a transducer to another transducer" << endl; - cout << "USAGE: " << basename(name) << " analyser_bin_file bidix_bin_file trimmed_bin_file " << endl; - } - exit(EXIT_FAILURE); -} void -trim(FILE* file_mono, FILE* file_bi, FILE* file_out) +trim(FILE* file_mono, FILE* file_bi, FILE* file_out, std::set match_sections) { Alphabet alph_mono; std::set letters_mono; @@ -51,66 +38,90 @@ // The "." in ".*" is a set of equal pairs of the output symbols // from the monodix alphabet (: etc.) 
Alphabet alph_prefix = alph_bi; - set loopback_symbols; // ints refer to alph_prefix + std::set loopback_symbols; // ints refer to alph_prefix alph_prefix.createLoopbackSymbols(loopback_symbols, alph_mono, Alphabet::right); + UString union_name = u""; // Not a valid section name, used as key for those where we don't care about names matching + std::map moved_bi_transducers; for (auto& it : trans_bi) { - if (union_transducer.isEmpty()) { - union_transducer = it.second; - } else { - union_transducer.unionWith(alph_bi, it.second); + if(match_sections.count(it.first)) { + moved_bi_transducers[it.first] = it.second.appendDotStar(loopback_symbols).moveLemqsLast(alph_prefix); + } + else { + if (union_transducer.isEmpty()) { + union_transducer = it.second; + } + else { + union_transducer.unionWith(alph_bi, it.second); + } } } union_transducer.minimize(); - Transducer prefix_transducer = union_transducer.appendDotStar(loopback_symbols); - // prefix_transducer should _not_ be minimized (both useless and takes forever) - Transducer moved_transducer = prefix_transducer.moveLemqsLast(alph_prefix); + // prefix/moved transducer should _not_ be minimized (both useless and takes forever) + moved_bi_transducers[union_name] = union_transducer.appendDotStar(loopback_symbols).moveLemqsLast(alph_prefix); std::map trans_trim; + std::set sections_unmatched = match_sections; // just used to warn if user asked for a match that never happened for (auto& it : trans_mono) { if (it.second.numberOfTransitions() == 0) { - cerr << "Warning: section " << it.first << " is empty! Skipping it..." << endl; + std::cerr << "Warning: section " << it.first << " is empty! Skipping it..." << std::endl; continue; } - Transducer trimmed = it.second.intersect(moved_transducer, - alph_mono, - alph_prefix); + if (moved_bi_transducers.count(it.first)) { + sections_unmatched.erase(it.first); + } + Transducer& moved_transducer = moved_bi_transducers.count(it.first) + ? 
moved_bi_transducers[it.first] + : moved_bi_transducers[union_name]; + Transducer trimmed = it.second.trim(moved_transducer, + alph_mono, + alph_prefix); if (trimmed.hasNoFinals()) { - cerr << "Warning: section " << it.first << " had no final state after trimming! Skipping it..." << endl; + std::cerr << "Warning: section " << it.first << " had no final state after trimming! Skipping it..." << std::endl; continue; } trimmed.minimize(); trans_trim[it.first] = trimmed; } + for (const auto &name : sections_unmatched) { + std::cerr << "Warning: section " << name << " was not found in both transducers! Skipping if in just one..." << std::endl; + } if (trans_trim.empty()) { - cerr << "Error: Trimming gave empty transducer!" << endl; - cerr << "Hint: There are no words in bilingual dictionary that match " - "words in both monolingual dictionaries?" << endl; + std::cerr << "Error: Trimming gave empty transducer!" << std::endl; + std::cerr << "Hint: There are no words in bilingual dictionary that match " + "words in both monolingual dictionaries?" << std::endl; exit(EXIT_FAILURE); } - writeTransducerSet(file_out, UString(letters_mono.begin(), letters_mono.end()), - alph_mono, trans_trim); + writeTransducerSet(file_out, letters_mono, alph_mono, trans_trim); } int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); - - if(argc != 4) - { - endProgram(argv[0]); + CLI cli("trim a transducer to another transducer", PACKAGE_VERSION); + cli.add_file_arg("analyser_bin_file", false); + cli.add_file_arg("bidix_bin_file"); + cli.add_file_arg("trimmed_bin_file"); + cli.add_str_arg('s', "match-section", "A section with this name (id@type) will only be trimmed against a section with the same name. 
This argument may be used multiple times.", "section_name"); + cli.parse_args(argc, argv); + + auto strs = cli.get_strs(); + std::set match_sections; + if (strs.find("match-section") != strs.end()) { + for (auto &it : strs["match-section"]) { + match_sections.insert(to_ustring(it.c_str())); + } } + FILE* analyser = openInBinFile(cli.get_files()[0]); + FILE* bidix = openInBinFile(cli.get_files()[1]); + FILE* output = openOutBinFile(cli.get_files()[2]); - FILE* analyser = openInBinFile(argv[1]); - FILE* bidix = openInBinFile(argv[2]); - FILE* output = openOutBinFile(argv[3]); - - trim(analyser, bidix, output); + trim(analyser, bidix, output, match_sections); fclose(analyser); fclose(bidix); diff -Nru lttoolbox-3.6.6/lttoolbox/Makefile.am lttoolbox-3.7.1/lttoolbox/Makefile.am --- lttoolbox-3.6.6/lttoolbox/Makefile.am 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/Makefile.am 2022-11-01 08:36:47.000000000 +0000 @@ -1,26 +1,26 @@ -h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ +h_sources = acx.h alphabet.h att_compiler.h buffer.h cli.h compiler.h compression.h \ deserialiser.h entry_token.h expander.h file_utils.h fst_processor.h input_file.h lt_locale.h \ match_exe.h match_node.h match_state.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h \ transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ - ustring.h -cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ + ustring.h sorted_vector.hpp +cc_sources = acx.cc alphabet.cc att_compiler.cc cli.cc compiler.cc compression.cc entry_token.cc \ expander.cc file_utils.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ match_node.cc match_state.cc node.cc pattern_list.cc \ regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc transducer.cc \ trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc -library_includedir = 
$(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME) +library_includedir = $(includedir)/$(PACKAGE_NAME) library_include_HEADERS = $(h_sources) -bin_PROGRAMS = lt-comp lt-proc lt-expand lt-tmxcomp lt-tmxproc lt-print lt-trim lt-append +bin_PROGRAMS = lt-comp lt-proc lt-expand lt-paradigm lt-tmxcomp lt-tmxproc lt-print lt-trim lt-compose lt-append lsx-comp lt-invert lt-restrict lt-apply-acx instdir = lttoolbox -lib_LTLIBRARIES= liblttoolbox3.la -liblttoolbox3_la_SOURCES= $(h_sources) $(cc_sources) -liblttoolbox3_la_LDFLAGS= -version-info $(SOVERSION) -release $(VERSION_API) -liblttoolbox3_la_LIBADD= $(ICU_LIBS) +lib_LTLIBRARIES= liblttoolbox.la +liblttoolbox_la_SOURCES= $(h_sources) $(cc_sources) +liblttoolbox_la_LDFLAGS= -version-info $(VERSION_ABI) +liblttoolbox_la_LIBADD= $(ICU_LIBS) lttoolboxdir = $(prefix)/share/lttoolbox lttoolboxinclude = $(prefix)/include @@ -28,17 +28,23 @@ lttoolbox_DATA = dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd -LDADD = liblttoolbox$(VERSION_MAJOR).la -AM_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LIBXML_LIBS) $(ICU_LIBS) +LDADD = liblttoolbox.la $(PTHREAD_LIBS) +AM_LDFLAGS = -llttoolbox $(LIBXML_LIBS) $(ICU_LIBS) lt_append_SOURCES = lt_append.cc lt_print_SOURCES = lt_print.cc lt_trim_SOURCES = lt_trim.cc +lt_compose_SOURCES = lt_compose.cc lt_comp_SOURCES = lt_comp.cc lt_proc_SOURCES = lt_proc.cc lt_expand_SOURCES = lt_expand.cc +lt_paradigm_SOURCES = lt_paradigm.cc lt_tmxcomp_SOURCES = lt_tmxcomp.cc lt_tmxproc_SOURCES = lt_tmxproc.cc +lsx_comp_SOURCES = lt_comp.cc +lt_invert_SOURCES = lt_invert.cc +lt_restrict_SOURCES = lt_restrict.cc +lt_apply_acx_SOURCES = lt_apply_acx.cc #lt-validate-dictionary: Makefile.am validate-header.sh # @echo "Creating lt-validate-dictionary script" @@ -50,9 +56,9 @@ -man_MANS = lt-append.1 lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 +man_MANS = lt-append.1 lt-comp.1 lt-expand.1 lt-paradigm.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 
lt-compose.1 lsx-comp.1 -INCLUDES = -I$(top_srcdir) $(LIBXML_CFLAGS) $(ICU_CFLAGS) +INCLUDES = -I$(top_srcdir) $(LIBXML_CFLAGS) $(ICU_CFLAGS) $(PTHREAD_CFLAGS) CLEANFILES = *~ EXTRA_DIST = dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd $(man_MANS) diff -Nru lttoolbox-3.6.6/lttoolbox/match_exe.cc lttoolbox-3.7.1/lttoolbox/match_exe.cc --- lttoolbox-3.6.6/lttoolbox/match_exe.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/match_exe.cc 2022-11-01 08:36:47.000000000 +0000 @@ -34,12 +34,12 @@ copy(te); } -MatchExe::MatchExe(Transducer const &t, map const &final_type) +MatchExe::MatchExe(Transducer const &t, std::map const &final_type) { // memory allocation node_list.reserve(t.transitions.size()); - for(map > >::const_iterator it = t.transitions.begin(), + for(auto it = t.transitions.begin(), limit = t.transitions.end(); it != limit; it++) { MatchNode mynode(it->second.size()); @@ -47,7 +47,7 @@ } // set up finals - for(map::const_iterator it = final_type.begin(), limit = final_type.end(); + for(auto it = final_type.begin(), limit = final_type.end(); it != limit; it++) { finals[&node_list[it->first]] = it->second; @@ -57,12 +57,12 @@ initial_id = t.initial; // set up the transitions - for(map > >::const_iterator it = t.transitions.begin(), + for(auto it = t.transitions.begin(), limit = t.transitions.end(); it != limit; it++) { MatchNode &mynode = node_list[it->first]; int i = 0; - for(multimap >::const_iterator it2 = it->second.begin(), + for(auto it2 = it->second.begin(), limit2 = it->second.end(); it2 != limit2; it2++) { mynode.addTransition(it2->first, &node_list[it2->second.first], it2->second.second, i++); @@ -100,7 +100,7 @@ return &node_list[initial_id]; } -map & +std::map & MatchExe::getFinals() { return finals; diff -Nru lttoolbox-3.6.6/lttoolbox/match_exe.h lttoolbox-3.7.1/lttoolbox/match_exe.h --- lttoolbox-3.6.6/lttoolbox/match_exe.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/match_exe.h 2022-11-01 
08:36:47.000000000 +0000 @@ -27,7 +27,6 @@ #include #include -using namespace std; /** * Matcher class for execution of lexical recognizing algorithms @@ -43,12 +42,12 @@ /** * MatchNode list */ - vector node_list; + std::vector node_list; /** * Set of final nodes */ - map finals; + std::map finals; /** * Copy function @@ -73,7 +72,7 @@ * @param t the transducer * @param final_type the final types */ - MatchExe(Transducer const &t, map const &final_type); + MatchExe(Transducer const &t, std::map const &final_type); /** * Destructor @@ -103,7 +102,7 @@ * Gets the set of final nodes * @return the set of final nodes */ - map & getFinals(); + std::map & getFinals(); }; #endif diff -Nru lttoolbox-3.6.6/lttoolbox/match_node.cc lttoolbox-3.7.1/lttoolbox/match_node.cc --- lttoolbox-3.6.6/lttoolbox/match_node.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/match_node.cc 2022-11-01 08:36:47.000000000 +0000 @@ -57,6 +57,6 @@ void MatchNode::addTransition(int const i, MatchNode * const d, double w, int pos) { -// transitions[i].insert(make_pair(d, w)); +// transitions[i].insert({d, w}); transitions.add(i, d, w, pos); } diff -Nru lttoolbox-3.6.6/lttoolbox/match_node.h lttoolbox-3.7.1/lttoolbox/match_node.h --- lttoolbox-3.6.6/lttoolbox/match_node.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/match_node.h 2022-11-01 08:36:47.000000000 +0000 @@ -25,10 +25,9 @@ class MatchState; -using namespace std; //class MatchNode; -//typedef map > MNode; +//typedef std::map > MNode; typedef SortedVector MNode; diff -Nru lttoolbox-3.6.6/lttoolbox/match_state.cc lttoolbox-3.7.1/lttoolbox/match_state.cc --- lttoolbox-3.6.6/lttoolbox/match_state.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/match_state.cc 2022-11-01 08:36:47.000000000 +0000 @@ -122,19 +122,19 @@ } int -MatchState::classifyFinals(map const &final_class) const +MatchState::classifyFinals(std::map const &final_class) const { - set empty_set; + std::set empty_set; 
return classifyFinals(final_class, empty_set); } int -MatchState::classifyFinals(map const &final_class, set const &banned_rules) const +MatchState::classifyFinals(std::map const &final_class, std::set const &banned_rules) const { int result = INT_MAX; for (int i = first; i != last; i = (i+1)%BUF_LIMIT) { - map::const_iterator it2 = final_class.find(state[i]); + auto it2 = final_class.find(state[i]); if(it2 != final_class.end()) { if(it2->second < result && banned_rules.find(it2->second) == banned_rules.end()) diff -Nru lttoolbox-3.6.6/lttoolbox/match_state.h lttoolbox-3.7.1/lttoolbox/match_state.h --- lttoolbox-3.6.6/lttoolbox/match_state.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/match_state.h 2022-11-01 08:36:47.000000000 +0000 @@ -24,7 +24,6 @@ #include -using namespace std; /** * Class to represent the current state of transducer processing @@ -104,9 +103,9 @@ */ void init(MatchNode *initial); - int classifyFinals(map const &final_class, set const &banned_rules) const; + int classifyFinals(std::map const &final_class, std::set const &banned_rules) const; - int classifyFinals(map const &final_class) const; + int classifyFinals(std::map const &final_class) const; void debug(); diff -Nru lttoolbox-3.6.6/lttoolbox/node.h lttoolbox-3.7.1/lttoolbox/node.h --- lttoolbox-3.6.6/lttoolbox/node.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/node.h 2022-11-01 08:36:47.000000000 +0000 @@ -25,7 +25,6 @@ class State; class Node; -using namespace std; class Dest { @@ -118,7 +117,7 @@ * The outgoing transitions of this node. 
* Schema: (input symbol, (output symbol, destination, weight)) */ - map transitions; + std::map transitions; /** * Copy method diff -Nru lttoolbox-3.6.6/lttoolbox/pattern_list.cc lttoolbox-3.7.1/lttoolbox/pattern_list.cc --- lttoolbox-3.6.6/lttoolbox/pattern_list.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/pattern_list.cc 2022-11-01 08:36:47.000000000 +0000 @@ -22,10 +22,6 @@ #include #include -UString const PatternList::ANY_CHAR = ""_u; -UString const PatternList::ANY_TAG = ""_u; -UString const PatternList::QUEUE = ""_u; - void PatternList::copy(PatternList const &o) { @@ -80,7 +76,7 @@ { if(sequence) { - cerr << "Error: opening an unended sequence" << endl; + std::cerr << "Error: opening an unended sequence" << std::endl; exit(EXIT_FAILURE); } sequence = true; @@ -92,23 +88,23 @@ { if(!sequence) { - cerr << "Error: ending an unopened sequence" << endl; + std::cerr << "Error: ending an unopened sequence" << std::endl; exit(EXIT_FAILURE); } sequence = false; - for(list >::iterator it = sequence_data.begin(), + for(auto it = sequence_data.begin(), limit = sequence_data.end(); it != limit; it++) { it->push_back(alphabet(QUEUE)); - patterns.insert(make_pair(sequence_id, *it)); + patterns.insert({sequence_id, *it}); } } void -PatternList::insertOutOfSequence(UString const &lemma, UString const &tags, - vector &result) +PatternList::insertOutOfSequence(UStringView lemma, UStringView tags, + std::vector &result) { if(lemma.empty()) { @@ -136,9 +132,9 @@ { for(unsigned int i = 0, limit = tagCount(tags); i < limit; i++) { - UString tag = "<"_u + tagAt(tags, i) + ">"_u; + UString tag = "<"_u + US(tagAt(tags, i)) + ">"_u; - if(tag == "<*>"_u) + if(tag == u"<*>"_uv) { result.push_back(alphabet(ANY_TAG)); } @@ -152,21 +148,20 @@ } void -PatternList::insertIntoSequence(int const id, UString const &lemma, - UString const &tags) +PatternList::insertIntoSequence(int id, UStringView lemma, UStringView tags) { sequence_id = id; if(sequence_data.size() == 0) 
{ - vector new_vector; + std::vector new_vector; insertOutOfSequence(lemma, tags, new_vector); sequence_data.push_back(new_vector); } else { - list >::iterator it = sequence_data.begin(); - list >::iterator limit = sequence_data.end(); + auto it = sequence_data.begin(); + auto limit = sequence_data.end(); for(; it != limit; it++) { it->push_back('+'); @@ -176,14 +171,14 @@ } void -PatternList::insert(int const id, UString const &lemma, UString const &tags) +PatternList::insert(int id, UStringView lemma, UStringView tags) { if(!sequence) { - vector local; + std::vector local; insertOutOfSequence(lemma, tags, local); local.push_back(alphabet(QUEUE)); - patterns.insert(make_pair(id, local)); + patterns.insert({id, local}); } else { @@ -192,11 +187,11 @@ } void -PatternList::insert(int const id, int const otherid) +PatternList::insert(int id, int otherid) { if(!sequence) { - cerr << "Error: using labels outside of a sequence" << endl; + std::cerr << "Error: using labels outside of a sequence" << std::endl; exit(EXIT_FAILURE); } @@ -212,15 +207,15 @@ } else { - list > new_sequence_data; + std::list> new_sequence_data; - for(list >::iterator it = sequence_data.begin(), + for(auto it = sequence_data.begin(), limit = sequence_data.end(); it != limit; it++) { for(PatternRange p = patterns.equal_range(otherid); p.first != p.second; p.first++) { - vector temp = *it; + std::vector temp = *it; temp.push_back('+'); temp.insert(temp.end(), (p.first->second).begin(), (p.first->second).end()); @@ -233,7 +228,7 @@ } int -PatternList::tagCount(UString const &tags) +PatternList::tagCount(UStringView tags) { int count = 0; @@ -252,8 +247,8 @@ return count; } -UString -PatternList::tagAt(UString const &tags, int const index) +UStringView +PatternList::tagAt(UStringView tags, int index) { int start = 0; int end = 0; @@ -282,7 +277,7 @@ if(index > count) { - return ""_u; + return u""; } if(end != 0) { @@ -303,7 +298,7 @@ void PatternList::buildTransducer() { - 
for(PatternStore::const_iterator it = patterns.begin(), limit = patterns.end(); + for(auto it = patterns.begin(), limit = patterns.end(); it != limit; it++) { int state = transducer.getInitial(); @@ -366,7 +361,7 @@ PatternList::write(FILE *output) { alphabet.write(output); - UString const tagger_name = "tagger"_u; + UStringView tagger_name = u"tagger"; Compression::multibyte_write(1, output); Compression::string_write(tagger_name, output); @@ -374,7 +369,7 @@ Compression::multibyte_write(final_type.size(), output); - for(map::const_iterator it = final_type.begin(), limit = final_type.end(); + for(auto it = final_type.begin(), limit = final_type.end(); it != limit; it++) { Compression::multibyte_write(it->first, output); @@ -408,7 +403,7 @@ { alphabet.serialise(serialised); transducer.serialise(serialised); - Serialiser >::serialise(final_type, serialised); + Serialiser >::serialise(final_type, serialised); } void @@ -416,7 +411,7 @@ { alphabet.deserialise(serialised); transducer.deserialise(serialised); - final_type = Deserialiser >::deserialise(serialised); + final_type = Deserialiser >::deserialise(serialised); } MatchExe * diff -Nru lttoolbox-3.6.6/lttoolbox/pattern_list.h lttoolbox-3.7.1/lttoolbox/pattern_list.h --- lttoolbox-3.6.6/lttoolbox/pattern_list.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/pattern_list.h 2022-11-01 08:36:47.000000000 +0000 @@ -26,10 +26,8 @@ #include #include -using namespace std; - -typedef multimap > PatternStore; -typedef pair PatternRange; +typedef std::multimap > PatternStore; +typedef std::pair PatternRange; class PatternList { @@ -37,37 +35,35 @@ Alphabet alphabet; PatternStore patterns; bool sequence; - list > sequence_data; + std::list > sequence_data; Transducer transducer; - map final_type; + std::map final_type; int sequence_id; double default_weight; void copy(PatternList const &o); void destroy(); - void insertOutOfSequence(UString const &lemma, UString const &tags, - vector &result); - void 
insertIntoSequence(int const id, UString const &lemma, - UString const &tags); + void insertOutOfSequence(UStringView lemma, UStringView tags, std::vector &result); + void insertIntoSequence(int id, UStringView lemma, UStringView tags); - static int tagCount(UString const &tags); - static UString tagAt(UString const &tags, int const index); + static int tagCount(UStringView tags); + static UStringView tagAt(UStringView tags, int index); public: /** * This symbol stands for any char */ - static UString const ANY_CHAR; + static constexpr UStringView ANY_CHAR = u""; /** * This symbol stands for any tag */ - static UString const ANY_TAG; + static constexpr UStringView ANY_TAG = u""; /** * This symbol marks a word queue */ - static UString const QUEUE; + static constexpr UStringView QUEUE = u""; /** * Constructor @@ -106,14 +102,14 @@ * @param lemma * @param tags */ - void insert(int const id, UString const &lemma, UString const &tags); + void insert(int id, UStringView lemma, UStringView tags); /** * Insertion method * @param id * @param otherid */ - void insert(int const id, int const otherid); + void insert(int id, int otherid); /** * Get the PatternStore diff -Nru lttoolbox-3.6.6/lttoolbox/regexp_compiler.cc lttoolbox-3.7.1/lttoolbox/regexp_compiler.cc --- lttoolbox-3.6.6/lttoolbox/regexp_compiler.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/regexp_compiler.cc 2022-11-01 08:36:47.000000000 +0000 @@ -18,21 +18,16 @@ #include #include +#include -RegexpCompiler::RegexpCompiler() : -token(0), -index(0), -alphabet(0), -state(0), -letter(0), -postop(0), -default_weight(0.0000) +#define FIN_FICHERO INT_MAX + +RegexpCompiler::RegexpCompiler() { } RegexpCompiler::~RegexpCompiler() { - destroy(); } RegexpCompiler::RegexpCompiler(RegexpCompiler const &rec) @@ -45,7 +40,6 @@ { if(this != &rec) { - destroy(); copy(rec); } @@ -66,11 +60,6 @@ default_weight = rec.default_weight; } -void -RegexpCompiler::destroy() -{ -} - bool RegexpCompiler::isReserved(int 
const t) { @@ -98,14 +87,14 @@ void RegexpCompiler::error() { - cerr << "Error parsing regexp" < const &er) +RegexpCompiler::compile(std::vector const &er) { input = er; token = input[0]; @@ -334,7 +323,7 @@ consume(']'); Postop(); - for(set::iterator it = brackets.begin(); + for(auto it = brackets.begin(); it != brackets.end(); it++) { int mystate = t.getInitial(); diff -Nru lttoolbox-3.6.6/lttoolbox/regexp_compiler.h lttoolbox-3.7.1/lttoolbox/regexp_compiler.h --- lttoolbox-3.6.6/lttoolbox/regexp_compiler.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/regexp_compiler.h 2022-11-01 08:36:47.000000000 +0000 @@ -17,7 +17,6 @@ #ifndef _REGEXP_COMPILER_ #define _REGEXP_COMPILER_ -#include #include #include @@ -25,10 +24,6 @@ #include #include -using namespace std; - -#define FIN_FICHERO - 1 - /** * Compiler that builds a transducer to identify regular expressions. This * compiler is a recursive descendent parser (RDP). @@ -39,22 +34,22 @@ /** * Last token */ - int token; + int token = 0; /** * Input string */ - vector input; + std::vector input; /** * Location in the input string */ - size_t index; + size_t index = 0; /** * Alphabet to encode symbols */ - Alphabet *alphabet; + Alphabet *alphabet = nullptr; /** * Transducer to store analysis @@ -64,27 +59,27 @@ /** * Current state */ - int state; + int state = 0; /** * Current letter */ - int letter; + int letter = 0; /** * Post-operator: '+', '?', '*' */ - UChar32 postop; + UChar32 postop = '\0'; /** * Default value of weight */ - double default_weight; + double default_weight = 0.0000; /** * */ - set brackets; + std::set brackets; /** * Copy method @@ -93,11 +88,6 @@ void copy(RegexpCompiler const &rec); /** - * Destroy method - */ - void destroy(); - - /** * RDP top function */ void S(); @@ -208,7 +198,7 @@ * Function that parses a regular expression and produces a transducer * @param er the regular expression */ - void compile(vector const &er); + void compile(std::vector const &er); /** * Set 
the decoder of symbols diff -Nru lttoolbox-3.6.6/lttoolbox/serialiser.h lttoolbox-3.7.1/lttoolbox/serialiser.h --- lttoolbox-3.6.6/lttoolbox/serialiser.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/serialiser.h 2022-11-01 08:36:47.000000000 +0000 @@ -135,8 +135,7 @@ std::ostream &Output) { ::serialise(static_cast(SerialisedType_.size()), Output); - for (typename std::basic_string::const_iterator - SerialisedType_iterator = SerialisedType_.begin(); + for (auto SerialisedType_iterator = SerialisedType_.begin(); // Call .end() each iteration to save memory. SerialisedType_iterator != SerialisedType_.end(); ++SerialisedType_iterator) { @@ -240,8 +239,7 @@ uint64_t size = SerialisedType_.size(); ::serialise(size, Output); - for (typename Container::const_iterator value_type_ = - SerialisedType_.begin(); + for (auto value_type_ = SerialisedType_.begin(); // Call .end() each iteration to save memory. value_type_ != SerialisedType_.end(); ++value_type_) { ::serialise(*value_type_, Output); diff -Nru lttoolbox-3.6.6/lttoolbox/sorted_vector.cc lttoolbox-3.7.1/lttoolbox/sorted_vector.cc --- lttoolbox-3.6.6/lttoolbox/sorted_vector.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/sorted_vector.cc 2022-11-01 08:36:47.000000000 +0000 @@ -17,7 +17,6 @@ #include #include -using namespace std; void SortedVector::copy(SortedVector const &o) diff -Nru lttoolbox-3.6.6/lttoolbox/sorted_vector.hpp lttoolbox-3.7.1/lttoolbox/sorted_vector.hpp --- lttoolbox-3.6.6/lttoolbox/sorted_vector.hpp 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/sorted_vector.hpp 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,277 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#pragma once +#ifndef c6d28b7452ec699b_SORTED_VECTOR_HPP +#define c6d28b7452ec699b_SORTED_VECTOR_HPP +#include +#include +#include +#include + +namespace detail { + template + bool is_sorted(ForwardIt first, ForwardIt last, Comp comp) { + if (first != last) { + ForwardIt next = first; + while (++next != last) { + if (comp(*next, *first)) { + return false; + } + first = next; + } + } + return true; + } +} + +template> +class sorted_vector { +public: + typedef typename std::vector container; + typedef typename container::iterator iterator; + typedef typename container::const_iterator const_iterator; + typedef typename container::const_reverse_iterator const_reverse_iterator; + typedef typename container::size_type size_type; + typedef T value_type; + typedef T key_type; + + sorted_vector() {} + + sorted_vector(const std::set& o) { + insert(o.begin(), o.end()); + } + + std::pair insert(T t) { + if (elements.empty()) { + elements.push_back(t); + return {elements.begin(), true}; + } + iterator it = std::lower_bound(elements.begin(), elements.end(), t, comp); + size_t at = std::distance(elements.begin(), it); + if (it == elements.end() || comp(*it, t) || comp(t, *it)) { + elements.insert(it, t); + return {elements.begin() + at, true}; + } + return {elements.begin() + at, false}; + } + + template + void insert(It b, It e) { + size_t d = std::distance(b, e); + if (d == 1) { + insert(*b); + return; + } + + static thread_local container merged; + merged.resize(0); + merged.reserve(elements.size() + d); + + if (detail::is_sorted(b, e, comp)) { + std::merge(elements.begin(), elements.end(), b, e, 
std::back_inserter(merged), comp); + } + else { + static thread_local container sorted; + sorted.assign(b, e); + std::sort(sorted.begin(), sorted.end(), comp); + std::merge(elements.begin(), elements.end(), sorted.begin(), sorted.end(), std::back_inserter(merged), comp); + } + + merged.swap(elements); + auto it = std::unique(elements.begin(), elements.end()); + elements.erase(it, elements.end()); + } + + void push_back(T t) { + insert(t); + } + + bool erase(T t) { + if (elements.empty()) { + return false; + } + if (comp(elements.back(), t)) { + return false; + } + if (comp(t, elements.front())) { + return false; + } + auto it = lower_bound(t); + if (it != elements.end() && !comp(*it, t) && !comp(t, *it)) { + elements.erase(it); + return true; + } + return false; + } + + const_iterator erase(const_iterator it) { + size_type o = std::distance(elements.begin(), it); + return elements.erase(elements.begin() + o); + } + + template + void erase(It b, It e) { + for (; b != e; ++b) { + erase(*b); + } + } + + const_iterator find(T t) const { + if (elements.empty()) { + return elements.end(); + } + if (comp(elements.back(), t)) { + return elements.end(); + } + if (comp(t, elements.front())) { + return elements.end(); + } + auto it = lower_bound(t); + if (it != elements.end() && (comp(*it, t) || comp(t, *it))) { + return elements.end(); + } + return it; + } + + size_t count(T t) const { + return (find(t) != end()); + } + + iterator begin() { + return elements.begin(); + } + + iterator end() { + return elements.end(); + } + + const_iterator begin() const { + return elements.begin(); + } + + const_iterator end() const { + return elements.end(); + } + + const_iterator cbegin() const { + return elements.cbegin(); + } + + const_iterator cend() const { + return elements.cend(); + } + + const_reverse_iterator rbegin() const { + return elements.rbegin(); + } + + const_reverse_iterator rend() const { + return elements.rend(); + } + + T front() const { + return elements.front(); + } + 
+ T back() const { + return elements.back(); + } + + iterator lower_bound(T t) { + return std::lower_bound(elements.begin(), elements.end(), t, comp); + } + + const_iterator lower_bound(T t) const { + return std::lower_bound(elements.begin(), elements.end(), t, comp); + } + + const_iterator upper_bound(T t) const { + return std::upper_bound(elements.begin(), elements.end(), t, comp); + } + + bool intersects(const sorted_vector& other) const { + auto ti = begin(); + auto oi = other.begin(); + auto te = end(); + auto oe = other.end(); + while (ti != te && oi != oe) { + if (*ti == *oi) { + return true; + } + else if (comp(*ti, *oi)) { + ++ti; + } + else { + ++oi; + } + } + return false; + } + + size_type size() const { + return elements.size(); + } + + size_type capacity() const { + return elements.capacity(); + } + + bool empty() const { + return elements.empty(); + } + + template + void assign(It b, It e) { + clear(); + insert(b, e); + } + + void assign(const_iterator b, const_iterator e) { + elements.assign(b, e); + } + + void swap(sorted_vector& other) { + elements.swap(other.elements); + } + + void clear() { + elements.clear(); + } + + void sort() { + std::sort(elements.begin(), elements.end(), Comp()); + } + + void pop_back() { + elements.pop_back(); + } + + container& get() { + return elements; + } + + bool operator<(const sorted_vector& o) const { + return elements < o.elements; + } + +private: + container elements; + Comp comp; +}; + +#endif diff -Nru lttoolbox-3.6.6/lttoolbox/state.cc lttoolbox-3.7.1/lttoolbox/state.cc --- lttoolbox-3.6.6/lttoolbox/state.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/state.cc 2022-11-01 08:36:47.000000000 +0000 @@ -15,6 +15,7 @@ * along with this program; if not, see . 
*/ #include +#include #include #include @@ -22,7 +23,6 @@ //debug// //#include -//using namespace std; //debug// State::State() @@ -75,13 +75,13 @@ for(size_t i = 0, limit = state.size(); i != limit; i++) { - vector> *tmp = new vector>(); + std::vector> *tmp = new std::vector>(); *tmp = *(state[i].sequence); state[i].sequence = tmp; } } -int +size_t State::size() const { return state.size(); @@ -91,25 +91,25 @@ State::init(Node *initial) { state.clear(); - state.push_back(TNodeState(initial, new vector>(), false)); + state.push_back(TNodeState(initial, new std::vector>(), false)); state[0].sequence->clear(); epsilonClosure(); } bool -State::apply_into(vector* new_state, int const input, int index, bool dirty) +State::apply_into(std::vector* new_state, int const input, int index, bool dirty) { - map::const_iterator it; + std::map::const_iterator it; it = state[index].where->transitions.find(input); if(it != state[index].where->transitions.end()) { for(int j = 0; j != it->second.size; j++) { - vector> *new_v = new vector>(); + std::vector> *new_v = new std::vector>(); *new_v = *(state[index].sequence); if(it->first != 0) { - new_v->push_back(make_pair(it->second.out_tag[j], it->second.out_weight[j])); + new_v->push_back({it->second.out_tag[j], it->second.out_weight[j]}); } new_state->push_back(TNodeState(it->second.dest[j], new_v, state[index].dirty||dirty)); } @@ -119,25 +119,25 @@ } bool -State::apply_into_override(vector* new_state, int const input, int const old_sym, int const new_sym, int index, bool dirty) +State::apply_into_override(std::vector* new_state, int const input, int const old_sym, int const new_sym, int index, bool dirty) { - map::const_iterator it; + std::map::const_iterator it; it = state[index].where->transitions.find(input); if(it != state[index].where->transitions.end()) { for(int j = 0; j != it->second.size; j++) { - vector> *new_v = new vector>(); + std::vector> *new_v = new std::vector>(); *new_v = *(state[index].sequence); if(it->first != 
0) { if(it->second.out_tag[j] == old_sym) { - new_v->push_back(make_pair(new_sym, it->second.out_weight[j])); + new_v->push_back({new_sym, it->second.out_weight[j]}); } else { - new_v->push_back(make_pair(it->second.out_tag[j], it->second.out_weight[j])); + new_v->push_back({it->second.out_tag[j], it->second.out_weight[j]}); } } new_state->push_back(TNodeState(it->second.dest[j], new_v, state[index].dirty||dirty)); @@ -156,7 +156,7 @@ return; } - vector new_state; + std::vector new_state; for(size_t i = 0, limit = state.size(); i != limit; i++) { apply_into(&new_state, input, i, false); @@ -175,7 +175,7 @@ return; } - vector new_state; + std::vector new_state; for(size_t i = 0, limit = state.size(); i != limit; i++) { apply_into_override(&new_state, input, old_sym, new_sym, i, false); @@ -201,7 +201,7 @@ return; } - vector new_state; + std::vector new_state; for(size_t i = 0, limit = state.size(); i != limit; i++) { apply_into_override(&new_state, input, old_sym, new_sym, i, false); @@ -222,7 +222,7 @@ return; } - vector new_state; + std::vector new_state; if(input == alt) { apply(input); @@ -248,7 +248,7 @@ return; } - vector new_state; + std::vector new_state; for(size_t i = 0, limit = state.size(); i != limit; i++) { if(!apply_into(&new_state, input, i, false)) @@ -266,17 +266,16 @@ { for(size_t i = 0; i != state.size(); i++) { - map::iterator it2; - it2 = state[i].where->transitions.find(0); + auto it2 = state[i].where->transitions.find(0); if(it2 != state[i].where->transitions.end()) { for(int j = 0 ; j != it2->second.size; j++) { - vector> *tmp = new vector>(); + std::vector> *tmp = new std::vector>(); *tmp = *(state[i].sequence); if(it2->second.out_tag[j] != 0) { - tmp->push_back(make_pair(it2->second.out_tag[j], it2->second.out_weight[j])); + tmp->push_back({it2->second.out_tag[j], it2->second.out_weight[j]}); } state.push_back(TNodeState(it2->second.dest[j], tmp, state[i].dirty)); } @@ -287,7 +286,7 @@ void State::apply(int const input, int const alt1, int 
const alt2) { - vector new_state; + std::vector new_state; if(input == 0 || alt1 == 0 || alt2 == 0) { state = new_state; @@ -317,11 +316,11 @@ } void -State::apply(int const input, set const alts) +State::apply(int const input, std::set const alts) { - vector new_state; + std::vector new_state; bool has_null = false; - for(set::iterator sit = alts.begin(); sit != alts.end(); sit++) + for(auto sit = alts.begin(); sit != alts.end(); sit++) { if(*sit == 0) { @@ -337,7 +336,7 @@ for(size_t i = 0, limit = state.size(); i != limit; i++) { apply_into(&new_state, input, i, false); - for(set::iterator sit = alts.begin(); sit != alts.end(); sit++) + for(auto sit = alts.begin(); sit != alts.end(); sit++) { if(*sit == input) continue; apply_into(&new_state, *sit, i, true); @@ -392,7 +391,7 @@ } void -State::step(int const input, set const alts) +State::step(int const input, std::set const alts) { apply(input, alts); epsilonClosure(); @@ -422,8 +421,19 @@ } +void +State::step_case_override(UChar32 val, bool caseSensitive) +{ + if (!u_isupper(val) || caseSensitive) { + step(val); + } else { + step_override(val, u_tolower(val), u_tolower(val), val); + } +} + + bool -State::isFinal(map const &finals) const +State::isFinal(std::map const &finals) const { for(size_t i = 0, limit = state.size(); i != limit; i++) { @@ -437,19 +447,19 @@ } -vector> -State::NFinals(vector> lf, int maxAnalyses, int maxWeightClasses) const +std::vector> +State::NFinals(std::vector> lf, int maxAnalyses, int maxWeightClasses) const { - vector> result; + std::vector> result; sort(lf.begin(), lf.end(), sort_weights()); - for(vector >::iterator it = lf.begin(); it != lf.end(); it++) + for(auto it = lf.begin(); it != lf.end(); it++) { double last_weight = 0.0000; if(maxAnalyses > 0 && maxWeightClasses > 0) { - result.push_back(make_pair(it->first, it->second)); + result.push_back({it->first, it->second}); maxAnalyses--; if(last_weight!=it->second) { @@ -463,13 +473,13 @@ UString -State::filterFinals(map const 
&finals, +State::filterFinals(std::map const &finals, Alphabet const &alphabet, - set const &escaped_chars, + std::set const &escaped_chars, bool display_weights, int max_analyses, int max_weight_classes, bool uppercase, bool firstupper, int firstchar) const { - vector> response; + std::vector> response; UString result; double cost = 0.0000; @@ -522,15 +532,15 @@ // Add the weight of the final state cost += (*(finals.find(state[i].where))).second; - response.push_back(make_pair(result, cost)); + response.push_back({result, cost}); } } response = NFinals(response, max_analyses, max_weight_classes); result.clear(); - set seen; - for(vector>::iterator it = response.begin(); it != response.end(); it++) + std::set seen; + for(auto it = response.begin(); it != response.end(); it++) { if(seen.find(it->first) != seen.end()) { continue; @@ -551,15 +561,15 @@ } -set > > -State::filterFinalsLRX(map const &finals, +std::set > > +State::filterFinalsLRX(std::map const &finals, Alphabet const &alphabet, - set const &escaped_chars, + std::set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const { - set > > results; + std::set > > results; - vector current_result; + std::vector current_result; UString rule_id; for(size_t i = 0, limit = state.size(); i != limit; i++) @@ -577,7 +587,7 @@ } UString sym; alphabet.getSymbol(sym, ((*(state[i].sequence))[j]).first, uppercase); - if(sym == "<$>"_u) + if(sym == u"<$>"_uv) { if(!current_word.empty()) { @@ -591,7 +601,7 @@ } } rule_id = current_word; - results.insert(make_pair(rule_id, current_result)); + results.insert({rule_id, current_result}); } } @@ -600,9 +610,9 @@ UString -State::filterFinalsSAO(map const &finals, +State::filterFinalsSAO(std::map const &finals, Alphabet const &alphabet, - set const &escaped_chars, + std::set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const { UString result; @@ -652,10 +662,10 @@ } UString -State::filterFinalsTM(map const &finals, 
+State::filterFinalsTM(std::map const &finals, Alphabet const &alphabet, - set const &escaped_chars, - queue &blankqueue, vector &numbers) const + std::set const &escaped_chars, + std::queue &blankqueue, std::vector &numbers) const { UString result; @@ -677,7 +687,7 @@ UString result2; - vector fragment; + std::vector fragment; fragment.push_back(""_u); for(unsigned int i = 0, limit = result.size(); i != limit ; i++) @@ -696,7 +706,7 @@ { if(i != limit -1) { - if(fragment[i].size() >=2 && fragment[i].substr(fragment[i].size()-2) == "(#"_u) + if(fragment[i].size() >=2 && StringUtils::endswith(fragment[i], u"(#")) { UString whitespace = " "_u; if(blankqueue.size() != 0) @@ -768,7 +778,7 @@ for(unsigned int i = 0; i> seq = *state.at(i).sequence; + std::vector> seq = *state.at(i).sequence; if(lastPartHasRequiredSymbol(seq, requiredSymbol, separationSymbol)) { @@ -785,7 +795,7 @@ } // remove states with more than minimum number of compounds (or without the requiered symbol in the last part) - vector::iterator it = state.begin(); + auto it = state.begin(); int i=0; while(it != state.end()) { @@ -809,10 +819,10 @@ void State::pruneStatesWithForbiddenSymbol(int forbiddenSymbol) { - vector::iterator it = state.begin(); + auto it = state.begin(); while(it != state.end()) { - vector> *seq = (*it).sequence; + std::vector> *seq = (*it).sequence; bool found = false; for(int i = seq->size()-1; i>=0; i--) { @@ -838,7 +848,7 @@ for(size_t i = 0; i>* seq = state.at(i).sequence; + std::vector>* seq = state.at(i).sequence; if(seq != NULL) for (unsigned int j=0; jsize(); j++) { int symbol=(seq->at(j)).first; @@ -853,7 +863,7 @@ bool -State::lastPartHasRequiredSymbol(const vector> &seq, int requiredSymbol, int separationSymbol) +State::lastPartHasRequiredSymbol(const std::vector> &seq, int requiredSymbol, int separationSymbol) { // state is final - it should be restarted it with all elements in stateset restart_state, with old symbols conserved bool restart=false; @@ -875,7 +885,7 @@ 
void -State::restartFinals(const map &finals, int requiredSymbol, State *restart_state, int separationSymbol) +State::restartFinals(const std::map &finals, int requiredSymbol, State *restart_state, int separationSymbol) { for(unsigned int i=0; istate.size(); j++) { TNodeState initst = restart_state->state.at(j); - vector> *tnvec = new vector>; + std::vector> *tnvec = new std::vector>; for(unsigned int k=0; k < state_i.sequence->size(); k++) { tnvec->push_back(state_i.sequence->at(k)); } TNodeState tn(initst.where, tnvec, state_i.dirty); - tn.sequence->push_back(make_pair(separationSymbol, 0.0000)); + tn.sequence->push_back({separationSymbol, 0.0}); state.push_back(tn); } } @@ -919,7 +929,7 @@ for(unsigned int i=0; i>* seq = state.at(i).sequence; + std::vector>* seq = state.at(i).sequence; if(seq != NULL) for (unsigned int j=0; jsize(); j++) { UString ws; diff -Nru lttoolbox-3.6.6/lttoolbox/state.h lttoolbox-3.7.1/lttoolbox/state.h --- lttoolbox-3.6.6/lttoolbox/state.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/state.h 2022-11-01 08:36:47.000000000 +0000 @@ -32,7 +32,6 @@ #include -using namespace std; /** * Class to represent the current state of transducer processing @@ -46,11 +45,11 @@ struct TNodeState { Node *where; - vector> *sequence; + std::vector> *sequence; // a state is "dirty" if it was introduced at runtime (case variants, etc.) 
bool dirty; - TNodeState(Node * const &w, vector> * const &s, bool const &d): where(w), sequence(s), dirty(d){} + TNodeState(Node * const &w, std::vector> * const &s, bool const &d): where(w), sequence(s), dirty(d){} TNodeState(const TNodeState& other) : where(other.where) @@ -67,7 +66,7 @@ } }; - vector state; + std::vector state; /** * Destroy function @@ -78,9 +77,9 @@ * Helper functions for the various apply()s to reduce code duplication * @return whether any transitions were made */ - bool apply_into(vector* new_state, int const input, int index, bool dirty); + bool apply_into(std::vector* new_state, int const input, int index, bool dirty); - bool apply_into_override(vector* new_state, int const input, int const old_sym, int const new_sym, int index, bool dirty); + bool apply_into_override(std::vector* new_state, int const input, int const old_sym, int const new_sym, int index, bool dirty); /** * Make a transition, version for lowercase letters and symbols @@ -102,7 +101,7 @@ * @param input the input symbol * @param alts set of alternative input symbols */ - void apply(int const input, set const alts); + void apply(int const input, std::set const alts); /** * Make a transition, only applying lowercase version if @@ -128,7 +127,7 @@ */ void epsilonClosure(); - bool lastPartHasRequiredSymbol(const vector> &seq, int requiredSymbol, int separationSymbol); + bool lastPartHasRequiredSymbol(const std::vector> &seq, int requiredSymbol, int separationSymbol); public: @@ -166,7 +165,7 @@ * Number of alive transductions * @return the size */ - int size() const; + size_t size() const; /** * step = apply + epsilonClosure @@ -188,7 +187,7 @@ * @param input the input symbol * @param alt the alternative input symbols */ - void step(int const input, set const alts); + void step(int const input, std::set const alts); void step_case(UChar32 val, bool caseSensitive); @@ -200,6 +199,8 @@ void step_override(int const input, int const alt, int const old_sym, int const new_sym); + 
void step_case_override(const int val, const bool caseSensitive); + /** * Init the state with the initial node and empty output * @param initial the initial node of the transducer @@ -238,13 +239,13 @@ template struct sort_weights { - typedef pair type; + typedef std::pair type; bool operator ()(type const& a, type const& b) const { return a.second < b.second; } }; - vector> NFinals(vector> lf, + std::vector> NFinals(std::vector> lf, int maxAnalyses, int maxWeightClasses) const; @@ -260,9 +261,9 @@ * @param firstchar first character of the word * @return the result of the transduction */ - UString filterFinals(map const &finals, + UString filterFinals(std::map const &finals, Alphabet const &a, - set const &escaped_chars, + std::set const &escaped_chars, bool display_weights = false, int max_analyses = INT_MAX, int max_weight_classes = INT_MAX, @@ -281,9 +282,9 @@ * @param firstchar first character of the word * @return the result of the transduction */ - UString filterFinalsSAO(map const &finals, + UString filterFinalsSAO(std::map const &finals, Alphabet const &a, - set const &escaped_chars, + std::set const &escaped_chars, bool uppercase = false, bool firstupper = false, int firstchar = 0) const; @@ -301,9 +302,9 @@ * @return the result of the transduction */ - set > > filterFinalsLRX(map const &finals, + std::set > > filterFinalsLRX(std::map const &finals, Alphabet const &a, - set const &escaped_chars, + std::set const &escaped_chars, bool uppercase = false, bool firstupper = false, int firstchar = 0) const; @@ -320,7 +321,7 @@ * @param restart_state * @param separationSymbol */ - void restartFinals(const map &finals, int requiredSymbol, State *restart_state, int separationSymbol); + void restartFinals(const std::map &finals, int requiredSymbol, State *restart_state, int separationSymbol); /** @@ -329,18 +330,18 @@ * @param finals set of final nodes @return * @true if the state is final */ - bool isFinal(map const &finals) const; + bool isFinal(std::map const 
&finals) const; /** * Return the full states string (to allow debuging...) using a Java ArrayList.toString style */ UString getReadableString(const Alphabet &a); - UString filterFinalsTM(map const &finals, + UString filterFinalsTM(std::map const &finals, Alphabet const &alphabet, - set const &escaped_chars, - queue &blanks, - vector &numbers) const; + std::set const &escaped_chars, + std::queue &blanks, + std::vector &numbers) const; }; diff -Nru lttoolbox-3.6.6/lttoolbox/string_utils.cc lttoolbox-3.7.1/lttoolbox/string_utils.cc --- lttoolbox-3.6.6/lttoolbox/string_utils.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/string_utils.cc 2022-11-01 08:36:47.000000000 +0000 @@ -4,9 +4,10 @@ #include #include #include +#include -UString -StringUtils::trim(const UString& str) +UStringView +StringUtils::trim(UStringView str) { if (str.empty()) { return str; @@ -16,17 +17,17 @@ size_t i = 0; UChar32 c; while (begin < end) { - U16_GET(str.c_str(), begin, i, end, c); + U16_GET(str.data(), begin, i, end, c); if (!u_isspace(c)) { begin = i; break; } else { - U16_FWD_1(str.c_str(), i, end); + U16_FWD_1(str.data(), i, end); } } i = str.size(); - U16_BACK_1(str.c_str(), 0, i); - U16_GET(str.c_str(), 0, i, end, c); + U16_BACK_1(str.data(), 0, i); + U16_GET(str.data(), 0, i, end, c); if (!u_isspace(c)) { if (begin == 0) { return str; @@ -36,8 +37,8 @@ } while (end > begin) { end = i; - U16_BACK_1(str.c_str(), 0, i); - U16_GET(str.c_str(), 0, i, str.size(), c); + U16_BACK_1(str.data(), 0, i); + U16_GET(str.data(), 0, i, str.size(), c); if (!u_isspace(c)) { break; } @@ -46,20 +47,20 @@ } std::vector -StringUtils::split(const UString& str, const UString& delim) +StringUtils::split(UStringView str, UStringView delim) { size_t pos = 0; size_t new_pos; std::vector result; while (pos < str.size()) { new_pos = str.find(delim, pos); - if (new_pos == UString::npos) { + if (new_pos == UStringView::npos) { new_pos = str.size(); } if (new_pos > pos) { // if we have a 
non-empty substring between this delimiter // and the last one - result.push_back(str.substr(pos, new_pos-pos)); + result.push_back(US(str.substr(pos, new_pos-pos))); } pos = new_pos + delim.size(); } @@ -67,7 +68,7 @@ } UString -StringUtils::join(const std::vector& vec, const UString& delim) +StringUtils::join(const std::vector& vec, UStringView delim) { UString s; for (auto& piece : vec) { @@ -80,11 +81,11 @@ } UString -StringUtils::substitute(const UString& str, const UString& olds, const UString& news) +StringUtils::substitute(UStringView str, UStringView olds, UStringView news) { - UString s = str; + UString s{str}; size_t p = s.find(olds, 0); - while (p != UString::npos) { + while (p != UStringView::npos) { s.replace(p, olds.length(), news); p += news.length(); p = s.find(olds, p); @@ -132,6 +133,14 @@ { double ret; int c = u_sscanf(str.c_str(), "%lf", &ret); + if (str.size() == 3 && str[0] == 'i' && str[1] == 'n' && str[2] == 'f') { + ret = std::numeric_limits::infinity(); + c = 1; + } + if (str.size() == 4 && str[0] == '-' && str[1] == 'i' && str[2] == 'n' && str[3] == 'f') { + ret = -1*std::numeric_limits::infinity(); + c = 1; + } if (c != 1) { throw std::invalid_argument("unable to parse float"); } @@ -139,11 +148,11 @@ } UString -StringUtils::tolower(const UString& str) +StringUtils::tolower(UStringView str) { UChar buf[str.size()*2]; UErrorCode err = U_ZERO_ERROR; - u_strToLower(buf, str.size()*2, str.c_str(), str.size(), NULL, &err); + u_strToLower(buf, str.size()*2, str.data(), str.size(), NULL, &err); if (U_FAILURE(err)) { std::cerr << "Error: unable to lowercase string '" << str << "'.\n"; std::cerr << "error code: " << u_errorName(err) << std::endl; @@ -153,11 +162,11 @@ } UString -StringUtils::toupper(const UString& str) +StringUtils::toupper(UStringView str) { UChar buf[str.size()*2]; UErrorCode err = U_ZERO_ERROR; - u_strToUpper(buf, str.size()*2, str.c_str(), str.size(), NULL, &err); + u_strToUpper(buf, str.size()*2, str.data(), str.size(), 
NULL, &err); if (U_FAILURE(err)) { std::cerr << "Error: unable to uppercase string '" << str << "'.\n"; std::cerr << "error code: " << u_errorName(err) << std::endl; @@ -167,11 +176,11 @@ } UString -StringUtils::totitle(const UString& str) +StringUtils::totitle(UStringView str) { UChar buf[str.size()*2]; UErrorCode err = U_ZERO_ERROR; - u_strToTitle(buf, str.size()*2, str.c_str(), str.size(), NULL, NULL, &err); + u_strToTitle(buf, str.size()*2, str.data(), str.size(), NULL, NULL, &err); if (U_FAILURE(err)) { std::cerr << "Error: unable to titlecase string '" << str << "'.\n"; std::cerr << "error code: " << u_errorName(err) << std::endl; @@ -181,7 +190,7 @@ } UString -StringUtils::getcase(const UString& str) +StringUtils::getcase(UStringView str) { UString ret = "aa"_u; if (str.empty()) { @@ -190,12 +199,12 @@ size_t i = 0; size_t l = str.size(); UChar32 c; - U16_NEXT(str.c_str(), i, l, c); + U16_NEXT(str.data(), i, l, c); if (u_isupper(c)) { ret[0] = 'A'; if (i < l) { - U16_BACK_1(str.c_str(), i, l); // decrements l - U16_GET(str.c_str(), 0, l, str.size(), c); + U16_BACK_1(str.data(), i, l); // decrements l + U16_GET(str.data(), 0, l, str.size(), c); if (u_isupper(c)) { ret[1] = 'A'; } @@ -205,21 +214,21 @@ } UString -StringUtils::copycase(const UString& source, const UString& target) +StringUtils::copycase(UStringView source, UStringView target) { if (source.empty() || target.empty()) { - return target; + return US(target); } size_t i = 0; size_t l = source.size(); UChar32 c; - U16_NEXT(source.c_str(), i, l, c); + U16_NEXT(source.data(), i, l, c); bool firstupper = u_isupper(c); bool uppercase = false; if (firstupper) { if (i != l) { - U16_BACK_1(source.c_str(), i, l); // decrements l - U16_GET(source.c_str(), 0, l, source.size(), c); + U16_BACK_1(source.data(), i, l); // decrements l + U16_GET(source.data(), 0, l, source.size(), c); uppercase = u_isupper(c); } } @@ -235,10 +244,10 @@ } bool -StringUtils::caseequal(const UString& a, const UString& b) 
+StringUtils::caseequal(UStringView a, UStringView b) { UErrorCode err = U_ZERO_ERROR; - int cmp = u_strCaseCompare(a.c_str(), -1, b.c_str(), -1, 0, &err); + int cmp = u_strCaseCompare(a.data(), a.size(), b.data(), b.size(), 0, &err); if (U_FAILURE(err)) { std::cerr << "Error: caseless string comparison failed on '"; std::cerr << a << "' and '" << b << "'" << std::endl; @@ -247,3 +256,28 @@ } return (cmp == 0); } + +bool +StringUtils::startswith(UStringView str, UStringView prefix) +{ + return (prefix.size() <= str.size() && + str.substr(0, prefix.size()) == prefix); +} + +bool +StringUtils::endswith(UStringView str, UStringView suffix) +{ + return (suffix.size() <= str.size() && + str.substr(str.size()-suffix.size()) == suffix); +} + +UString +StringUtils::merge_wblanks(UStringView w1, UStringView w2) +{ + if (w1.empty()) return US(w2); + if (w2.empty()) return US(w1); + UString ret = US(w1.substr(0, w1.size()-2)); + ret += "; "_u; + ret += w2.substr(2); + return ret; +} diff -Nru lttoolbox-3.6.6/lttoolbox/string_utils.h lttoolbox-3.7.1/lttoolbox/string_utils.h --- lttoolbox-3.6.6/lttoolbox/string_utils.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/string_utils.h 2022-11-01 08:36:47.000000000 +0000 @@ -7,16 +7,16 @@ class StringUtils { public: // delete leading and trailing whitespace - static UString trim(const UString& str); + static UStringView trim(UStringView str); // split string on delimiter - static std::vector split(const UString& str, const UString& delim); + static std::vector split(UStringView str, UStringView delim=u" "); // inverse of split - static UString join(const std::vector& vec, const UString& delim); + static UString join(const std::vector& vec, UStringView delim); // replace each occurrence of olds with news - static UString substitute(const UString& str, const UString& olds, const UString& news); + static UString substitute(UStringView str, UStringView olds, UStringView news); static UString itoa(int n); static 
std::string itoa_string(int n); @@ -25,14 +25,19 @@ static int stoi(const UString& str); static double stod(const UString& str); - static UString tolower(const UString& str); - static UString toupper(const UString& str); - static UString totitle(const UString& str); + static UString tolower(UStringView str); + static UString toupper(UStringView str); + static UString totitle(UStringView str); - static UString getcase(const UString& str); - static UString copycase(const UString& source, const UString& target); + static UString getcase(UStringView str); + static UString copycase(UStringView source, UStringView target); - static bool caseequal(const UString& a, const UString& b); + static bool caseequal(UStringView a, UStringView b); + + static bool startswith(UStringView str, UStringView prefix); + static bool endswith(UStringView str, UStringView suffix); + + static UString merge_wblanks(UStringView w1, UStringView w2); }; #endif // __LT_STRING_UTILS_H__ diff -Nru lttoolbox-3.6.6/lttoolbox/tmx_compiler.cc lttoolbox-3.7.1/lttoolbox/tmx_compiler.cc --- lttoolbox-3.6.6/lttoolbox/tmx_compiler.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/tmx_compiler.cc 2022-11-01 08:36:47.000000000 +0000 @@ -24,23 +24,6 @@ #include #include -using namespace std; - -UString const TMXCompiler::TMX_COMPILER_TMX_ELEM = "tmx"_u; -UString const TMXCompiler::TMX_COMPILER_HEADER_ELEM = "header"_u; -UString const TMXCompiler::TMX_COMPILER_BODY_ELEM = "body"_u; -UString const TMXCompiler::TMX_COMPILER_TU_ELEM = "tu"_u; -UString const TMXCompiler::TMX_COMPILER_TUV_ELEM = "tuv"_u; -UString const TMXCompiler::TMX_COMPILER_HI_ELEM = "hi"_u; -UString const TMXCompiler::TMX_COMPILER_PH_ELEM = "ph"_u; -UString const TMXCompiler::TMX_COMPILER_XMLLANG_ATTR = "xml:lang"_u; -UString const TMXCompiler::TMX_COMPILER_LANG_ATTR = "lang"_u; -UString const TMXCompiler::TMX_COMPILER_SEG_ELEM = "seg"_u; -UString const TMXCompiler::TMX_COMPILER_PROP_ELEM = "prop"_u; -UString const 
TMXCompiler::TMX_COMPILER_TEXT_NODE = "#text"_u; -UString const TMXCompiler::TMX_COMPILER_COMMENT_NODE = "#comment"_u; -UString const TMXCompiler::TMX_COMPILER_NUMBER_TAG = ""_u; -UString const TMXCompiler::TMX_COMPILER_BLANK_TAG = ""_u; TMXCompiler::TMXCompiler() : reader(0), @@ -58,14 +41,14 @@ } void -TMXCompiler::parse(string const &file, UString const &lo, UString const &lm) +TMXCompiler::parse(std::string const &file, UStringView lo, UStringView lm) { origin_language = lo; meta_language = lm; reader = xmlReaderForFile(file.c_str(), NULL, 0); if(reader == NULL) { - cerr << "Error: Cannot open '" << file << "'." << endl; + std::cerr << "Error: Cannot open '" << file << "'." << std::endl; exit(EXIT_FAILURE); } @@ -78,7 +61,7 @@ if(ret != 0) { - cerr << "Error: Parse error at the end of input." << endl; + std::cerr << "Error: Parse error at the end of input." << std::endl; } xmlFreeTextReader(reader); @@ -89,12 +72,12 @@ } void -TMXCompiler::requireEmptyError(UString const &name) +TMXCompiler::requireEmptyError(UStringView name) { if(!xmlTextReaderIsEmptyElement(reader)) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Non-empty element '<" << name << ">' should be empty." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Non-empty element '<" << name << ">' should be empty." << std::endl; exit(EXIT_FAILURE); } } @@ -122,8 +105,8 @@ { if(!allBlanks()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid construction." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid construction." 
<< std::endl; exit(EXIT_FAILURE); } } @@ -134,7 +117,7 @@ } void -TMXCompiler::skip(UString &name, UString const &elem) +TMXCompiler::skip(UString &name, UStringView elem) { xmlTextReaderRead(reader); name = XMLParseUtil::readName(reader); @@ -145,8 +128,8 @@ { if(!allBlanks()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid construction." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid construction." << std::endl; exit(EXIT_FAILURE); } } @@ -156,28 +139,27 @@ if(name != elem) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Expected '<" << elem << ">'." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Expected '<" << elem << ">'." << std::endl; exit(EXIT_FAILURE); } } UString -TMXCompiler::attrib(UString const &name) +TMXCompiler::attrib(UStringView name) { return XMLParseUtil::attrib(reader, name); } void -TMXCompiler::requireAttribute(UString const &value, UString const &attrname, - UString const &elemname) +TMXCompiler::requireAttribute(UStringView value, UStringView attrname, UStringView elemname) { if(value.empty()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): '<" << elemname; - cerr << "' element must specify non-void '"; - cerr << attrname << "' attribute." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): '<" << elemname; + std::cerr << "' element must specify non-void '"; + std::cerr << attrname << "' attribute." 
<< std::endl; exit(EXIT_FAILURE); } } @@ -191,7 +173,7 @@ } void -TMXCompiler::insertTU(vector const &origin, vector const &meta) +TMXCompiler::insertTU(std::vector const &origin, std::vector const &meta) { if(origin.size() < 5 || meta.size() < 5) { @@ -228,7 +210,7 @@ } void -TMXCompiler::split(vector const &v, vector > &sv, int const symbol) const +TMXCompiler::split(std::vector const &v, std::vector > &sv, int const symbol) const { sv.clear(); @@ -236,7 +218,7 @@ { if(sv.size() == j) { - sv.push_back(vector()); + sv.push_back(std::vector()); } if(v[i] == symbol) { @@ -249,10 +231,10 @@ } } -vector -TMXCompiler::join(vector > const &v, int const s) const +std::vector +TMXCompiler::join(std::vector > const &v, int const s) const { - vector result; + std::vector result; for(unsigned int i = 0, limit = v.size(); i != limit; i++) { for(unsigned int j = 0, limit2 = v[i].size(); j != limit2; j++) @@ -269,12 +251,12 @@ } void -TMXCompiler::align_blanks(vector &o, vector &m) +TMXCompiler::align_blanks(std::vector &o, std::vector &m) { - vector puntos; - vector resultado_o, resultado_m; + std::vector puntos; + std::vector resultado_o, resultado_m; - vector > so, sm; + std::vector > so, sm; split(o, so, blank_tag); split(m, sm, blank_tag); @@ -328,9 +310,9 @@ { UString name = XMLParseUtil::readName(reader); int type = xmlTextReaderNodeType(reader); - vector origin; - vector meta; - vector foo; + std::vector origin; + std::vector meta; + std::vector foo; while(name != TMX_COMPILER_TU_ELEM || type != XML_READER_TYPE_END_ELEMENT) { @@ -341,7 +323,7 @@ l = attrib(TMX_COMPILER_LANG_ATTR); } - vector *ref; + std::vector *ref; if(l == meta_language) { ref = &meta; @@ -424,8 +406,8 @@ } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid node '<" << name << ">'." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid node '<" << name << ">'." 
<< std::endl; exit(EXIT_FAILURE); } } @@ -448,13 +430,13 @@ Compression::multibyte_write(0, output); // keeping file format transducer.write(output); - cout << origin_language << "->" << meta_language << " "; - cout << transducer.size() << " " << transducer.numberOfTransitions(); - cout << endl; + std::cout << origin_language << "->" << meta_language << " "; + std::cout << transducer.size() << " " << transducer.numberOfTransitions(); + std::cout << std::endl; } void -TMXCompiler::trim(vector &v) const +TMXCompiler::trim(std::vector &v) const { while(v.size() > 0) { @@ -469,7 +451,7 @@ } bool principio = true; - vector aux; + std::vector aux; for(auto c : v) { if(!u_isspace(c) || !principio) @@ -483,11 +465,11 @@ } void -TMXCompiler::align(vector &origin, vector &meta) +TMXCompiler::align(std::vector &origin, std::vector &meta) { - vector numbers_origin_start, + std::vector numbers_origin_start, numbers_origin_length; - vector modified_origin, modified_meta; + std::vector modified_origin, modified_meta; // compile information from origin for(unsigned int i = 0, limit = origin.size(); i != limit; i++) @@ -561,7 +543,7 @@ } unsigned int -TMXCompiler::numberLength(vector &v, unsigned int const position) const +TMXCompiler::numberLength(std::vector &v, unsigned int const position) const { for(unsigned int i = position, limit = v.size(); i < limit; i++) { @@ -600,9 +582,9 @@ } bool -TMXCompiler::vectorcmp(vector const &orig, unsigned int const begin_orig, - vector const &meta, unsigned int const begin_meta, - unsigned const int length) const +TMXCompiler::vectorcmp(std::vector const &orig, unsigned int begin_orig, + std::vector const &meta, unsigned int begin_meta, + unsigned int length) const { for(unsigned int i = begin_orig, j = begin_meta, count = 0; count != length; i++, j++, count++) @@ -617,13 +599,13 @@ } void -TMXCompiler::setOriginLanguageCode(UString const &code) +TMXCompiler::setOriginLanguageCode(UStringView code) { // nada } void 
-TMXCompiler::setMetaLanguageCode(UString const &code) +TMXCompiler::setMetaLanguageCode(UStringView code) { // nada } diff -Nru lttoolbox-3.6.6/lttoolbox/tmx_compiler.h lttoolbox-3.7.1/lttoolbox/tmx_compiler.h --- lttoolbox-3.6.6/lttoolbox/tmx_compiler.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/tmx_compiler.h 2022-11-01 08:36:47.000000000 +0000 @@ -28,7 +28,6 @@ #include #include -using namespace std; /** * A compiler of dictionaries to letter transducers @@ -95,21 +94,21 @@ * @param origin left part * @param meta right part */ - void insertTU(vector const &origin, vector const &meta); + void insertTU(std::vector const &origin, std::vector const &meta); /** * Gets an attribute value with their name and the current context * @param name the name of the attribute * @return the value of the attribute */ - UString attrib(UString const &name); + UString attrib(UStringView name); /** * Skip all document #text nodes before "elem" * @param name the name of the node * @param elem the name of the expected node */ - void skip(UString &name, UString const &elem); + void skip(UString &name, UStringView elem); /** * Skip all blank #text nodes before "name" @@ -121,7 +120,7 @@ * Force an element to be empty, and check for it * @param name the element */ - void requireEmptyError(UString const &name); + void requireEmptyError(UStringView name); /** * Force an attribute to be specified, amd check for it @@ -129,8 +128,7 @@ * @param attrname the name of the attribute * @param elemname the parent of the attribute */ - void requireAttribute(UString const &value, UString const &attrname, - UString const &elemname); + void requireAttribute(UStringView value, UStringView attrname, UStringView elemname); /** * True if all the elements in the current node are blanks @@ -139,15 +137,15 @@ bool allBlanks(); UString getTag(size_t const &val) const; - void trim(vector &v) const; - void align(vector &origin, vector &meta); - unsigned int numberLength(vector &v, unsigned 
int const position) const; - bool vectorcmp(vector const &orig, unsigned int const begin_orig, - vector const &meta, unsigned int const begin_meta, + void trim(std::vector &v) const; + void align(std::vector &origin, std::vector &meta); + unsigned int numberLength(std::vector &v, unsigned int const position) const; + bool vectorcmp(std::vector const &orig, unsigned int const begin_orig, + std::vector const &meta, unsigned int const begin_meta, unsigned const int length) const; - void split(vector const &v, vector > &sv, int const symbol) const; - void align_blanks(vector &o, vector &m); - vector join(vector > const &v, int const s) const; + void split(std::vector const &v, std::vector > &sv, int const symbol) const; + void align_blanks(std::vector &o, std::vector &m); + std::vector join(std::vector > const &v, int const s) const; public: @@ -155,21 +153,21 @@ * Constants to represent the element and the attributes of * translation memories in TMX format */ - static UString const TMX_COMPILER_TMX_ELEM; - static UString const TMX_COMPILER_HEADER_ELEM; - static UString const TMX_COMPILER_BODY_ELEM; - static UString const TMX_COMPILER_TU_ELEM; - static UString const TMX_COMPILER_TUV_ELEM; - static UString const TMX_COMPILER_HI_ELEM; - static UString const TMX_COMPILER_PH_ELEM; - static UString const TMX_COMPILER_XMLLANG_ATTR; - static UString const TMX_COMPILER_LANG_ATTR; - static UString const TMX_COMPILER_SEG_ELEM; - static UString const TMX_COMPILER_PROP_ELEM; - static UString const TMX_COMPILER_TEXT_NODE; - static UString const TMX_COMPILER_COMMENT_NODE; - static UString const TMX_COMPILER_NUMBER_TAG; - static UString const TMX_COMPILER_BLANK_TAG; + static constexpr UStringView TMX_COMPILER_TMX_ELEM = u"tmx"; + static constexpr UStringView TMX_COMPILER_HEADER_ELEM = u"header"; + static constexpr UStringView TMX_COMPILER_BODY_ELEM = u"body"; + static constexpr UStringView TMX_COMPILER_TU_ELEM = u"tu"; + static constexpr UStringView TMX_COMPILER_TUV_ELEM = u"tuv"; + 
static constexpr UStringView TMX_COMPILER_HI_ELEM = u"hi"; + static constexpr UStringView TMX_COMPILER_PH_ELEM = u"ph"; + static constexpr UStringView TMX_COMPILER_XMLLANG_ATTR = u"xml:lang"; + static constexpr UStringView TMX_COMPILER_LANG_ATTR = u"lang"; + static constexpr UStringView TMX_COMPILER_SEG_ELEM = u"seg"; + static constexpr UStringView TMX_COMPILER_PROP_ELEM = u"prop"; + static constexpr UStringView TMX_COMPILER_TEXT_NODE = u"#text"; + static constexpr UStringView TMX_COMPILER_COMMENT_NODE = u"#comment"; + static constexpr UStringView TMX_COMPILER_NUMBER_TAG = u""; + static constexpr UStringView TMX_COMPILER_BLANK_TAG = u""; /** @@ -185,7 +183,7 @@ /** * Compile dictionary to letter transducers */ - void parse(string const &file, UString const &lo, UString const &lm); + void parse(std::string const &file, UStringView lo, UStringView lm); /** * Write the result of compilation @@ -197,13 +195,13 @@ * Set origin language inner code * @param code the code of the origin language into the TMX file being compiled */ - void setOriginLanguageCode(UString const &code); + void setOriginLanguageCode(UStringView code); /** * Set meta language inner code * @param code the code of the meta language into the TMX file being compiled */ - void setMetaLanguageCode(UString const &code); + void setMetaLanguageCode(UStringView code); }; diff -Nru lttoolbox-3.6.6/lttoolbox/transducer.cc lttoolbox-3.7.1/lttoolbox/transducer.cc --- lttoolbox-3.6.6/lttoolbox/transducer.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/transducer.cc 2022-11-01 08:36:47.000000000 +0000 @@ -26,19 +26,6 @@ #include #include -UString const Transducer::HFST_EPSILON_SYMBOL_SHORT = "@0@"_u; -UString const Transducer::HFST_EPSILON_SYMBOL_LONG = "@_EPSILON_SYMBOL_@"_u; -UString const Transducer::LTTB_EPSILON_SYMBOL = u"ε"_u; -UString const Transducer::HFST_SPACE_SYMBOL = "@_SPACE_@"_u; -UString const Transducer::HFST_TAB_SYMBOL = "@_TAB_@"_u; -UString const Transducer::GROUP_SYMBOL = 
"#"_u; -UString const Transducer::JOIN_SYMBOL = "+"_u; -UString const Transducer::ANY_TAG_SYMBOL = ""_u; -UString const Transducer::ANY_CHAR_SYMBOL = ""_u; -UString const Transducer::LSX_BOUNDARY_SYMBOL = "<$>"_u; -UString const Transducer::COMPOUND_ONLY_L_SYMBOL = ""_u; -UString const Transducer::COMPOUND_R_SYMBOL = ""_u; - int Transducer::newState() @@ -94,7 +81,7 @@ { // new state int state = newState(); - transitions[source].insert(make_pair(tag, make_pair(state, weight))); + transitions[source].insert({tag, std::make_pair(state, weight)}); return state; } else if(transitions[source].count(tag) == 2) @@ -125,7 +112,7 @@ Transducer::insertNewSingleTransduction(int const tag, int const source, double const weight) { int state = newState(); - transitions[source].insert(make_pair(tag, make_pair(state, weight))); + transitions[source].insert({tag, std::make_pair(state, weight)}); return state; } @@ -133,7 +120,7 @@ Transducer::insertTransducer(int const source, Transducer &t, int const epsilon_tag) { - map relation; + std::map relation; if(t.transitions.empty()) { @@ -151,14 +138,11 @@ { for(auto& it2 : it.second) { - transitions[relation[it.first]].insert(make_pair(it2.first, - make_pair(relation[it2.second.first], - it2.second.second))); + transitions[relation[it.first]].insert({it2.first, std::make_pair(relation[it2.second.first], it2.second.second)}); } } - transitions[source].insert(make_pair(epsilon_tag, - make_pair(relation[t.initial], default_weight))); + transitions[source].insert({epsilon_tag, std::make_pair(relation[t.initial], default_weight)}); return relation[t.finals.begin()->first]; } @@ -181,12 +165,12 @@ } } // end of new code - transitions[source].insert(make_pair(tag, make_pair(target, weight))); + transitions[source].insert({tag, std::make_pair(target, weight)}); } else { - cerr << "Error: Trying to link nonexistent states (" << source; - cerr << ", " << target << ", " << tag << ")" << endl; + std::cerr << "Error: Trying to link nonexistent 
states (" << source; + std::cerr << ", " << target << ", " << tag << ")" << std::endl; exit(EXIT_FAILURE); } } @@ -204,12 +188,12 @@ int initial_copy = getInitial(); if(state == initial_copy) { - cerr << "Setting initial state to final" << endl; + std::cerr << "Setting initial state to final" << std::endl; } */ if(value) { - finals.insert(make_pair(state, weight)); + finals.insert({state, weight}); } else { @@ -223,16 +207,16 @@ return initial; } -set +std::set Transducer::closure(int const state, int const epsilon_tag) const { - return closure(state, set({epsilon_tag})); + return closure(state, std::set({epsilon_tag})); } -set -Transducer::closure(int const state, set const &epsilon_tags) const +std::set +Transducer::closure(int const state, std::set const &epsilon_tags) const { - set nonvisited, result; + std::set nonvisited, result; nonvisited.insert(state); result.insert(state); @@ -249,7 +233,7 @@ } range.first++; } - } catch (out_of_range const &e) { + } catch (std::out_of_range const &e) { // No transition from any of the epsilon_tags – this is fine } } @@ -259,6 +243,40 @@ return result; } +std::vector> +Transducer::closure_all(const int epsilon_tag) const +{ + std::vector> ret; + ret.reserve(transitions.size()); + std::vector> reversed; + reversed.resize(transitions.size()); + sorted_vector todo; + for (size_t i = 0; i < transitions.size(); i++) { + sorted_vector c; + c.insert(i); + auto range = transitions.at(i).equal_range(epsilon_tag); + for (; range.first != range.second; range.first++) { + c.insert(range.first->second.first); + reversed[range.first->second.first].push_back(i); + } + if (c.size() > 1) todo.insert(i); + ret.push_back(c); + } + while (!todo.empty()) { + sorted_vector new_todo; + for (auto& it : todo) { + sorted_vector temp = ret[it]; + for (auto& it2 : temp) { + ret[it].insert(ret[it2].begin(), ret[it2].end()); + } + if (ret[it].size() > temp.size()) + new_todo.insert(reversed[it].begin(), reversed[it].end()); + } + todo.swap(new_todo); 
+ } + return ret; +} + void Transducer::joinFinals(int const epsilon_tag) { @@ -272,69 +290,46 @@ } finals.clear(); - finals.insert(make_pair(state, default_weight)); + finals.insert({state, default_weight}); } else if(finals.size() == 0) { - cerr << "Error: empty set of final states" < const &s1, set const &s2) -{ - - if(s1.size() < s2.size()) - { - for(auto& it : s1) - { - if(s2.count(it)) - { - return false; - } - } - } - else - { - for(auto& it : s2) - { - if(s1.count(it)) - { - return false; - } - } - } - - return true; -} - void Transducer::determinize(int const epsilon_tag) { - vector > R(2); - map > Q_prime; - map, int> Q_prime_inv; - - map > > transitions_prime; + std::vector> R(2); + std::vector> Q_prime; + std::map, int> Q_prime_inv; + + std::map > > transitions_prime; + + // We're almost certainly going to need the closure of (nearly) every + // state, and we're often going to need the closure several times, + // so it's faster to precompute. + std::vector> all_closures = closure_all(epsilon_tag); unsigned int size_Q_prime = 0; - Q_prime[0] = closure(initial, epsilon_tag); + Q_prime.push_back(all_closures[initial]); Q_prime_inv[Q_prime[0]] = 0; R[0].insert(0); int initial_prime = 0; - map finals_prime; + std::map finals_prime; if(isFinal(initial)) { - finals_prime.insert(make_pair(0, default_weight)); + finals_prime.insert({0, default_weight}); } int t = 0; - set finals_state; + sorted_vector finals_state; for(auto& it : finals) { finals_state.insert(it.first); } @@ -346,17 +341,16 @@ for(auto& it : R[t]) { - if(!isEmptyIntersection(Q_prime[it], finals_state)) - { + if (Q_prime[it].intersects(finals_state)) { double w = default_weight; auto it3 = finals.find(it); if (it3 != finals.end()) { w = it3->second; } - finals_prime.insert(make_pair(it, w)); + finals_prime.insert({it, w}); } - map, set > mymap; + std::map, sorted_vector > mymap; for(auto& it2 : Q_prime[it]) { @@ -364,29 +358,28 @@ { if(it3.first != epsilon_tag) { - auto c = 
closure(it3.second.first, epsilon_tag); - - for(auto& it4 : c) - { - mymap[make_pair(it3.first, it3.second.second)].insert(it4); - } + auto& it4 = all_closures[it3.second.first]; + mymap[std::make_pair(it3.first, it3.second.second)].insert(it4.begin(), it4.end()); } } } // adding new states + auto& state_prime = transitions_prime[it]; for(auto& it2 : mymap) { - if(Q_prime_inv.find(it2.second) == Q_prime_inv.end()) - { - int tag = Q_prime.size(); - Q_prime[tag] = it2.second; + int tag; + auto loc = Q_prime_inv.find(it2.second); + if(loc == Q_prime_inv.end()) { + tag = Q_prime.size(); + Q_prime.push_back(it2.second); Q_prime_inv[it2.second] = tag; - R[(t+1)%2].insert(Q_prime_inv[it2.second]); + R[(t+1)%2].insert(tag); transitions_prime[tag].clear(); + } else { + tag = loc->second; } - transitions_prime[it].insert(make_pair(it2.first.first, - make_pair(Q_prime_inv[it2.second], it2.first.second))); + state_prime.insert({it2.first.first, std::make_pair(tag, it2.first.second)}); } } @@ -420,7 +413,7 @@ state = newState(); linkStates(finals.begin()->first, state, epsilon_tag, finals.begin()->second); finals.clear(); - finals.insert(make_pair(state, default_weight)); + finals.insert({state, default_weight}); linkStates(initial, state, epsilon_tag, default_weight); } @@ -435,7 +428,7 @@ state = newState(); linkStates(finals.begin()->first, state, epsilon_tag, finals.begin()->second); finals.clear(); - finals.insert(make_pair(state, default_weight)); + finals.insert({state, default_weight}); linkStates(state, initial, epsilon_tag, default_weight); } @@ -466,16 +459,16 @@ return finals.size() == 0; } -map > >& +std::map > >& Transducer::getTransitions() { return transitions; } -map +std::map Transducer::getFinals() const { - return map(finals); + return std::map(finals); } int @@ -514,7 +507,7 @@ } // Determine whether any weights are non-default (0) -bool Transducer::weighted() { +bool Transducer::weighted() const { for (auto& it : finals) { if (it.second != default_weight) 
{ return true; @@ -622,7 +615,7 @@ { base_weight = Compression::long_multibyte_read(input); } - new_t.finals.insert(make_pair(base, base_weight)); + new_t.finals.insert({base, base_weight}); } base = Compression::multibyte_read(input); @@ -645,7 +638,7 @@ { new_t.transitions[state].clear(); // force create } - new_t.transitions[current_state].insert(make_pair(tagbase, make_pair(state, base_weight))); + new_t.transitions[current_state].insert({tagbase, std::make_pair(state, base_weight)}); } number_of_states--; current_state++; @@ -658,16 +651,16 @@ Transducer::serialise(std::ostream &serialised) const { Serialiser::serialise(initial, serialised); - Serialiser >::serialise(finals, serialised); - Serialiser > > >::serialise(transitions, serialised); + Serialiser >::serialise(finals, serialised); + Serialiser > > >::serialise(transitions, serialised); } void Transducer::deserialise(std::istream &serialised) { initial = Deserialiser::deserialise(serialised); - finals = Deserialiser >::deserialise(serialised); - transitions = Deserialiser > > >::deserialise(serialised); + finals = Deserialiser >::deserialise(serialised); + transitions = Deserialiser > > >::deserialise(serialised); } void @@ -688,9 +681,9 @@ { joinFinals(epsilon_tag); - map > > tmp_transitions; + std::map > > tmp_transitions; - for(map > >::reverse_iterator it = transitions.rbegin(); it != transitions.rend(); it++) + for(std::map > >::reverse_iterator it = transitions.rbegin(); it != transitions.rend(); it++) { auto aux = it->second; it->second.clear(); @@ -698,11 +691,11 @@ { if(it2.second.first >= it->first) { - transitions[it2.second.first].insert(make_pair(it2.first, make_pair(it->first, it2.second.second))); + transitions[it2.second.first].insert({it2.first, std::make_pair(it->first, it2.second.second)}); } else { - tmp_transitions[it2.second.first].insert(make_pair(it2.first, make_pair(it->first, it2.second.second))); + tmp_transitions[it2.second.first].insert({it2.first, std::make_pair(it->first, 
it2.second.second)}); } } if(tmp_transitions.find(it->first) != tmp_transitions.end()) @@ -712,20 +705,20 @@ } } - for(map > >::reverse_iterator it = tmp_transitions.rbegin(), + for(std::map > >::reverse_iterator it = tmp_transitions.rbegin(), limit = tmp_transitions.rend(); it != limit; it++) { for(auto& it2 : it->second) { - transitions[it->first].insert(make_pair(it2.first, it2.second)); + transitions[it->first].insert({it2.first, it2.second}); } } int tmp = initial; initial = finals.begin()->first; finals.clear(); - finals.insert(make_pair(tmp, default_weight)); + finals.insert({tmp, default_weight}); } void @@ -742,11 +735,11 @@ symbol = LTTB_EPSILON_SYMBOL; } } - else if(hfst && symbol == " "_u) + else if(hfst && symbol == u" "_uv) { symbol = HFST_SPACE_SYMBOL; } - else if(hfst && symbol == "\t"_u) + else if(hfst && symbol == u"\t"_uv) { symbol = HFST_TAB_SYMBOL; } @@ -779,16 +772,10 @@ } } -void -Transducer::show(Alphabet const &alphabet, UFILE *output, int const epsilon_tag) const -{ - return show(alphabet, output, epsilon_tag, false); -} - int Transducer::getStateSize(int const state) { - set states; + std::set states; auto myclosure1 = closure(state, 0); states.insert(myclosure1.begin(), myclosure1.end()); int num_transitions = 0; @@ -802,22 +789,22 @@ } bool -Transducer::recognise(UString pattern, Alphabet &a, FILE *err) +Transducer::recognise(UStringView pattern, Alphabet &a, FILE *err) const { bool accepted = false; - set states; + std::set states; auto myclosure1 = closure(getInitial(), 0); states.insert(myclosure1.begin(), myclosure1.end()); // For each of the characters in the input string for(auto& it : pattern) { - set new_state; //Transducer::closure(int const state, int const epsilon_tag) + std::set new_state; //Transducer::closure(int const state, int const epsilon_tag) // For each of the current alive states //fprintf(err, "step: %ls %lc (%d)\n", pattern.c_str(), *it, sym); for(auto& it2 : states) { - auto& p = transitions[it2]; + auto& p = 
transitions.at(it2); // For each of the transitions in the state for(auto& it3 : p) @@ -834,9 +821,9 @@ if(l.find(it) != UString::npos) { auto myclosure = closure(it3.second.first, 0); - //cerr << "Before closure alives: " < const &loopback_symbols, int const epsilon_tag) +Transducer::appendDotStar(std::set const &loopback_symbols, int const epsilon_tag) { Transducer prefix_transducer(*this); @@ -889,23 +876,23 @@ Transducer new_t; Transducer lemq; - map states_this_new; - states_this_new.insert(make_pair(start, new_t.initial)); - map states_this_lemq; - states_this_new.insert(make_pair(start, lemq.initial)); + std::map states_this_new; + states_this_new.insert({start, new_t.initial}); + std::map states_this_lemq; + states_this_new.insert({start, lemq.initial}); typedef std::pair SearchState; // Each searchstate in the stack is a transition in this FST, along // with the last reached state of the lemq - std::list todo; + std::vector todo; std::set seen; std::set finally; SearchState current; - todo.push_front(make_pair(start,start)); + todo.push_back({start,start}); - while(todo.size() > 0) { - current = todo.front(); - todo.pop_front(); + while(!todo.empty()) { + current = todo.back(); + todo.pop_back(); seen.insert(current); int this_src = current.first, this_lemqlast = current.second; @@ -927,32 +914,32 @@ { // We've reached the first tag new_src = states_this_new[start]; - lemq.finals.insert(make_pair(this_lemqlast, default_weight)); + lemq.finals.insert({this_lemqlast, default_weight}); } else { if(states_this_new.find(this_src) == states_this_new.end()) { - states_this_new.insert(make_pair(this_src, new_t.newState())); + states_this_new.insert({this_src, new_t.newState()}); } new_src = states_this_new[this_src]; } if(states_this_new.find(this_trg) == states_this_new.end()) { - states_this_new.insert(make_pair(this_trg, new_t.newState())); + states_this_new.insert({this_trg, new_t.newState()}); } int new_trg = states_this_new[this_trg]; 
new_t.linkStates(new_src, new_trg, label, this_wt); if(isFinal(this_src)) { - finally.insert(make_pair(this_src, this_lemqlast)); + finally.insert({this_src, this_lemqlast}); } - if(seen.find(make_pair(this_trg, this_lemqlast)) == seen.end()) + if(seen.find(std::make_pair(this_trg, this_lemqlast)) == seen.end()) { - todo.push_front(make_pair(this_trg, this_lemqlast)); + todo.push_back({this_trg, this_lemqlast}); } } else @@ -961,13 +948,13 @@ int lemq_src = states_this_lemq[this_src]; if(states_this_lemq.find(this_trg) == states_this_lemq.end()) { - states_this_lemq.insert(make_pair(this_trg, lemq.newState())); + states_this_lemq.insert({this_trg, lemq.newState()}); } int lemq_trg = states_this_lemq[this_trg]; lemq.linkStates(lemq_src, lemq_trg, label, this_wt); - if(seen.find(make_pair(this_trg, this_trg)) == seen.end()) + if(seen.find({this_trg, this_trg}) == seen.end()) { - todo.push_front(make_pair(this_trg, this_trg)); + todo.push_back({this_trg, this_trg}); } } } // end for transitions @@ -980,14 +967,14 @@ // copy lemq, letting this_lemqlast be the only final state in newlemq Transducer newlemq = Transducer(lemq); newlemq.finals.clear(); - newlemq.finals.insert(make_pair(states_this_lemq[this_lemqlast], default_weight)); + newlemq.finals.insert({states_this_lemq[this_lemqlast], default_weight}); newlemq.minimize(); int group_start = new_t.newState(); new_t.linkStates(states_this_new[last_tag], group_start, group_label, default_weight); // append newlemq into the group after last_tag: - new_t.finals.insert(make_pair(new_t.insertTransducer(group_start, newlemq), default_weight)); + new_t.finals.insert({new_t.insertTransducer(group_start, newlemq), default_weight}); } return new_t; @@ -1000,16 +987,15 @@ Transducer new_t; typedef int SearchState; std::set seen; - std::list todo; - todo.push_front(initial); + std::vector todo; + todo.push_back(initial); - map states_this_new; - states_this_new.insert(make_pair(initial, new_t.initial)); + std::map 
states_this_new; + states_this_new.insert({initial, new_t.initial}); - while(todo.size() > 0) - { - int this_src = todo.front(); - todo.pop_front(); + while(!todo.empty()) { + int this_src = todo.back(); + todo.pop_back(); seen.insert(this_src); for(auto& trans_it : transitions[this_src]) { @@ -1022,21 +1008,19 @@ if(left == GROUP_SYMBOL) { Transducer tagsFirst = copyWithTagsFirst(this_trg, label, alphabet, epsilon_tag); - new_t.finals.insert(make_pair( - new_t.insertTransducer(new_src, tagsFirst, epsilon_tag), default_weight - )); + new_t.finals.insert({new_t.insertTransducer(new_src, tagsFirst, epsilon_tag), default_weight}); } else { if(states_this_new.find(this_trg) == states_this_new.end()) { - states_this_new.insert(make_pair(this_trg, new_t.newState())); + states_this_new.insert({this_trg, new_t.newState()}); } int new_trg = states_this_new[this_trg]; new_t.linkStates(new_src, new_trg, label, default_weight); if(seen.find(this_trg) == seen.end()) { - todo.push_front(this_trg); + todo.push_back(this_trg); } } } @@ -1044,7 +1028,7 @@ for(auto& it : finals) { - new_t.finals.insert(make_pair(states_this_new[it.first], it.second)); + new_t.finals.insert({states_this_new[it.first], it.second}); } return new_t; @@ -1052,7 +1036,7 @@ Transducer -Transducer::intersect(Transducer &trimmer, +Transducer::trim(Transducer &trimmer, Alphabet const &this_a, Alphabet const &trimmer_a, int const epsilon_tag) @@ -1075,18 +1059,34 @@ Transducer trimmed; std::map states_this_trimmed; - std::list todo; + std::vector todo; std::set seen; SearchState current; - SearchState next = make_pair(initial, make_pair(trimmer.initial, - trimmer.initial)); - todo.push_front(next); - states_this_trimmed.insert(make_pair(next, trimmed.initial)); - - while(todo.size() > 0) - { - current = todo.front(); - todo.pop_front(); + SearchState next{initial, {trimmer.initial, trimmer.initial}}; + todo.push_back(next); + states_this_trimmed.insert({next, trimmed.initial}); + + sorted_vector sym_wb, 
sym_lsx, sym_cmp_or_eps; + { + if (this_a.isSymbolDefined(LSX_BOUNDARY_SYMBOL)) + sym_lsx.insert(this_a(LSX_BOUNDARY_SYMBOL)); + if (this_a.isSymbolDefined(LSX_BOUNDARY_SPACE_SYMBOL)) + sym_lsx.insert(this_a(LSX_BOUNDARY_SPACE_SYMBOL)); + if (this_a.isSymbolDefined(LSX_BOUNDARY_NO_SPACE_SYMBOL)) + sym_lsx.insert(this_a(LSX_BOUNDARY_NO_SPACE_SYMBOL)); + sym_wb = sym_lsx; + sym_wb.insert(static_cast('+')); // JOIN_SYMBOL + + if (this_a.isSymbolDefined(COMPOUND_ONLY_L_SYMBOL)) + sym_cmp_or_eps.insert(this_a(COMPOUND_ONLY_L_SYMBOL)); + if (this_a.isSymbolDefined(COMPOUND_R_SYMBOL)) + sym_cmp_or_eps.insert(this_a(COMPOUND_R_SYMBOL)); + sym_cmp_or_eps.insert(0); // epsilon + } + + while(!todo.empty()) { + current = todo.back(); + todo.pop_back(); seen.insert(current); int this_src = current.first, trimmer_src = current.second.first, @@ -1094,31 +1094,30 @@ trimmer_preplus_next = trimmer_preplus; if(states_this_trimmed.find(current) == states_this_trimmed.end()) { - cerr <<"Error: couldn't find "<('+')); if(trimmer_preplus == trimmer_src) { // Keep the old preplus state if it was set; equal to current trimmer state means unset: trimmer_preplus_next = trimmer_src; // not _trg when join! 
} // Go to the start in trimmer, but record where we restarted from in case we later see a #: - next = make_pair(this_trg, make_pair(trimmer.initial, trimmer_preplus_next)); + next = std::make_pair(this_trg, std::make_pair(trimmer.initial, trimmer_preplus_next)); if(seen.find(next) == seen.end()) { - todo.push_front(next); + todo.push_back(next); } if(states_this_trimmed.find(next) == states_this_trimmed.end()) { - states_this_trimmed.insert(make_pair(next, trimmed.newState())); + states_this_trimmed.insert({next, trimmed.newState()}); } int trimmed_trg = states_this_trimmed[next]; trimmed.linkStates(trimmed_src, // fromState trimmed_trg, // toState this_label, // symbol-pair, using this alphabet this_wt); //weight of transduction - if(this_right == LSX_BOUNDARY_SYMBOL && isFinal(this_trg)) - { + if (sym_lsx.count(this_right) && isFinal(this_trg)) { trimmed.setFinal(trimmed_trg, default_weight); } } - else if ( this_right == COMPOUND_ONLY_L_SYMBOL - || this_right == COMPOUND_R_SYMBOL - || this_right.empty() ) - { + else if (sym_cmp_or_eps.count(this_right)) { + special = true; // Stay put in the trimmer FST int trimmer_trg = trimmer_src; @@ -1176,14 +1173,14 @@ trimmer_preplus_next = trimmer_trg; } - next = make_pair(this_trg, make_pair(trimmer_trg, trimmer_preplus_next)); + next = std::make_pair(this_trg, std::make_pair(trimmer_trg, trimmer_preplus_next)); if(seen.find(next) == seen.end()) { - todo.push_front(next); + todo.push_back(next); } if(states_this_trimmed.find(next) == states_this_trimmed.end()) { - states_this_trimmed.insert(make_pair(next, trimmed.newState())); + states_this_trimmed.insert({next, trimmed.newState()}); } int trimmed_trg = states_this_trimmed[next]; trimmed.linkStates(trimmed_src, // fromState @@ -1191,16 +1188,16 @@ this_label, // symbol-pair, using this alphabet this_wt); //weight of transduction } - else - { + + // if we're at a normal symbol or a + that might be part of a lemma + if (!special) { // Loop through non-epsilon arcs from 
the live state of trimmer // If we see a hash/group, we may have to rewind our trimmer state first: - if(this_right == GROUP_SYMBOL && trimmer_preplus != trimmer_src) + if(this_right == static_cast('#') && + trimmer_preplus != trimmer_src) { - states_this_trimmed.insert(make_pair(make_pair(this_src, make_pair(trimmer_preplus, - trimmer_preplus)), - trimmed_src)); + states_this_trimmed.insert({std::make_pair(this_src, std::make_pair(trimmer_preplus, trimmer_preplus)), trimmed_src}); trimmer_src = trimmer_preplus; } @@ -1208,26 +1205,23 @@ { int trimmer_label = trimmer_trans_it.first, trimmer_trg = trimmer_trans_it.second.first; - UString trimmer_left; - trimmer_a.getSymbol(trimmer_left, trimmer_a.decode(trimmer_label).first); + int32_t trimmer_left = trimmer_a.decode(trimmer_label).first; if(trimmer_preplus == trimmer_src) { // Keep the old preplus state if it was set; equal to current trimmer state means unset: trimmer_preplus_next = trimmer_trg; } - if(!trimmer_left.empty() && // we've already dealt with trimmer epsilons - (this_right == trimmer_left || - (this_right == ((trimmer_left[0] == '<') ? 
ANY_TAG_SYMBOL : ANY_CHAR_SYMBOL)))) - { - next = make_pair(this_trg, make_pair(trimmer_trg, trimmer_preplus_next)); + if (trimmer_left != 0 && // we've already dealt with trimmer epsilons + this_a.sameSymbol(this_right, trimmer_a, trimmer_left, true)) { + next = std::make_pair(this_trg, std::make_pair(trimmer_trg, trimmer_preplus_next)); if(seen.find(next) == seen.end()) { - todo.push_front(next); + todo.push_back(next); } if(states_this_trimmed.find(next) == states_this_trimmed.end()) { - states_this_trimmed.insert(make_pair(next, trimmed.newState())); + states_this_trimmed.insert(std::make_pair(next, trimmed.newState())); } int trimmed_trg = states_this_trimmed[next]; trimmed.linkStates(trimmed_src, // fromState @@ -1247,7 +1241,7 @@ int s_trimmed = it.second; if(isFinal(s_this) && trimmer.isFinal(s_trimmer)) { - trimmed.finals.insert(make_pair(s_trimmed, default_weight)); + trimmed.finals.insert(std::make_pair(s_trimmed, finals[s_this])); } } @@ -1260,8 +1254,8 @@ Transducer::updateAlphabet(Alphabet& old_alpha, Alphabet& new_alpha, bool has_pairs) { - set symbol_pairs; - set symbols; + std::set symbol_pairs; + std::set symbols; for (auto& it : transitions) { for (auto& it2 : it.second) { if (!has_pairs && it2.first < 0) { @@ -1279,7 +1273,7 @@ } } } - map symbol_update; + std::map symbol_update; for (auto& it : symbols) { UString s; old_alpha.getSymbol(s, it); @@ -1287,7 +1281,7 @@ symbol_update[it] = new_alpha(s); } if (has_pairs) { - map pair_update; + std::map pair_update; for (auto& it : symbol_pairs) { int32_t l1 = old_alpha.decode(it).first; int32_t r1 = old_alpha.decode(it).second; @@ -1297,7 +1291,7 @@ } symbol_update.swap(pair_update); } - map > > new_trans; + std::map > > new_trans; for (auto& it : transitions) { new_trans[it.first].clear(); for (auto& it2 : it.second) { @@ -1305,8 +1299,269 @@ if (symbol_update.find(s) != symbol_update.end()) { s = symbol_update[s]; } - new_trans[it.first].insert(make_pair(s, make_pair(it2.second.first, 
it2.second.second))); + new_trans[it.first].insert({s, std::make_pair(it2.second.first, it2.second.second)}); } } transitions.swap(new_trans); } + +void +Transducer::invert(Alphabet& alpha) +{ + std::map>> tmp_trans; + for (auto& it : transitions) { + std::multimap> tmp_state; + for (auto& it2 : it.second) { + auto pr = alpha.decode(it2.first); + int new_sym = alpha(pr.second, pr.first); + tmp_state.insert({new_sym, it2.second}); + } + tmp_trans.insert({it.first, tmp_state}); + } + transitions.swap(tmp_trans); +} + +void +Transducer::deleteSymbols(const sorted_vector& syms) +{ + for (auto& state : transitions) { + for (auto& sym : syms) { + state.second.erase(sym); + } + } +} + +void +Transducer::epsilonizeSymbols(const sorted_vector& syms) +{ + for (auto& state: transitions) { + for (auto& sym : syms) { + auto pr = state.second.equal_range(sym); + for (auto it = pr.first; it != pr.second; it++) { + state.second.insert({0, it->second}); + } + state.second.erase(sym); + } + } +} + +void +Transducer::applyACX(Alphabet& alpha, + const std::map>& acx) +{ + for (auto& state : transitions) { + std::vector>> to_insert; + for (auto& it : state.second) { + auto pr = alpha.decode(it.first); + auto loc = acx.find(pr.first); + if (loc != acx.end()) { + for (auto& sym : loc->second) { + to_insert.push_back({alpha(sym, pr.second), it.second}); + } + } + } + for (auto& it : to_insert) { + state.second.insert(it); + } + } +} + + +Transducer +Transducer::compose(Transducer const &g, + Alphabet &f_a, // this alphabet + Alphabet const &g_a, // alphabet of g + bool f_inverted, + bool g_anywhere, + int const epsilon_tag) +{ + /** + * g ∘ f = composed + * where f = this + * + * The basic algorithm is: + * + * Transducer gf; // composition + * queue = [ (f.init, g.init, gf.init) ] + * while(!queue.empty()): + * f_src, g_src, gf_src = queue.pop() + * for (f_input, f_output, f_trg) in state_f.transitions: + * for (g_input, g_output, g_trg) in state_g.transitions: + * if g_input == 
f_output: + * gf_trg = gf.add_state() + * gf.add_transition(gf_src, f_input, g_output, gf_trg) + * queue.add(f_trg, g_trg, gf_trg) + * gf_trg = gf.add_state() + * gf.add_transition(gf_src, f_input, f_output, gf_trg) + * queue.add(f_trg, g.init, gf_trg) + * + * with some added complications: + * + * 1. Since we can have loops, we need to keep track of which pairs + * (f_src, g_src) we've seen, so we don't keep re-adding them. And + * on seeing a previously seen pair, we need to know which state + * in gf was added from that pair. + * + * 2. We have to be able to skip epsilons on the input-side of g and + * the output-side of f. + * + * 3. When f_inverted, we swap f_left/f_right in matching. + * + * 4. If composing g_anywhere, we can add initials from g anywhere + * in gf, and finals in gf can be initials in g. + */ + joinFinals(epsilon_tag); + + // (f_state, g_state) + typedef std::pair SearchState; + + // State numbers will differ in fXg transducers and gf: + Transducer gf; + std::map states_f_g_gf; + + std::vector todo; + std::set seen; + SearchState current; + SearchState next{initial, g.initial}; + todo.push_back(next); + states_f_g_gf.insert({next, gf.initial}); + + while(!todo.empty()) { + current = todo.back(); + todo.pop_back(); + seen.insert(current); + int f_src = current.first, + g_src = current.second; + + if(states_f_g_gf.find(current) == states_f_g_gf.end()) { + std::cerr <<"Error: couldn't find "< g_leftright = g_a.decode(g_label); + int32_t g_left = g_leftright.first, + g_right = g_leftright.second; + + if(g_left == 0) + { + next = std::make_pair(f_src, g_trg); + if (seen.find(next) == seen.end()) { + todo.push_back(next); + } + if (states_f_g_gf.find(next) == states_f_g_gf.end()) { + states_f_g_gf.insert({next, gf.newState()}); + } + int gf_trg = states_f_g_gf[next]; + int32_t gf_label = composeLabel(f_a, g_a, 0, g_right, f_inverted); + gf.linkStates(gf_src, + gf_trg, + gf_label, + g_wt); + } + } + + // Loop through arcs from f_src; when the 
right-side of our arc + // matches left-side of an arc from g states, add that to todo: + for(auto& trans_it : transitions[f_src]) + { + int f_label = trans_it.first, + f_trg = trans_it.second.first; + double f_wt = trans_it.second.second; + std::pair f_leftright = f_a.decode(f_label); + int32_t f_input, f_output; + if(f_inverted) { + f_input = f_leftright.second; + f_output = f_leftright.first; + } + else { + f_input = f_leftright.first; + f_output = f_leftright.second; + } + + // Loop through non-epsilon arcs from the live state of g + for (auto &g_trans_it : g.transitions.at(g_src)) { + int g_label = g_trans_it.first, + g_trg = g_trans_it.second.first; + double g_wt = g_trans_it.second.second; + std::pair g_leftright = g_a.decode(g_label); + const int32_t g_left = g_leftright.first, + g_right = g_leftright.second; + + // output of f same as input of g? + // label becomes input of f and output of g + if (g_left != 0 && // we've already dealt with g epsilons + f_a.sameSymbol(f_output, g_a, g_left, true)) { + next = std::make_pair(f_trg, g_trg); + if (seen.find(next) == seen.end()) { + todo.push_back(next); + } + if (states_f_g_gf.find(next) == states_f_g_gf.end()) { + states_f_g_gf.insert({next, gf.newState()}); + } + int gf_trg = states_f_g_gf[next]; + int32_t gf_label = composeLabel(f_a, g_a, f_input, g_right, f_inverted); + gf.linkStates(gf_src, // fromState + gf_trg, // toState + gf_label, // symbol-pair, using f alphabet + f_wt + g_wt); // weight of transduction – composition adds weights! 
+ } + } + if(g_anywhere && g_src == g.initial) { + // If g_anywhere, all g entries are optional – we always add + // the transitions that were already in f: + next = std::make_pair(f_trg, g_src); + if (seen.find(next) == seen.end()) { + todo.push_back(next); + } + if (states_f_g_gf.find(next) == states_f_g_gf.end()) { + states_f_g_gf.insert({next, gf.newState()}); + } + int gf_trg = states_f_g_gf[next]; + gf.linkStates(gf_src, // fromState + gf_trg, // toState + f_label, // symbol-pair, using f alphabet + f_wt); // weight of transduction + } + // If f has an epsilon, also add a transition not to g.initial but g_src: + if(f_output == 0) { // will be the left if f_inverted + next = std::make_pair(f_trg, g_src); + if (seen.find(next) == seen.end()) { + todo.push_back(next); + } + if (states_f_g_gf.find(next) == states_f_g_gf.end()) { + states_f_g_gf.insert({next, gf.newState()}); + } + int gf_trg = states_f_g_gf[next]; + gf.linkStates(gf_src, // fromState + gf_trg, // toState + f_label, // symbol-pair, using f alphabet + f_wt); // weight of transduction + } + } // end loop arcs from f_src + } // end while todo + + for(auto& it : states_f_g_gf) + { + int s_f = it.first.first; + int s_g = it.first.second; + int s_gf = it.second; + if(isFinal(s_f) && (g.isFinal(s_g) + // if we're in anywhere mode, every state will be paired with g.initial if it's not paired with something in the middle of g + || (g_anywhere && g.initial == s_g))) + { + double wt_gf = finals[s_f] + (g.isFinal(s_g) ? g.finals.at(s_g) + : default_weight); + gf.finals.insert({s_gf, wt_gf}); + } + } + + // We do not minimize here, in order to let lt_compose print a warning + // (instead of exiting the whole program) if no finals. 
+ return gf; +} diff -Nru lttoolbox-3.6.6/lttoolbox/transducer.h lttoolbox-3.7.1/lttoolbox/transducer.h --- lttoolbox-3.6.6/lttoolbox/transducer.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/transducer.h 2022-11-01 08:36:47.000000000 +0000 @@ -22,11 +22,12 @@ #include #include +#include -using namespace std; /** - * Default value of weight + * Default value of weight. + * Used as the identity element by various operations. */ constexpr double default_weight = 0; @@ -49,13 +50,13 @@ * Final state set mapped to its weight walues * Schema: (state, weight) */ - map finals; + std::map finals; /** * Transitions of the transducer * Schema: (source state, tag, target state, weight) */ - map > > transitions; + std::map > > transitions; /** * New state creator @@ -64,14 +65,6 @@ int newState(); /** - * Test if the intersection of two sets is empty - * @param s1 first set - * @param s2 second set - * @return true if the intersection is empty - */ - static bool isEmptyIntersection(set const &s1, set const &s2); - - /** * Copy function * @param t the transducer to be copied */ @@ -94,18 +87,20 @@ /** * String constants */ - static UString const HFST_EPSILON_SYMBOL_SHORT; - static UString const HFST_EPSILON_SYMBOL_LONG; - static UString const LTTB_EPSILON_SYMBOL; - static UString const HFST_SPACE_SYMBOL; - static UString const HFST_TAB_SYMBOL; - static UString const GROUP_SYMBOL; - static UString const JOIN_SYMBOL; - static UString const ANY_TAG_SYMBOL; - static UString const ANY_CHAR_SYMBOL; - static UString const LSX_BOUNDARY_SYMBOL; - static UString const COMPOUND_ONLY_L_SYMBOL; - static UString const COMPOUND_R_SYMBOL; + static constexpr UStringView HFST_EPSILON_SYMBOL_SHORT = u"@0@"; + static constexpr UStringView HFST_EPSILON_SYMBOL_LONG = u"@_EPSILON_SYMBOL_@"; + static constexpr UStringView LTTB_EPSILON_SYMBOL = u"ε"; + static constexpr UStringView HFST_SPACE_SYMBOL = u"@_SPACE_@"; + static constexpr UStringView HFST_TAB_SYMBOL = u"@_TAB_@"; + 
static constexpr UStringView GROUP_SYMBOL = u"#"; + static constexpr UStringView JOIN_SYMBOL = u"+"; + static constexpr UStringView ANY_TAG_SYMBOL = u""; + static constexpr UStringView ANY_CHAR_SYMBOL = u""; + static constexpr UStringView LSX_BOUNDARY_SYMBOL = u"<$>"; + static constexpr UStringView LSX_BOUNDARY_SPACE_SYMBOL = u"<$_>"; + static constexpr UStringView LSX_BOUNDARY_NO_SPACE_SYMBOL= u"<$->"; + static constexpr UStringView COMPOUND_ONLY_L_SYMBOL = u""; + static constexpr UStringView COMPOUND_R_SYMBOL = u""; /** * Constructor @@ -134,7 +129,7 @@ * Determine whether any weight is non-default * @return bool true or false */ - bool weighted(); + bool weighted() const; /** * Insertion of a single transduction, creating a new target state @@ -144,7 +139,7 @@ * @param weight the weight value for the new transduction * @return the target state */ - int insertSingleTransduction(int const tag, int const source, double const weight = 0.0000); + int insertSingleTransduction(int tag, int source, double weight = 0.0000); /** * Insertion of a single transduction, forcing create a new target @@ -154,7 +149,7 @@ * @param weight the weight value for the new transduction * @return the target state */ - int insertNewSingleTransduction(int const tag, int const source, double const weight = 0.0000); + int insertNewSingleTransduction(int tag, int source, double weight = 0.0000); /** * Insertion of a transducer in a given source state, unifying their @@ -164,8 +159,8 @@ * @param epsilon_tag the epsilon tag * @return the new target state */ - int insertTransducer(int const source, Transducer &t, - int const epsilon_tag = 0); + int insertTransducer(int source, Transducer &t, + int epsilon_tag = 0); /** * Link two existing states by a transduction @@ -174,21 +169,21 @@ * @param tag the tag of the transduction * @param weight the weight value for the new transduction */ - void linkStates(int const source, int const target, int const tag, double const weight = 0.0000); + void 
linkStates(int source, int target, int tag, double weight = 0.0000); /** * Test if the state is a final state * @param state the state * @return true if is a final state */ - bool isFinal(int const state) const; + bool isFinal(int state) const; /** * Test if a pattern is recognised by the FST * @param a widestring of the pattern to be recognised * @return true if the pattern is recognised by the transducer */ - bool recognise(UString pattern, Alphabet &a, FILE *err = stderr); + bool recognise(UStringView pattern, Alphabet &a, FILE *err = stderr) const; /** * Set the state as a final or not, yes by default @@ -196,7 +191,7 @@ * @param weight the weight value for the final state * @param value if true, the state is set as final state */ - void setFinal(int const state, double const weight = 0.0000, bool value = true); + void setFinal(int state, double weight = 0.0000, bool value = true); /** * Returns the initial state of a transducer @@ -210,7 +205,7 @@ * @param epsilon_tag the tag to take as epsilon * @return the epsilon-connected states */ - set closure(int const state, int const epsilon_tag = 0) const; + std::set closure(int state, int epsilon_tag = 0) const; /** * Returns the epsilon closure of a given state @@ -218,50 +213,51 @@ * @param epsilon_tags the tags to treat as epsilon * @return the epsilon-connected states */ - set closure(int const state, set const &epsilon_tags) const; + std::set closure(int state, std::set const &epsilon_tags) const; + + std::vector> closure_all(int epsilon_tag) const; /** * Join all finals in one using epsilon transductions * @param epsilon_tag the tag to take as epsilon */ - void joinFinals(int const epsilon_tag = 0); + void joinFinals(int epsilon_tag = 0); /** * Return a copy of the final states */ - map getFinals() const; + std::map getFinals() const; /** * Return reference to the transitions */ - map > >& getTransitions(); + std::map > >& getTransitions(); /** * Reverse all the transductions of a transducer * @param 
epsilon_tag the tag to take as epsilon */ - void reverse(int const epsilon_tag = 0); + void reverse(int epsilon_tag = 0); /** * Print all the transductions of a transducer in ATT format - * @param hfst if true, use HFST-compatible escape characters * @param epsilon_tag the tag to take as epsilon + * @param hfst if true, use HFST-compatible escape characters */ - void show(Alphabet const &a, UFILE *output, int const epsilon_tag = 0, bool hfst = false) const; - void show(Alphabet const &a, UFILE *output, int const epsilon_tag = 0) const; + void show(Alphabet const &a, UFILE *output, int epsilon_tag = 0, bool hfst = false) const; /** * Determinize the transducer * @param epsilon_tag the tag to take as epsilon */ - void determinize(int const epsilon_tag = 0); + void determinize(int epsilon_tag = 0); /** * Minimize = reverse + determinize + reverse + determinize * @param epsilon_tag the tag to take as epsilon */ - void minimize(int const epsilon_tag = 0); + void minimize(int epsilon_tag = 0); /** @@ -269,20 +265,20 @@ * empty transductions) * @param epsilon_tag the tag to take as epsilon */ - void optional(int const epsilon_tag = 0); + void optional(int epsilon_tag = 0); /** * Make a transducer cyclic (link final states with initial state with * empty transductions) * @param epsilon_tag the tag to take as epsilon */ - void oneOrMore(int const epsilon_tag = 0); + void oneOrMore(int epsilon_tag = 0); /** * zeroOrMore = oneOrMore + optional * @param epsilon_tag the tag to take as epsilon */ - void zeroOrMore(int const epsilon_tag = 0); + void zeroOrMore(int epsilon_tag = 0); /** * Clear transducer @@ -318,27 +314,27 @@ * @param state the state to check * @return true if the state is empty */ - bool isEmpty(int const state) const; + bool isEmpty(int state) const; /** * Returns the number of transitions from a given state * @return the number of transitions */ - int getStateSize(int const state); + int getStateSize(int state); /** * Write method * @param output the stream to 
write to * @param decalage offset to sum to the tags */ - void write(FILE *output, int const decalage = 0); + void write(FILE *output, int decalage = 0); /** * Read method * @param input the stream to read from * @param decalage offset to sum to the tags */ - void read(FILE *input, int const decalage = 0); + void read(FILE *input, int decalage = 0); void serialise(std::ostream &serialised) const; void deserialise(std::istream &serialised); @@ -352,7 +348,7 @@ */ void unionWith(Alphabet &my_a, Transducer &t, - int const epsilon_tag = 0); + int epsilon_tag = 0); /** * Converts this class into a "prefix transducer", ie. for any final @@ -367,8 +363,8 @@ * @param epsilon_tag the tag to take as epsilon * @return the prefix transducer */ - Transducer appendDotStar(set const &loopback_symbols, - int const epsilon_tag = 0); + Transducer appendDotStar(std::set const &loopback_symbols, + int epsilon_tag = 0); /** @@ -381,7 +377,7 @@ * @return the prefix transducer */ Transducer moveLemqsLast(Alphabet const &alphabet, - int const epsilon_tag = 0); + int epsilon_tag = 0); /** * Helper for moveLemqsLast. Starting from a certain state, make all * the tags go before the non-tags, so if " bar" is a @@ -397,24 +393,81 @@ Transducer copyWithTagsFirst(int start, int group_label, Alphabet const &alphabet, - int const epsilon_tag = 0); + int epsilon_tag = 0); /** - * Intersects two finite-state transducers + * Intersect with a bidix transducer, retaining only those analyses + * which would pass through the bidix, respecting multiwords. + * + * This is "intersection-and-then-some" – we assume this transducer + * is a monolingual analyser, and &t is a bidix, where outputs from + * the analyser would go through apertium-pretransfer before being + * sent to the bidix. So it has to handle the same transformations + * that apertium-pretransfer applies, e.g. "^a+b#c$" from the + * monodix analyser has to match "^a#c$" and "^b$" in the + * bidix. * * The returned transducer is not minimized! 
Minimization will exit * with failure if there are no finals, but we might want to * continue with intersecting the other sections. * - * @param t the Transducer with which this class is intersected + * @param bi the Transducer with which this class is intersected * @param my_a the alphabet of this transducer - * @param t_a the alphabet of the transducer t + * @param bi_a the alphabet of the transducer bi * @return the trimmed transducer */ - Transducer intersect(Transducer &t, + Transducer trim(Transducer &bi, Alphabet const &my_a, - Alphabet const &t_a, - int const epsilon_tag = 0); + Alphabet const &bi_a, + int epsilon_tag = 0); + + /** + * Composes two finite-state transducers + * + * The returned transducer is not minimized! Minimization will exit + * with failure if there are no finals, but we might want to + * continue with composing the other sections. + * + * Weights are added. The alphabet of this transducer is re-used – + * and may be altered if g adds new symbols! + * + * @param g the Transducer with which this is composed + * @param my_a the alphabet of this transducer + * @param g_a the alphabet of the transducer g + * @param f_inverted run composition right-to-left on this transducer + * @param g_anywhere don't require anchored matches, let g optionally compose at any sub-path + * @return the composition gf + */ + Transducer compose(Transducer const &g, + Alphabet &my_a, // not const since we may add symbols from g_a + Alphabet const &g_a, + bool f_inverted = false, + bool g_anywhere = false, + int epsilon_tag = 0); + + /** + * Helper for creating the arc label when composing g ∘ f, used by compose.
+ * + * @return symbol-pair going from input of f to output of g (left/right-inverted if f_inverted) + */ + inline + int32_t composeLabel(Alphabet &f_a, Alphabet const &g_a, + int32_t f_input, int32_t g_right, + bool f_inverted) const + { + int32_t gf_label; + if (g_right >= 0) { // optimisation: non-symbols are equal across alphabets + gf_label = f_inverted ? f_a(g_right, f_input) + : f_a(f_input, g_right); + } + else { + UString g_rightstr; + g_a.getSymbol(g_rightstr, g_right); + gf_label = f_inverted ? f_a(f_a(g_rightstr), f_input) + : f_a(f_input, f_a(g_rightstr)); + } + return gf_label; + } /** * Ensure that new_alpha contains all the symbols in old_alpha @@ -423,6 +476,28 @@ * single symbols rather than pairs. */ void updateAlphabet(Alphabet& old_alpha, Alphabet& new_alpha, bool has_pairs = true); + + /** + * Invert all transitions so x:y becomes y:x (this will update alpha). + */ + void invert(Alphabet& alpha); + + /** + * Deletes all transitions with a symbol pair in syms + */ + void deleteSymbols(const sorted_vector& syms); + + /** + * For every transition with a symbol pair in syms, + * change the symbol pair to epsilon + */ + void epsilonizeSymbols(const sorted_vector& syms); + + /** + * Given a map of symbols, x:[a,b,c], + * expand all x:y transitions to x:y, a:y, b:y, c:y + */ + void applyACX(Alphabet& alpha, const std::map>& acx); }; #endif diff -Nru lttoolbox-3.6.6/lttoolbox/trans_exe.cc lttoolbox-3.7.1/lttoolbox/trans_exe.cc --- lttoolbox-3.6.6/lttoolbox/trans_exe.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/trans_exe.cc 2022-11-01 08:36:47.000000000 +0000 @@ -93,7 +93,7 @@ int base = 0; double base_weight = default_weight; - map myfinals; + std::map myfinals; while(finals_size > 0) { @@ -104,7 +104,7 @@ { base_weight = Compression::long_multibyte_read(input); } - myfinals.insert(make_pair(base, base_weight)); + myfinals.insert({base, base_weight}); } @@ -114,10 +114,10 @@ int current_state = 0; 
new_t.node_list.resize(number_of_states); - for(map::iterator it = myfinals.begin(), limit = myfinals.end(); + for(auto it = myfinals.begin(), limit = myfinals.end(); it != limit; it++) { - new_t.finals.insert(make_pair(&new_t.node_list[it->first], it->second)); + new_t.finals.insert({&new_t.node_list[it->first], it->second}); } while(number_of_states > 0) @@ -158,7 +158,7 @@ } finals.clear(); - finals.insert(make_pair(newfinal, default_weight)); + finals.insert({newfinal, default_weight}); } Node * @@ -167,7 +167,7 @@ return &node_list[initial_id]; } -map & +std::map & TransExe::getFinals() { return finals; diff -Nru lttoolbox-3.6.6/lttoolbox/trans_exe.h lttoolbox-3.7.1/lttoolbox/trans_exe.h --- lttoolbox-3.6.6/lttoolbox/trans_exe.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/trans_exe.h 2022-11-01 08:36:47.000000000 +0000 @@ -27,7 +27,6 @@ #include #include -using namespace std; /** * Transducer class for execution of lexical processing algorithms @@ -48,12 +47,12 @@ /** * Node list */ - vector node_list; + std::vector node_list; /** * Final node set mapped to its weight walues */ - map finals; + std::map finals; /** * Copy function @@ -113,7 +112,7 @@ * Gets the set of final nodes * @return the set of final nodes */ - map & getFinals(); + std::map & getFinals(); }; #endif diff -Nru lttoolbox-3.6.6/lttoolbox/ustring.cc lttoolbox-3.7.1/lttoolbox/ustring.cc --- lttoolbox-3.6.6/lttoolbox/ustring.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/ustring.cc 2022-11-01 08:36:47.000000000 +0000 @@ -25,10 +25,10 @@ using namespace icu; void -write(const UString& str, UFILE* output) +write(UStringView str, UFILE* output) { // u_fputs() inserts a newline - u_fprintf(output, "%S", str.c_str()); + // the '*' precision is fetched from the varargs as an int, so narrow size_t explicitly + u_fprintf(output, "%.*S", static_cast<int>(str.size()), str.data()); } UString @@ -48,7 +48,7 @@ } void -ustring_to_vec32(const UString& str, std::vector& vec) +ustring_to_vec32(UStringView str, std::vector& vec) { if (str.empty()) { return; diff -Nru
lttoolbox-3.6.6/lttoolbox/ustring.h lttoolbox-3.7.1/lttoolbox/ustring.h --- lttoolbox-3.6.6/lttoolbox/ustring.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/ustring.h 2022-11-01 08:36:47.000000000 +0000 @@ -21,34 +21,34 @@ #include #include #include +#include #include #include #include #include typedef std::basic_string UString; +typedef std::basic_string_view UStringView; -void write(const UString& str, UFILE* output); +void write(UStringView str, UFILE* output); UString to_ustring(const char* str); UString to_ustring(const uint8_t* str); // append UTF-16 string to UTF-32 vector of symbols -void ustring_to_vec32(const UString& str, std::vector& vec); +void ustring_to_vec32(UStringView str, std::vector& vec); inline std::ostream& operator<<(std::ostream& ostr, char16_t c) { - ostr << std::hex << static_cast(c); + utf8::utf16to8(&c, &c+1, std::ostream_iterator(ostr)); return ostr; } inline std::ostream& -operator<<(std::ostream& ostr, const UString& str) +operator<<(std::ostream& ostr, UStringView str) { - std::string res; - utf8::utf16to8(str.begin(), str.end(), std::back_inserter(res)); - ostr << res; + utf8::utf16to8(str.begin(), str.end(), std::ostream_iterator(ostr)); return ostr; } @@ -68,6 +68,14 @@ return us; } +inline UStringView operator "" _uv(const char16_t* str, std::size_t len) { + return UStringView(str, len); +} + +inline UString US(UStringView usv) { + return UString(usv); +} + inline void operator+=(UString& str, UChar32 c) { if (c <= 0xFFFF) { diff -Nru lttoolbox-3.6.6/lttoolbox/xml_parse_util.cc lttoolbox-3.7.1/lttoolbox/xml_parse_util.cc --- lttoolbox-3.6.6/lttoolbox/xml_parse_util.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/xml_parse_util.cc 2022-11-01 08:36:47.000000000 +0000 @@ -20,40 +20,43 @@ #include #include -using namespace std; - -UString -XMLParseUtil::attrib(xmlTextReaderPtr reader, UString const &name) +xmlTextReaderPtr +XMLParseUtil::open_or_exit(const char* fname) { - return 
attrib(reader, name, ""_u); + xmlTextReaderPtr reader = xmlReaderForFile(fname, NULL, 0); + if (reader == NULL) { + std::cerr << "Error: cannot open '" << fname << "' for reading." << std::endl; + exit(EXIT_FAILURE); + } + return reader; } UString -XMLParseUtil::attrib(xmlTextReaderPtr reader, UString const& name, const UString& fallback) +XMLParseUtil::attrib(xmlTextReaderPtr reader, UStringView name, UStringView fallback) { std::string temp; temp.reserve(name.size()); utf8::utf16to8(name.begin(), name.end(), std::back_inserter(temp)); - const xmlChar *attrname = reinterpret_cast(temp.c_str()); - xmlChar *myattr = xmlTextReaderGetAttribute(reader, attrname); + auto attrname = reinterpret_cast(temp.c_str()); + auto myattr = xmlTextReaderGetAttribute(reader, attrname); if(myattr == NULL) { xmlFree(myattr); - return fallback; + return US(fallback); } else { - UString result = to_ustring(reinterpret_cast(myattr)); + auto result = to_ustring(reinterpret_cast(myattr)); xmlFree(myattr); return result; } } std::string -XMLParseUtil::attrib_str(xmlTextReaderPtr reader, const UString& name) +XMLParseUtil::attrib_str(xmlTextReaderPtr reader, UStringView name) { std::string temp; temp.reserve(name.size()); utf8::utf16to8(name.begin(), name.end(), std::back_inserter(temp)); - const xmlChar *attrname = reinterpret_cast(temp.c_str()); - xmlChar *myattr = xmlTextReaderGetAttribute(reader, attrname); + auto attrname = reinterpret_cast(temp.c_str()); + auto myattr = xmlTextReaderGetAttribute(reader, attrname); if(myattr == NULL) { xmlFree(myattr); return ""; @@ -81,7 +84,7 @@ } void -XMLParseUtil::readValueInto32(xmlTextReaderPtr reader, vector& vec) +XMLParseUtil::readValueInto32(xmlTextReaderPtr reader, std::vector& vec) { const xmlChar* val = xmlTextReaderConstValue(reader); if (val == NULL) return; @@ -89,3 +92,12 @@ vec.reserve(vec.size() + sz); utf8::utf8to32(val, val+sz, std::back_inserter(vec)); } + +bool +XMLParseUtil::allBlanks(xmlTextReaderPtr reader) +{ + for (auto& c : 
readValue(reader)) { + if (!u_isspace(c)) return false; + } + return true; +} diff -Nru lttoolbox-3.6.6/lttoolbox/xml_parse_util.h lttoolbox-3.7.1/lttoolbox/xml_parse_util.h --- lttoolbox-3.6.6/lttoolbox/xml_parse_util.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/xml_parse_util.h 2022-11-01 08:36:47.000000000 +0000 @@ -24,23 +24,23 @@ #include #include -using namespace std; class XMLParseUtil { public: - /* If attrib does not exist (or other error), returns an empty string: */ - static UString attrib(xmlTextReaderPtr reader, UString const &name); + static xmlTextReaderPtr open_or_exit(const char* fname); /* If attrib does not exist (or other error), returns fallback: */ - static UString attrib(xmlTextReaderPtr reader, UString const &name, const UString& fallback); + static UString attrib(xmlTextReaderPtr reader, UStringView name, UStringView fallback=u""); - static string attrib_str(xmlTextReaderPtr reader, const UString& name); + static std::string attrib_str(xmlTextReaderPtr reader, UStringView name); static UString readName(xmlTextReaderPtr reader); static UString readValue(xmlTextReaderPtr reader); - static void readValueInto32(xmlTextReaderPtr reader, vector& vec); + static void readValueInto32(xmlTextReaderPtr reader, std::vector& vec); + + static bool allBlanks(xmlTextReaderPtr reader); }; #endif diff -Nru lttoolbox-3.6.6/lttoolbox/xml_walk_util.cc lttoolbox-3.7.1/lttoolbox/xml_walk_util.cc --- lttoolbox-3.6.6/lttoolbox/xml_walk_util.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/xml_walk_util.cc 2022-11-01 08:36:47.000000000 +0000 @@ -91,18 +91,12 @@ } UString -getattr(xmlNode* node, const UString& attr, const UString& fallback) +getattr(xmlNode* node, UStringView attr, UStringView fallback) { for (xmlAttr* i = node->properties; i != NULL; i = i->next) { if (to_ustring((const char*) i->name) == attr) { return to_ustring((const char*) i->children->content); } } - return fallback; -} - -UString -getattr(xmlNode* 
node, const UString& attr) -{ - return getattr(node, attr, ""_u); + return US(fallback); } diff -Nru lttoolbox-3.6.6/lttoolbox/xml_walk_util.h lttoolbox-3.7.1/lttoolbox/xml_walk_util.h --- lttoolbox-3.6.6/lttoolbox/xml_walk_util.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/xml_walk_util.h 2022-11-01 08:36:47.000000000 +0000 @@ -28,7 +28,6 @@ void error_and_die(xmlNode* node, const char* fmt, ...); UString getattr(xmlNode* node, const char* attr); -UString getattr(xmlNode* node, const UString& attr, const UString& fallback); -UString getattr(xmlNode* node, const UString& attr); +UString getattr(xmlNode* node, UStringView attr, UStringView fallback = u""); #endif diff -Nru lttoolbox-3.6.6/lttoolbox/xsd/acx.xsd lttoolbox-3.7.1/lttoolbox/xsd/acx.xsd --- lttoolbox-3.6.6/lttoolbox/xsd/acx.xsd 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/xsd/acx.xsd 2022-11-01 08:36:47.000000000 +0000 @@ -14,7 +14,7 @@ - + @@ -25,7 +25,7 @@ - + diff -Nru lttoolbox-3.6.6/lttoolbox.pc.in lttoolbox-3.7.1/lttoolbox.pc.in --- lttoolbox-3.6.6/lttoolbox.pc.in 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox.pc.in 2022-11-01 08:36:47.000000000 +0000 @@ -6,5 +6,5 @@ Name: lttoolbox Description: Augmented letter transducer tools for natural language processing Version: @VERSION@ -Libs: -L${libdir} -llttoolbox@VERSION_MAJOR@ -Cflags: -I${includedir}/lttoolbox-@VERSION_API@ +Cflags: -I${includedir} +Libs: -L${libdir} -l@PACKAGE_NAME@ diff -Nru lttoolbox-3.6.6/m4/ax_pthread.m4 lttoolbox-3.7.1/m4/ax_pthread.m4 --- lttoolbox-3.6.6/m4/ax_pthread.m4 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/m4/ax_pthread.m4 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,522 @@ +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_pthread.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_PTHREAD([ACTION-IF-FOUND[, 
ACTION-IF-NOT-FOUND]]) +# +# DESCRIPTION +# +# This macro figures out how to build C programs using POSIX threads. It +# sets the PTHREAD_LIBS output variable to the threads library and linker +# flags, and the PTHREAD_CFLAGS output variable to any special C compiler +# flags that are needed. (The user can also force certain compiler +# flags/libs to be tested by setting these environment variables.) +# +# Also sets PTHREAD_CC and PTHREAD_CXX to any special C compiler that is +# needed for multi-threaded programs (defaults to the value of CC +# respectively CXX otherwise). (This is necessary on e.g. AIX to use the +# special cc_r/CC_r compiler alias.) +# +# NOTE: You are assumed to not only compile your program with these flags, +# but also to link with them as well. For example, you might link with +# $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS +# $PTHREAD_CXX $CXXFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS +# +# If you are only building threaded programs, you may wish to use these +# variables in your default LIBS, CFLAGS, and CC: +# +# LIBS="$PTHREAD_LIBS $LIBS" +# CFLAGS="$CFLAGS $PTHREAD_CFLAGS" +# CXXFLAGS="$CXXFLAGS $PTHREAD_CFLAGS" +# CC="$PTHREAD_CC" +# CXX="$PTHREAD_CXX" +# +# In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant +# has a nonstandard name, this macro defines PTHREAD_CREATE_JOINABLE to +# that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX). +# +# Also HAVE_PTHREAD_PRIO_INHERIT is defined if pthread is found and the +# PTHREAD_PRIO_INHERIT symbol is defined when compiling with +# PTHREAD_CFLAGS. +# +# ACTION-IF-FOUND is a list of shell commands to run if a threads library +# is found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it +# is not found. If ACTION-IF-FOUND is not specified, the default action +# will define HAVE_PTHREAD. +# +# Please let the authors know if this macro fails on any platform, or if +# you have any other suggestions or comments. 
This macro was based on work +# by SGJ on autoconf scripts for FFTW (http://www.fftw.org/) (with help +# from M. Frigo), as well as ac_pthread and hb_pthread macros posted by +# Alejandro Forero Cuervo to the autoconf macro repository. We are also +# grateful for the helpful feedback of numerous users. +# +# Updated for Autoconf 2.68 by Daniel Richard G. +# +# LICENSE +# +# Copyright (c) 2008 Steven G. Johnson +# Copyright (c) 2011 Daniel Richard G. +# Copyright (c) 2019 Marc Stevens +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. 
+ +#serial 31 + +AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD]) +AC_DEFUN([AX_PTHREAD], [ +AC_REQUIRE([AC_CANONICAL_HOST]) +AC_REQUIRE([AC_PROG_CC]) +AC_REQUIRE([AC_PROG_SED]) +AC_LANG_PUSH([C]) +ax_pthread_ok=no + +# We used to check for pthread.h first, but this fails if pthread.h +# requires special compiler flags (e.g. on Tru64 or Sequent). +# It gets checked for in the link test anyway. + +# First of all, check if the user has set any of the PTHREAD_LIBS, +# etcetera environment variables, and if threads linking works using +# them: +if test "x$PTHREAD_CFLAGS$PTHREAD_LIBS" != "x"; then + ax_pthread_save_CC="$CC" + ax_pthread_save_CFLAGS="$CFLAGS" + ax_pthread_save_LIBS="$LIBS" + AS_IF([test "x$PTHREAD_CC" != "x"], [CC="$PTHREAD_CC"]) + AS_IF([test "x$PTHREAD_CXX" != "x"], [CXX="$PTHREAD_CXX"]) + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + LIBS="$PTHREAD_LIBS $LIBS" + AC_MSG_CHECKING([for pthread_join using $CC $PTHREAD_CFLAGS $PTHREAD_LIBS]) + AC_LINK_IFELSE([AC_LANG_CALL([], [pthread_join])], [ax_pthread_ok=yes]) + AC_MSG_RESULT([$ax_pthread_ok]) + if test "x$ax_pthread_ok" = "xno"; then + PTHREAD_LIBS="" + PTHREAD_CFLAGS="" + fi + CC="$ax_pthread_save_CC" + CFLAGS="$ax_pthread_save_CFLAGS" + LIBS="$ax_pthread_save_LIBS" +fi + +# We must check for the threads library under a number of different +# names; the ordering is very important because some systems +# (e.g. DEC) have both -lpthread and -lpthreads, where one of the +# libraries is broken (non-POSIX). + +# Create a list of thread flags to try. Items with a "," contain both +# C compiler flags (before ",") and linker flags (after ","). Other items +# starting with a "-" are C compiler flags, and remaining items are +# library names, except for "none" which indicates that we try without +# any flags at all, and "pthread-config" which is a program returning +# the flags for the Pth emulation library. 
+ +ax_pthread_flags="pthreads none -Kthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config" + +# The ordering *is* (sometimes) important. Some notes on the +# individual items follow: + +# pthreads: AIX (must check this before -lpthread) +# none: in case threads are in libc; should be tried before -Kthread and +# other compiler flags to prevent continual compiler warnings +# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) +# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads), Tru64 +# (Note: HP C rejects this with "bad form for `-t' option") +# -pthreads: Solaris/gcc (Note: HP C also rejects) +# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it +# doesn't hurt to check since this sometimes defines pthreads and +# -D_REENTRANT too), HP C (must be checked before -lpthread, which +# is present but should not be used directly; and before -mthreads, +# because the compiler interprets this as "-mt" + "-hreads") +# -mthreads: Mingw32/gcc, Lynx/gcc +# pthread: Linux, etcetera +# --thread-safe: KAI C++ +# pthread-config: use pthread-config program (for GNU Pth library) + +case $host_os in + + freebsd*) + + # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) + # lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) + + ax_pthread_flags="-kthread lthread $ax_pthread_flags" + ;; + + hpux*) + + # From the cc(1) man page: "[-mt] Sets various -D flags to enable + # multi-threading and also sets -lpthread." + + ax_pthread_flags="-mt -pthread pthread $ax_pthread_flags" + ;; + + openedition*) + + # IBM z/OS requires a feature-test macro to be defined in order to + # enable POSIX threads at all, so give the user a hint if this is + # not set. (We don't define these ourselves, as they can affect + # other portions of the system API in unpredictable ways.) 
+ + AC_EGREP_CPP([AX_PTHREAD_ZOS_MISSING], + [ +# if !defined(_OPEN_THREADS) && !defined(_UNIX03_THREADS) + AX_PTHREAD_ZOS_MISSING +# endif + ], + [AC_MSG_WARN([IBM z/OS requires -D_OPEN_THREADS or -D_UNIX03_THREADS to enable pthreads support.])]) + ;; + + solaris*) + + # On Solaris (at least, for some versions), libc contains stubbed + # (non-functional) versions of the pthreads routines, so link-based + # tests will erroneously succeed. (N.B.: The stubs are missing + # pthread_cleanup_push, or rather a function called by this macro, + # so we could check for that, but who knows whether they'll stub + # that too in a future libc.) So we'll check first for the + # standard Solaris way of linking pthreads (-mt -lpthread). + + ax_pthread_flags="-mt,-lpthread pthread $ax_pthread_flags" + ;; +esac + +# Are we compiling with Clang? + +AC_CACHE_CHECK([whether $CC is Clang], + [ax_cv_PTHREAD_CLANG], + [ax_cv_PTHREAD_CLANG=no + # Note that Autoconf sets GCC=yes for Clang as well as GCC + if test "x$GCC" = "xyes"; then + AC_EGREP_CPP([AX_PTHREAD_CC_IS_CLANG], + [/* Note: Clang 2.7 lacks __clang_[a-z]+__ */ +# if defined(__clang__) && defined(__llvm__) + AX_PTHREAD_CC_IS_CLANG +# endif + ], + [ax_cv_PTHREAD_CLANG=yes]) + fi + ]) +ax_pthread_clang="$ax_cv_PTHREAD_CLANG" + + +# GCC generally uses -pthread, or -pthreads on some platforms (e.g. SPARC) + +# Note that for GCC and Clang -pthread generally implies -lpthread, +# except when -nostdlib is passed. 
+# This is problematic using libtool to build C++ shared libraries with pthread: +# [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=25460 +# [2] https://bugzilla.redhat.com/show_bug.cgi?id=661333 +# [3] https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=468555 +# To solve this, first try -pthread together with -lpthread for GCC + +AS_IF([test "x$GCC" = "xyes"], + [ax_pthread_flags="-pthread,-lpthread -pthread -pthreads $ax_pthread_flags"]) + +# Clang takes -pthread (never supported any other flag), but we'll try with -lpthread first + +AS_IF([test "x$ax_pthread_clang" = "xyes"], + [ax_pthread_flags="-pthread,-lpthread -pthread"]) + + +# The presence of a feature test macro requesting re-entrant function +# definitions is, on some systems, a strong hint that pthreads support is +# correctly enabled + +case $host_os in + darwin* | hpux* | linux* | osf* | solaris*) + ax_pthread_check_macro="_REENTRANT" + ;; + + aix*) + ax_pthread_check_macro="_THREAD_SAFE" + ;; + + *) + ax_pthread_check_macro="--" + ;; +esac +AS_IF([test "x$ax_pthread_check_macro" = "x--"], + [ax_pthread_check_cond=0], + [ax_pthread_check_cond="!defined($ax_pthread_check_macro)"]) + + +if test "x$ax_pthread_ok" = "xno"; then +for ax_pthread_try_flag in $ax_pthread_flags; do + + case $ax_pthread_try_flag in + none) + AC_MSG_CHECKING([whether pthreads work without any flags]) + ;; + + *,*) + PTHREAD_CFLAGS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\1/"` + PTHREAD_LIBS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\2/"` + AC_MSG_CHECKING([whether pthreads work with "$PTHREAD_CFLAGS" and "$PTHREAD_LIBS"]) + ;; + + -*) + AC_MSG_CHECKING([whether pthreads work with $ax_pthread_try_flag]) + PTHREAD_CFLAGS="$ax_pthread_try_flag" + ;; + + pthread-config) + AC_CHECK_PROG([ax_pthread_config], [pthread-config], [yes], [no]) + AS_IF([test "x$ax_pthread_config" = "xno"], [continue]) + PTHREAD_CFLAGS="`pthread-config --cflags`" + PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`" + ;; 
+ + *) + AC_MSG_CHECKING([for the pthreads library -l$ax_pthread_try_flag]) + PTHREAD_LIBS="-l$ax_pthread_try_flag" + ;; + esac + + ax_pthread_save_CFLAGS="$CFLAGS" + ax_pthread_save_LIBS="$LIBS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + LIBS="$PTHREAD_LIBS $LIBS" + + # Check for various functions. We must include pthread.h, + # since some functions may be macros. (On the Sequent, we + # need a special flag -Kthread to make this header compile.) + # We check for pthread_join because it is in -lpthread on IRIX + # while pthread_create is in libc. We check for pthread_attr_init + # due to DEC craziness with -lpthreads. We check for + # pthread_cleanup_push because it is one of the few pthread + # functions on Solaris that doesn't have a non-functional libc stub. + # We try pthread_create on general principles. + + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include +# if $ax_pthread_check_cond +# error "$ax_pthread_check_macro must be defined" +# endif + static void *some_global = NULL; + static void routine(void *a) + { + /* To avoid any unused-parameter or + unused-but-set-parameter warning. */ + some_global = a; + } + static void *start_routine(void *a) { return a; }], + [pthread_t th; pthread_attr_t attr; + pthread_create(&th, 0, start_routine, 0); + pthread_join(th, 0); + pthread_attr_init(&attr); + pthread_cleanup_push(routine, 0); + pthread_cleanup_pop(0) /* ; */])], + [ax_pthread_ok=yes], + []) + + CFLAGS="$ax_pthread_save_CFLAGS" + LIBS="$ax_pthread_save_LIBS" + + AC_MSG_RESULT([$ax_pthread_ok]) + AS_IF([test "x$ax_pthread_ok" = "xyes"], [break]) + + PTHREAD_LIBS="" + PTHREAD_CFLAGS="" +done +fi + + +# Clang needs special handling, because older versions handle the -pthread +# option in a rather... idiosyncratic way + +if test "x$ax_pthread_clang" = "xyes"; then + + # Clang takes -pthread; it has never supported any other flag + + # (Note 1: This will need to be revisited if a system that Clang + # supports has POSIX threads in a separate library. 
This tends not + # to be the way of modern systems, but it's conceivable.) + + # (Note 2: On some systems, notably Darwin, -pthread is not needed + # to get POSIX threads support; the API is always present and + # active. We could reasonably leave PTHREAD_CFLAGS empty. But + # -pthread does define _REENTRANT, and while the Darwin headers + # ignore this macro, third-party headers might not.) + + # However, older versions of Clang make a point of warning the user + # that, in an invocation where only linking and no compilation is + # taking place, the -pthread option has no effect ("argument unused + # during compilation"). They expect -pthread to be passed in only + # when source code is being compiled. + # + # Problem is, this is at odds with the way Automake and most other + # C build frameworks function, which is that the same flags used in + # compilation (CFLAGS) are also used in linking. Many systems + # supported by AX_PTHREAD require exactly this for POSIX threads + # support, and in fact it is often not straightforward to specify a + # flag that is used only in the compilation phase and not in + # linking. Such a scenario is extremely rare in practice. + # + # Even though use of the -pthread flag in linking would only print + # a warning, this can be a nuisance for well-run software projects + # that build with -Werror. So if the active version of Clang has + # this misfeature, we search for an option to squash it. 
+ + AC_CACHE_CHECK([whether Clang needs flag to prevent "argument unused" warning when linking with -pthread], + [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG], + [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG=unknown + # Create an alternate version of $ac_link that compiles and + # links in two steps (.c -> .o, .o -> exe) instead of one + # (.c -> exe), because the warning occurs only in the second + # step + ax_pthread_save_ac_link="$ac_link" + ax_pthread_sed='s/conftest\.\$ac_ext/conftest.$ac_objext/g' + ax_pthread_link_step=`AS_ECHO(["$ac_link"]) | sed "$ax_pthread_sed"` + ax_pthread_2step_ac_link="($ac_compile) && (echo ==== >&5) && ($ax_pthread_link_step)" + ax_pthread_save_CFLAGS="$CFLAGS" + for ax_pthread_try in '' -Qunused-arguments -Wno-unused-command-line-argument unknown; do + AS_IF([test "x$ax_pthread_try" = "xunknown"], [break]) + CFLAGS="-Werror -Wunknown-warning-option $ax_pthread_try -pthread $ax_pthread_save_CFLAGS" + ac_link="$ax_pthread_save_ac_link" + AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])], + [ac_link="$ax_pthread_2step_ac_link" + AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])], + [break]) + ]) + done + ac_link="$ax_pthread_save_ac_link" + CFLAGS="$ax_pthread_save_CFLAGS" + AS_IF([test "x$ax_pthread_try" = "x"], [ax_pthread_try=no]) + ax_cv_PTHREAD_CLANG_NO_WARN_FLAG="$ax_pthread_try" + ]) + + case "$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG" in + no | unknown) ;; + *) PTHREAD_CFLAGS="$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG $PTHREAD_CFLAGS" ;; + esac + +fi # $ax_pthread_clang = yes + + + +# Various other checks: +if test "x$ax_pthread_ok" = "xyes"; then + ax_pthread_save_CFLAGS="$CFLAGS" + ax_pthread_save_LIBS="$LIBS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + LIBS="$PTHREAD_LIBS $LIBS" + + # Detect AIX lossage: JOINABLE attribute is called UNDETACHED. 
+ AC_CACHE_CHECK([for joinable pthread attribute], + [ax_cv_PTHREAD_JOINABLE_ATTR], + [ax_cv_PTHREAD_JOINABLE_ATTR=unknown + for ax_pthread_attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [int attr = $ax_pthread_attr; return attr /* ; */])], + [ax_cv_PTHREAD_JOINABLE_ATTR=$ax_pthread_attr; break], + []) + done + ]) + AS_IF([test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xunknown" && \ + test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xPTHREAD_CREATE_JOINABLE" && \ + test "x$ax_pthread_joinable_attr_defined" != "xyes"], + [AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE], + [$ax_cv_PTHREAD_JOINABLE_ATTR], + [Define to necessary symbol if this constant + uses a non-standard name on your system.]) + ax_pthread_joinable_attr_defined=yes + ]) + + AC_CACHE_CHECK([whether more special flags are required for pthreads], + [ax_cv_PTHREAD_SPECIAL_FLAGS], + [ax_cv_PTHREAD_SPECIAL_FLAGS=no + case $host_os in + solaris*) + ax_cv_PTHREAD_SPECIAL_FLAGS="-D_POSIX_PTHREAD_SEMANTICS" + ;; + esac + ]) + AS_IF([test "x$ax_cv_PTHREAD_SPECIAL_FLAGS" != "xno" && \ + test "x$ax_pthread_special_flags_added" != "xyes"], + [PTHREAD_CFLAGS="$ax_cv_PTHREAD_SPECIAL_FLAGS $PTHREAD_CFLAGS" + ax_pthread_special_flags_added=yes]) + + AC_CACHE_CHECK([for PTHREAD_PRIO_INHERIT], + [ax_cv_PTHREAD_PRIO_INHERIT], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]], + [[int i = PTHREAD_PRIO_INHERIT; + return i;]])], + [ax_cv_PTHREAD_PRIO_INHERIT=yes], + [ax_cv_PTHREAD_PRIO_INHERIT=no]) + ]) + AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes" && \ + test "x$ax_pthread_prio_inherit_defined" != "xyes"], + [AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.]) + ax_pthread_prio_inherit_defined=yes + ]) + + CFLAGS="$ax_pthread_save_CFLAGS" + LIBS="$ax_pthread_save_LIBS" + + # More AIX lossage: compile with *_r variant + if test "x$GCC" != "xyes"; then + case $host_os in + aix*) + AS_CASE(["x/$CC"], + 
[x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6], + [#handle absolute path differently from PATH based program lookup + AS_CASE(["x$CC"], + [x/*], + [ + AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"]) + AS_IF([test "x${CXX}" != "x"], [AS_IF([AS_EXECUTABLE_P([${CXX}_r])],[PTHREAD_CXX="${CXX}_r"])]) + ], + [ + AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC]) + AS_IF([test "x${CXX}" != "x"], [AC_CHECK_PROGS([PTHREAD_CXX],[${CXX}_r],[$CXX])]) + ] + ) + ]) + ;; + esac + fi +fi + +test -n "$PTHREAD_CC" || PTHREAD_CC="$CC" +test -n "$PTHREAD_CXX" || PTHREAD_CXX="$CXX" + +AC_SUBST([PTHREAD_LIBS]) +AC_SUBST([PTHREAD_CFLAGS]) +AC_SUBST([PTHREAD_CC]) +AC_SUBST([PTHREAD_CXX]) + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +if test "x$ax_pthread_ok" = "xyes"; then + ifelse([$1],,[AC_DEFINE([HAVE_PTHREAD],[1],[Define if you have POSIX threads libraries and header files.])],[$1]) + : +else + ax_pthread_ok=no + $2 +fi +AC_LANG_POP +])dnl AX_PTHREAD diff -Nru lttoolbox-3.6.6/python/setup.py.in lttoolbox-3.7.1/python/setup.py.in --- lttoolbox-3.6.6/python/setup.py.in 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/python/setup.py.in 2022-11-01 08:36:47.000000000 +0000 @@ -19,7 +19,7 @@ swig_opts = ["-c++", '-I..', "-I@top_srcdir@", "-Wall"], include_dirs=['@top_srcdir@', '@top_srcdir@/lttoolbox'] + '@LIBXML_CFLAGS@'.replace('-I', '').split() + '@ICU_CFLAGS@'.replace('-I', '').split(), library_dirs=['@top_srcdir@/lttoolbox/.libs'], - libraries=['lttoolbox@VERSION_MAJOR@', 'xml2', 'icuio', 'icui18n', 'icuuc', 'icudata'], + libraries=['lttoolbox', 'xml2', 'icuio', 'icui18n', 'icuuc', 'icudata'], extra_compile_args=compile_args, extra_link_args=link_args, ) diff -Nru lttoolbox-3.6.6/README lttoolbox-3.7.1/README --- lttoolbox-3.6.6/README 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/README 2022-11-01 08:36:47.000000000 +0000 @@ -8,8 +8,8 @@ process of splitting a word like `cats` into its lemma `cat` and 
the grammatical information `<n><pl>`. Generation is the opposite process. -The three programs main programs are lt-comp, the compiler, lt-proc, -the processor, and lt-expand, which generates all possible mappings +The three main programs are lt-comp, the compiler, lt-proc, the +processor, and lt-expand, which generates all possible mappings between surface forms and lexical forms in the dictionary. Executables built by this package: @@ -29,7 +29,18 @@ would pass through a compiled bidix, creating a new compiled and trimmed analyser. -* `lt-print`: print the arcs of a transducer in [ATT format][3]. +* `lt-compose`: composes two compiled transducers (applying output of + the first to input of the second), with support for flipping labels + and allowing incomplete matches. + +* `lt-print`: prints the arcs of a transducer in [ATT format][3]. + +* `lt-append`: merges two compiled dictionaries. + +* `lt-paradigm`: extracts all paths from a compiled dictionary + matching an input pattern. + +* `lsx-comp`: an alias of `lt-comp`. There is also a C++ API that you can link to (see how [apertium][1] or [apertium-lex-tools][2] do this). diff -Nru lttoolbox-3.6.6/README.md lttoolbox-3.7.1/README.md --- lttoolbox-3.6.6/README.md 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/README.md 2022-11-01 08:36:47.000000000 +0000 @@ -8,8 +8,8 @@ process of splitting a word like `cats` into its lemma `cat` and the grammatical information `<n><pl>`. Generation is the opposite process. -The three programs main programs are lt-comp, the compiler, lt-proc, -the processor, and lt-expand, which generates all possible mappings +The three main programs are lt-comp, the compiler, lt-proc, the +processor, and lt-expand, which generates all possible mappings between surface forms and lexical forms in the dictionary. Executables built by this package: @@ -29,7 +29,18 @@ would pass through a compiled bidix, creating a new compiled and trimmed analyser.
-* `lt-print`: print the arcs of a transducer in [ATT format][3]. +* `lt-compose`: composes two compiled transducers (applying output of + the first to input of the second), with support for flipping labels + and allowing incomplete matches. + +* `lt-print`: prints the arcs of a transducer in [ATT format][3]. + +* `lt-append`: merges two compiled dictionaries. + +* `lt-paradigm`: extracts all paths from a compiled dictionary + matching an input pattern. + +* `lsx-comp`: an alias of `lt-comp`. There is also a C++ API that you can link to (see how [apertium][1] or [apertium-lex-tools][2] do this). diff -Nru lttoolbox-3.6.6/tests/basictest.py lttoolbox-3.7.1/tests/basictest.py --- lttoolbox-3.6.6/tests/basictest.py 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/tests/basictest.py 2022-11-01 08:36:47.000000000 +0000 @@ -1,5 +1,12 @@ # -*- coding: utf-8 -*- +import os +from shutil import rmtree import signal +from subprocess import run, call, PIPE, Popen +from sys import stderr +from tempfile import mkdtemp +from typing import List + class Alarm(Exception): pass @@ -29,12 +36,136 @@ try: char = self.withTimeout(2, process.stdout.read, 1) except Alarm: - pass + print("Timeout before reading a single character!", file=stderr) while char and char != b'\0': output.append(char) try: char = self.withTimeout(2, process.stdout.read, 1) except Alarm: + print("Timeout before reading %s chars" % len(output), + file=stderr) break # send what we got up till now return b"".join(output).decode('utf-8').replace('\r\n', '\n') + + def openPipe(self, procName, args): + return Popen([os.environ['LTTOOLBOX_PATH']+'/'+procName] + args, + stdin=PIPE, stdout=PIPE, stderr=PIPE) + def closePipe(self, proc, expectFail=False): + proc.communicate() # let it terminate + proc.stdin.close() + proc.stdout.close() + proc.stderr.close() + retCode = proc.poll() + if expectFail: + self.assertNotEqual(retCode, 0) + else: + self.assertEqual(retCode, 0) + + def compileDix(self, dir, dix, flags=None, 
binName='compiled.bin',
+                   expectFail=False):
+        return self.callProc('lt-comp',
+                             [dir, dix, binName],
+                             flags,
+                             expectFail)
+
+    def callProc(self, name, bins, flags=None, expectFail=False):
+        """Run one lttoolbox tool and assert on its exit status.
+
+        The command is $LTTOOLBOX_PATH/<name>, followed by flags (if any)
+        and then bins. With expectFail the tool must exit non-zero,
+        otherwise it must exit 0; on a mismatch the command line and the
+        captured stdout/stderr are printed before the assertion fails.
+
+        Returns True iff the tool exited 0. compileDix() returns this
+        value and ProcTest.runTest() stops early when compileTest() is
+        falsy, so returning nothing here silently skipped those tests.
+        """
+        cmd = [os.environ['LTTOOLBOX_PATH']+'/'+name] + (flags or []) + bins
+        res = run(cmd, capture_output=True)
+        succeeded = res.returncode == 0
+        if succeeded == expectFail:
+            print("\nFAILED CMD: " + " ".join(cmd))
+            print("\nSTDOUT:", res.stdout)
+            print("STDERR:", res.stderr)
+        if expectFail:
+            self.assertNotEqual(res.returncode, 0)
+        else:
+            self.assertEqual(res.returncode, 0)
+        return succeeded
+
+
+class TempDir:
+    def __enter__(self):
+        self.tmpd = mkdtemp()
+        return self.tmpd
+
+    def __exit__(self, *args):
+        rmtree(self.tmpd)
+
+
+class PrintTest(BasicTest):
+    """See lt_print test for how to use this. Override runTest if you don't
+    want to use NUL flushing."""
+
+    printdix = "data/minimal-mono.dix"
+    printdir = "lr"
+    expectedOutput = ""
+    expectedRetCodeFail = False
+    printflags = []
+
+    def compileTest(self, tmpd):
+        return self.compileDix(self.printdir, self.printdix,
+                               binName=tmpd+'/compiled.bin')
+
+    def runTest(self):
+        with TempDir() as tmpd:
+            self.compileTest(tmpd)
+            self.printresult = self.openPipe('lt-print',
+                                             self.printflags +
+                                             [tmpd+'/compiled.bin'])
+
+            self.assertEqual(self.communicateFlush(None, self.printresult),
+                             self.expectedOutput)
+
+            self.closePipe(self.printresult, self.expectedRetCodeFail)
+
+
+class ProcTest(BasicTest):
+    """See lt_proc test for how to use this.
Override runTest if you don't + want to use NUL flushing.""" + + procdix = "data/minimal-mono.dix" + procdir = "lr" + compflags = [] # type: List[str] + procflags = ["-z"] + inputs = [""] + expectedOutputs = [""] + expectedRetCodeFail = False + expectedCompRetCodeFail = False + flushing = True + + def compileTest(self, tmpd): + return self.compileDix(self.procdir, self.procdix, + flags=self.compflags, + binName=tmpd+'/compiled.bin', + expectFail=self.expectedCompRetCodeFail) + + def runTest(self): + with TempDir() as tmpd: + if not self.compileTest(tmpd): + return + if self.flushing: + self.runTestFlush(tmpd) + else: + self.runTestNoFlush(tmpd) + + def openProc(self, tmpd): + return self.openPipe('lt-proc', self.procflags+[tmpd+'/compiled.bin']) + + def runTestFlush(self, tmpd): + proc = self.openProc(tmpd) + self.assertEqual(len(self.inputs), + len(self.expectedOutputs)) + for inp, exp in zip(self.inputs, self.expectedOutputs): + self.assertEqual(self.communicateFlush(inp+"[][\n]", proc), + exp+"[][\n]") + self.closePipe(proc, self.expectedRetCodeFail) + + def runTestNoFlush(self, tmpd): + for inp, exp in zip(self.inputs, self.expectedOutputs): + proc = self.openProc(tmpd) + self.assertEqual(proc.communicate(input=inp.encode('utf-8'))[0], + exp.encode('utf-8')) + retCode = proc.poll() + if self.expectedRetCodeFail: + self.assertNotEqual(retCode, 0) + else: + self.assertEqual(retCode, 0) diff -Nru lttoolbox-3.6.6/tests/data/a2b.dix lttoolbox-3.7.1/tests/data/a2b.dix --- lttoolbox-3.6.6/tests/data/a2b.dix 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/a2b.dix 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,12 @@ + + + ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- + + + + + +
+

ab

+
+
diff -Nru lttoolbox-3.6.6/tests/data/basic.acx lttoolbox-3.7.1/tests/data/basic.acx --- lttoolbox-3.6.6/tests/data/basic.acx 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/basic.acx 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,7 @@ + + + + + + + diff -Nru lttoolbox-3.6.6/tests/data/basic.lsx lttoolbox-3.7.1/tests/data/basic.lsx --- lttoolbox-3.6.6/tests/data/basic.lsx 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/basic.lsx 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,16 @@ + + + + + + + + + +
+

+ prpers + +

+
+
diff -Nru lttoolbox-3.6.6/tests/data/compose1.dix lttoolbox-3.7.1/tests/data/compose1.dix --- lttoolbox-3.6.6/tests/data/compose1.dix 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/compose1.dix 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,34 @@ + + + ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅabcdefghijklmnopqrstuvwxyzæøåcqwxzCQWXZéèêóòâôÉÊÈÓÔÒÂáàÁÀäÄöÖšŠčČðđÐýÝñÑüÜíÍıİËë-0123456789̇ + + + + + + + +

¤

+
+
+ +
+ + + +

opp¤ opp

+ + +

app app

+ + py

+ + + upp

+ + upp

+ + tupp

+ +
+
diff -Nru lttoolbox-3.6.6/tests/data/expand-re.dix lttoolbox-3.7.1/tests/data/expand-re.dix --- lttoolbox-3.6.6/tests/data/expand-re.dix 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/expand-re.dix 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,24 @@ + + + ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- + + + + + + + + + + + + +
+

abcab

+

abab

+

yy

+

nn

+ xyz:abc[qxj]+

+
+ +
diff -Nru lttoolbox-3.6.6/tests/data/oci-pgen.dix lttoolbox-3.7.1/tests/data/oci-pgen.dix --- lttoolbox-3.6.6/tests/data/oci-pgen.dix 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/oci-pgen.dix 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,165 @@ + + + + + + [aeiouàáéèíóòúü] + + + + +

+ detlo + del +

+
+ +

+ detlo + del' +

+ + + +

+ detla + del' +

+ + + +

+ detla + dela +

+ + +

+ detlos + dels +

+ + +

+ detlas + delas +

+ + +

+ lo + del' +

+ + + +

+ la + del' +

+ + + + +

+ detla + del' +

+ + +
+ +
+ +
+ +

+ de + +

+ + + + +

+ ade + a +

+ + + + +

+ adetlo + al +

+ + +

+ adetlos + als +

+ + +

+ adetlo + al' +

+ + + +

+ detla + l' +

+ + + +

+ detlo + l' +

+ + + +

+ lo + l' +

+ + + +

+ la + l' +

+ + + +

+ detla + la +

+ + +

+ detlo + lo +

+ + +

+ detlas + las +

+ + +

+ detlos + los +

+ + +
+ +
diff -Nru lttoolbox-3.6.6/tests/data/plus-lemma-bi.dix lttoolbox-3.7.1/tests/data/plus-lemma-bi.dix --- lttoolbox-3.6.6/tests/data/plus-lemma-bi.dix 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/plus-lemma-bi.dix 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,24 @@ + + + ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- + + + + + + + + + +
+

I+Dqqq

+

abxy

+

yz

+
+ +
+

jj

+

gg

+
+ +
diff -Nru lttoolbox-3.6.6/tests/data/plus-lemma-mono.dix lttoolbox-3.7.1/tests/data/plus-lemma-mono.dix --- lttoolbox-3.6.6/tests/data/plus-lemma-mono.dix 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/plus-lemma-mono.dix 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,28 @@ + + + ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- + + + + + + + + + + + + + + + +
+

I+DI+D

+

abcab

+

abab

+

yy

+

nn

+

jgjg

+
+ +
diff -Nru lttoolbox-3.6.6/tests/data/postgen-overlap.dix lttoolbox-3.7.1/tests/data/postgen-overlap.dix --- lttoolbox-3.6.6/tests/data/postgen-overlap.dix 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/postgen-overlap.dix 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,37 @@ + + + + + + + + " + + + +
+ + +

+ abc + xbc +

+
+ + +

+ bc + yc +

+
+ + +

+ c + z +

+
+ +
+ +
diff -Nru lttoolbox-3.6.6/tests/data/postgen-short.dix lttoolbox-3.7.1/tests/data/postgen-short.dix --- lttoolbox-3.6.6/tests/data/postgen-short.dix 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/postgen-short.dix 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,32 @@ + + + + + + + + + + +

+ la + sela +

+
+
+ + +
+ +
+ + +

+ ea + a +

+ + +
+ + diff -Nru lttoolbox-3.6.6/tests/data/pp2p.dix lttoolbox-3.7.1/tests/data/pp2p.dix --- lttoolbox-3.6.6/tests/data/pp2p.dix 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/pp2p.dix 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,13 @@ + + + ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- + + + + + +
+

pp¤p

+

pp¤pp

+
+
diff -Nru lttoolbox-3.6.6/tests/data/underscore.dix lttoolbox-3.7.1/tests/data/underscore.dix --- lttoolbox-3.6.6/tests/data/underscore.dix 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/underscore.dix 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,18 @@ + + + + + + + + + + +
+

_n_n

+

||

+
+ + + +
diff -Nru lttoolbox-3.6.6/tests/data/upp2up.dix lttoolbox-3.7.1/tests/data/upp2up.dix --- lttoolbox-3.6.6/tests/data/upp2up.dix 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/upp2up.dix 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,14 @@ + + + ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- + + + + + +
+

upp¤up

+

upp¤upp

+

pypy

+
+
diff -Nru lttoolbox-3.6.6/tests/data/variants.dix lttoolbox-3.7.1/tests/data/variants.dix --- lttoolbox-3.6.6/tests/data/variants.dix 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/data/variants.dix 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,30 @@ + + + ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- + + + + + + + + + + + + +
+

abcab

+

abab

+

ababbb

+

yy

+

nn

+
+ +
+

jgjg

+

jhjh

+

kgkg

+
+ +
diff -Nru lttoolbox-3.6.6/tests/lt_append/__init__.py lttoolbox-3.7.1/tests/lt_append/__init__.py --- lttoolbox-3.6.6/tests/lt_append/__init__.py 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/tests/lt_append/__init__.py 2022-11-01 08:36:47.000000000 +0000 @@ -1,15 +1,8 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - -import os -from proctest import ProcTest +from basictest import ProcTest import unittest -from subprocess import Popen, PIPE, call -from tempfile import mkdtemp -from shutil import rmtree - -class AppendProcTest(ProcTest): +class AppendProcTest(unittest.TestCase, ProcTest): dix1 = "data/append1.dix" dix2 = "data/append2.dix" dir1 = "lr" @@ -17,21 +10,11 @@ procflags = ["-z"] def compileTest(self, tmpd): - self.assertEqual(0, call([os.environ['LTTOOLBOX_PATH']+"/lt-comp", - self.dir1, - self.dix1, - tmpd+"/dix1.bin"], - stdout=PIPE)) - self.assertEqual(0, call([os.environ['LTTOOLBOX_PATH']+"/lt-comp", - self.dir2, - self.dix2, - tmpd+"/dix2.bin"], - stdout=PIPE)) - self.assertEqual(0, call([os.environ['LTTOOLBOX_PATH']+"/lt-append", - tmpd+"/dix1.bin", - tmpd+"/dix2.bin", - tmpd+"/compiled.bin"], - stdout=PIPE)) + self.compileDix(self.dir1, self.dix1, binName=tmpd+'/dix1.bin') + self.compileDix(self.dir2, self.dix2, binName=tmpd+'/dix2.bin') + self.callProc('lt-append', [tmpd+"/dix1.bin", + tmpd+"/dix2.bin", + tmpd+"/compiled.bin"]) return True class SimpleAppend(AppendProcTest): diff -Nru lttoolbox-3.6.6/tests/lt_apply_acx/__init__.py lttoolbox-3.7.1/tests/lt_apply_acx/__init__.py --- lttoolbox-3.6.6/tests/lt_apply_acx/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/lt_apply_acx/__init__.py 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,19 @@ +import unittest +from basictest import ProcTest + +class AcxTest(unittest.TestCase, ProcTest): + dix = 'data/minimal-mono.dix' + acx = 'data/basic.acx' + procdir = 'lr' + inputs = ['abc', 'ábc', 'äbc'] + expectedOutputs = ['^abc/ab$', + '^ábc/ab$', + 
'^äbc/ab$'] + + def compileTest(self, tmpd): + ret = self.compileDix(self.procdir, self.dix, + binName=tmpd+'/plain.bin') + if not ret: return ret + self.callProc('lt-apply-acx', + [tmpd+'/plain.bin', self.acx, tmpd+'/compiled.bin']) + return True diff -Nru lttoolbox-3.6.6/tests/lt_comp/__init__.py lttoolbox-3.7.1/tests/lt_comp/__init__.py --- lttoolbox-3.6.6/tests/lt_comp/__init__.py 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/tests/lt_comp/__init__.py 2022-11-01 08:36:47.000000000 +0000 @@ -1,50 +1,115 @@ # -*- coding: utf-8 -*- -from proctest import ProcTest +from basictest import ProcTest, PrintTest import unittest -class CompNormalAndJoin(ProcTest): +class CompNormalAndJoin(unittest.TestCase, ProcTest): inputs = ["abc", "ab", "y", "n", "jg", "jh", "kg"] expectedOutputs = ["^abc/ab$", "^ab/ab$", "^y/y$", "^n/n$", "^jg/j+g$", "^jh/j+h$", "^kg/k+g$"] -class EmptyDixOk(ProcTest): +class EmptyDixOk(unittest.TestCase, ProcTest): procdix = "data/entirely-empty.dix" inputs = ["abc"] expectedOutputs = ["^abc/*abc$"] -class CompEmptyLhsShouldError(ProcTest): +class CompEmptyLhsShouldError(unittest.TestCase, ProcTest): procdix = "data/lhs-empty-mono.dix" expectedCompRetCodeFail = True -class CompEmptyRhsShouldError(ProcTest): +class CompEmptyRhsShouldError(unittest.TestCase, ProcTest): procdir = "rl" procdix = "data/rhs-empty-mono.dix" expectedCompRetCodeFail = True -class CompLhsInitialSpaceShouldError(ProcTest): +class CompLhsInitialSpaceShouldError(unittest.TestCase, ProcTest): procdix = "data/lhs-ws-mono.dix" expectedCompRetCodeFail = True -class CompRhsInitialSpaceShouldError(ProcTest): +class CompRhsInitialSpaceShouldError(unittest.TestCase, ProcTest): procdix = "data/rhs-ws-mono.dix" procdir = "rl" expectedCompRetCodeFail = True -class CompAttEpsilonLoopShouldError(ProcTest): +class CompAttEpsilonLoopShouldError(unittest.TestCase, ProcTest): procdix = "data/cat-epsilon-loop.att" expectedCompRetCodeFail = True -class 
CompAttEpsilonToFinalShouldError(ProcTest):
+class CompAttEpsilonToFinalShouldError(unittest.TestCase, ProcTest):
     procdix = "data/cat-epsilon-to-final.att"
     expectedCompRetCodeFail = True
-class CompSplitMultichar(ProcTest):
+class CompSplitMultichar(unittest.TestCase, ProcTest):
     procdix = "data/multichar.att"
     inputs = ["א"]
     expectedOutputs = ["^א/אַן$"]
+
+class CompLSX(unittest.TestCase, PrintTest):
+    printdix = "data/basic.lsx"
+    expectedOutput = '''0 1 0.000000\t
+1 1 0.000000\t
+1 2 0.000000\t
+2 3 0.000000\t
+3 3 0.000000\t
+3 4 <$> 0.000000\t
+4 5 p <$> 0.000000\t
+5 6 r ε 0.000000\t
+6 7 p ε 0.000000\t
+7 8 e ε 0.000000\t
+8 9 r ε 0.000000\t
+9 10 s ε 0.000000\t
+10 11 ε 0.000000\t
+11 12 <$> ε 0.000000\t
+12 14 ε ε 0.000000\t
+12 13 <$> <$> 0.000000\t
+13 14 ε ε 0.000000\t
+14 0.000000
+'''
+
+
+class VariantNoTest(unittest.TestCase, ProcTest):
+    procdix = 'data/variants.dix'
+    procdir = 'lr'
+    compflags = []
+    inputs = ['y']
+    expectedOutputs = ['^y/*y$']
+
+
+class VariantHoTest(unittest.TestCase, ProcTest):
+    procdix = 'data/variants.dix'
+    procdir = 'lr'
+    # NOTE(review): the flags are handed to lt-comp as an argument list, not
+    # through a shell, so the double quotes around ho arrive literally --
+    # confirm lt-comp is meant to receive them.
+    compflags = ['--var-right="ho"']
+    inputs = ['y']
+    expectedOutputs = ['^y/y$']
+
+
+class RestrictTest(unittest.TestCase, ProcTest):
+    procdix = 'data/variants.dix'
+    procdir = 'lr'
+    restrictflags = []
+    inputs = ['abc', 'ab']
+    expectedOutputs = ['^abc/ab$', '^ab/*ab$']
+
+    def compileTest(self, tmpd):
+        """Compile procdix with direction argument 'u', then run lt-restrict.
+
+        lt-restrict is given procdir, the intermediate uni.bin and the
+        output path compiled.bin, preceded by restrictflags. compileDix()
+        and callProc() assert on the tools' exit status themselves, so
+        reaching the end means both exited 0.
+
+        Returns True because ProcTest.runTest() skips the input/output
+        comparison whenever compileTest() returns a falsy value.
+        """
+        self.compileDix('u', self.procdix, binName=tmpd+'/uni.bin')
+        self.callProc('lt-restrict',
+                      [self.procdir, tmpd+'/uni.bin', tmpd+'/compiled.bin'],
+                      self.restrictflags)
+        return True
+
+class RestrictRL1(RestrictTest):
+    procdir = 'rl'
+    restrictflags = ['-v', 'gascon']
+    inputs = ['abc', 'ab']
+    expectedOutputs = ['^abc/*abc$', '^ab/ab$']
+
+class RestrictRL2(RestrictTest):
+    procdir = 'rl'
+    restrictflags = ['-v', 'oci']
+    inputs = ['abc', 'ab']
+    expectedOutputs = ['^abc/*abc$', '^ab/abbb$']
diff -Nru lttoolbox-3.6.6/tests/lt_compose/__init__.py
lttoolbox-3.7.1/tests/lt_compose/__init__.py --- lttoolbox-3.6.6/tests/lt_compose/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/lt_compose/__init__.py 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- + +from basictest import ProcTest +import unittest + + +class ComposeProcTest(unittest.TestCase, ProcTest): + monodix = "data/compose1.dix" + monodir = "lr" + bidix = "data/pp2p.dix" + bidir = "lr" + procflags = ["-z"] + composeflags = ["--inverted", "--anywhere"] + + def compileTest(self, tmpd): + self.compileDix(self.monodir, self.monodix, binName=tmpd+'/f.bin') + self.compileDix(self.bidir, self.bidix, binName=tmpd+'/g.bin') + self.callProc('lt-compose', + self.composeflags + [tmpd+"/f.bin", + tmpd+"/g.bin", + tmpd+"/compiled.bin"]) + # The above already asserts retcode, so if we got this far we know it + # compiled fine: + return True + + +class ComposeSimpleCompound(ComposeProcTest): + procflags = ["-e", "-z"] + inputs = ["oppy", "appy", + "py", + "opp", "app"] + expectedOutputs = ["^oppy/opp+py$", "^appy/app+py$", + "^py/py$", + "^opp/*opp$", "^app/*app$"] + + +class ComposeNotEverywhere(ComposeProcTest): + procflags = ["-e", "-z"] + inputs = ["upp", "up", "uppy"] + expectedOutputs = ["^upp/upp$", + "^up/*up$", + "^uppy/upp+py$"] + + +class ComposeAnchored(ComposeProcTest): + composeflags = ["--inverted"] + bidix = "data/upp2up.dix" + procflags = ["-e", "-z"] + inputs = ["upp", "up", + "tuppy", "tupp", + "uppy", "upppy", "py", + "opp", "oppy", + "app", "appy"] + expectedOutputs = ["^upp/*upp$", "^up/*up$", + "^tuppy/*tuppy$", "^tupp/*tupp$", + "^uppy/upp+py$", "^upppy/upp+py$", "^py/py$", + "^opp/*opp$", "^oppy/*oppy$", + "^app/*app$", "^appy/*appy$" + ] diff -Nru lttoolbox-3.6.6/tests/lt_expand/__init__.py lttoolbox-3.7.1/tests/lt_expand/__init__.py --- lttoolbox-3.6.6/tests/lt_expand/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/lt_expand/__init__.py 2022-11-01 
08:36:47.000000000 +0000 @@ -0,0 +1,30 @@ +import unittest +from basictest import BasicTest + +class ExpandTest(unittest.TestCase, BasicTest): + expanddix = 'data/minimal-mono.dix' + expanddir = 'lr' + expectedOutput = '''abc:ab +ab:ab +y:y +n:n +jg:j+g +jh:j+h +kg:k+g +''' + expandflags = [] + + def runTest(self): + pp = self.openPipe('lt-expand', self.expandflags + [self.expanddix]) + self.assertEqual(self.communicateFlush(None, pp), + self.expectedOutput) + self.closePipe(pp, False) + +class ExpandRegex(ExpandTest): + expanddix = 'data/expand-re.dix' + expectedOutput = '''abc:ab +ab:ab +y:y +n:n +__REGEXP__xyz\\:abc[qxj]\\+:__REGEXP__xyz\\:abc[qxj]\\+ +''' diff -Nru lttoolbox-3.6.6/tests/lt_paradigm/__init__.py lttoolbox-3.7.1/tests/lt_paradigm/__init__.py --- lttoolbox-3.6.6/tests/lt_paradigm/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/tests/lt_paradigm/__init__.py 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,39 @@ +from basictest import ProcTest +import unittest + +class ParadigmTest(unittest.TestCase, ProcTest): + inputs = ['ab<*>', 'y<*>', '*'] + expectedOutputs = ['ab:abc\nab:ab', + 'y:y', + 'ab:abc'] + procdix = 'data/minimal-mono.dix' + procdir = 'rl' + sortoutput = True + + def runTestFlush(self, tmpd): + proc = self.openPipe('lt-paradigm', + self.procflags+[tmpd+'/compiled.bin']) + self.assertEqual(len(self.inputs), len(self.expectedOutputs)) + for inp, exp in zip(self.inputs, self.expectedOutputs): + out = self.communicateFlush(inp + '\n', proc).strip() + if self.sortoutput: + srt = '\n'.join(sorted(out.splitlines())) + self.assertEqual(exp, srt) + else: + self.assertEqual(exp, out) + self.closePipe(proc, expectFail=self.expectedRetCodeFail) + +class ParadigmAnalyzerTest(ParadigmTest): + procdir = 'lr' + procflags = ['-a'] + +class ExcludeTest(ParadigmTest): + procflags = ['-e', ''] + inputs = ['*<*>'] + expectedOutputs = ['ab:abc'] + +class SortTest(ParadigmTest): + procflags = ['-s'] + inputs = ['*<*>'] + expectedOutputs = 
['ab:abc\nab:ab\nn:n\ny:y'] + sortoutput = False diff -Nru lttoolbox-3.6.6/tests/lt_print/__init__.py lttoolbox-3.7.1/tests/lt_print/__init__.py --- lttoolbox-3.6.6/tests/lt_print/__init__.py 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/tests/lt_print/__init__.py 2022-11-01 08:36:47.000000000 +0000 @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - import unittest -from printtest import PrintTest +from basictest import PrintTest class NonWeightedFst(unittest.TestCase, PrintTest): diff -Nru lttoolbox-3.6.6/tests/lt_proc/__init__.py lttoolbox-3.7.1/tests/lt_proc/__init__.py --- lttoolbox-3.6.6/tests/lt_proc/__init__.py 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/tests/lt_proc/__init__.py 2022-11-01 08:36:47.000000000 +0000 @@ -1,10 +1,9 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from proctest import ProcTest - -from typing import List +from basictest import ProcTest as _ProcTest +import unittest +class ProcTest(unittest.TestCase, _ProcTest): + pass class ValidInput(ProcTest): inputs = ["ab", @@ -155,53 +154,55 @@ class PostgenerationWordboundBlankTest(ProcTest): procdix = "data/postgen.dix" procflags = ["-p", "-z"] - inputs = [ "xyz ejemplo [[t:i:123456]]~o[[/]] [[t:b:abc123; t:i:123456]]ho[[/]] [[t:b:iopmnb]]nombre[[/]].", - "xyz ejemplo [[t:b:poim230]]~o[[/]] ho [[t:i:mnbj203]]nombre[[/]].", - "xyz ejemplo ~o [[t:b:abc123; t:i:123456]]ho[[/]] [[t:b:iopmnb]]nombre[[/]].", - "xyz ejemplo ~o [[t:b:abc123; t:i:123456]]ho[[/]] ~le la [[t:b:iopmnb]]nombre[[/]].", - "xyz ejemplo [[t:i:1235gb]]~o[[/]] [[t:b:abc123; t:i:123456]]ho[[/]] [[t:b:i4x56fb]]~le[[/]] la nombre.", - "xyz [[t:i:123456]]~le[[/]] [[t:b:123gfv]]la[[/]] pelota.", - "xyz ~le [[t:b:123gfv]]la[[/]] pelota.", - "xyz ejemplo ~o [[t:b:abc123; t:i:123456]]ho[[/]] ~le [[t:b:io1245b]]la[[/]] [[t:b:iopmnb]]nombre[[/]].", - "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] 
[[t:i:wSM6RQ]]amigo[[/]][]", - "[[t:b:h5lVhA]]El[[/]] [[t:b:Z9eiLA; t:i:4_tPUA]]perro[[/]] [[t:b:Z9eiLA; t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:npAFwg]]amigo[[/]][]", - "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:i:4_tPUA]]~de[[/]] el [[t:i:wSM6RQ]]amigo[[/]][]", - "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] ~de [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", - "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] ~de el [[t:i:wSM6RQ]]amigo[[/]][]", - "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:b:abc123; t:i:123456]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:abc123; t:i:123456]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", - "[[t:text:NaNaNa]]pla~ss[[/]]", - "[[t:text:NaNaNa]]pla~sss[[/]]", - "[[t:text:NaNaNa]]pla~ssar[[/]]", - "[[t:text:NaNaNa]]pla~sssar[[/]]"] - - expectedOutputs = [ "xyz ejemplo [[t:i:123456; t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:iopmnb]]nombre[[/]].", - "xyz ejemplo [[t:b:poim230]]u ho[[/]] [[t:i:mnbj203]]nombre[[/]].", - "xyz ejemplo [[t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:iopmnb]]nombre[[/]].", - "xyz ejemplo [[t:b:abc123; t:i:123456]]u ho[[/]] se la [[t:b:iopmnb]]nombre[[/]].", - "xyz ejemplo [[t:i:1235gb; t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:i4x56fb]]se la[[/]] nombre.", - "xyz [[t:i:123456; t:b:123gfv]]se la[[/]] pelota.", - "xyz [[t:b:123gfv]]se la[[/]] pelota.", - "xyz ejemplo [[t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:io1245b]]se la[[/]] [[t:b:iopmnb]]nombre[[/]].", - "[[t:b:Z9eiLA]]El[[/]] 
[[t:s:8AjRFw]]perro[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", - "[[t:b:h5lVhA]]El[[/]] [[t:b:Z9eiLA; t:i:4_tPUA]]perro[[/]] [[t:b:Z9eiLA; t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:npAFwg]]amigo[[/]][]", - "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:i:4_tPUA]]del[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", - "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", - "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] del [[t:i:wSM6RQ]]amigo[[/]][]", - "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]le pe test[[/]] [[t:b:abc123; t:i:123456]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] le pe test [[t:b:abc123; t:i:123456]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]le pe test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456; t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] [[t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", - "[[t:text:NaNaNa]]plass[[/]]", - "[[t:text:NaNaNa]]plass[[/]]", - "[[t:text:NaNaNa]]plassar[[/]]", - "[[t:text:NaNaNa]]plassar[[/]]"] + inputs = [ + "[l1] xyz ejemplo [[t:i:123456]]~o[[/]] [[t:b:abc123; t:i:123456]]ho[[/]] [[t:b:iopmnb]]nombre[[/]].", + "[l2] xyz ejemplo [[t:b:poim230]]~o[[/]] ho [[t:i:mnbj203]]nombre[[/]].", + "[l3] xyz ejemplo ~o [[t:b:abc123; t:i:123456]]ho[[/]] [[t:b:iopmnb]]nombre[[/]].", + "[l4] xyz ejemplo ~o [[t:b:abc123; t:i:123456]]ho[[/]] ~le la [[t:b:iopmnb]]nombre[[/]].", + "[l5] xyz ejemplo [[t:i:1235gb]]~o[[/]] [[t:b:abc123; t:i:123456]]ho[[/]] [[t:b:i4x56fb]]~le[[/]] la nombre.", + "[l6] xyz [[t:i:123456]]~le[[/]] [[t:b:123gfv]]la[[/]] pelota.", + "[l7] xyz ~le [[t:b:123gfv]]la[[/]] pelota.", + "[l8] xyz ejemplo ~o [[t:b:abc123; t:i:123456]]ho[[/]] ~le 
[[t:b:io1245b]]la[[/]] [[t:b:iopmnb]]nombre[[/]].", + "[l9] [[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", + "[l10] [[t:b:h5lVhA]]El[[/]] [[t:b:Z9eiLA; t:i:4_tPUA]]perro[[/]] [[t:b:Z9eiLA; t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:npAFwg]]amigo[[/]][]", + "[l11] [[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:i:4_tPUA]]~de[[/]] el [[t:i:wSM6RQ]]amigo[[/]][]", + "[l12] [[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] ~de [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", + "[l13] [[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] ~de el [[t:i:wSM6RQ]]amigo[[/]][]", + "[l14] [[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:b:abc123; t:i:123456]]testword[[/]]", + "[l15] [[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:abc123; t:i:123456]]testword[[/]]", + "[l16] [[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[l17] [[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[l18] [[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[l19] [[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[l20] [[t:text:NaNaNa]]pla~ss[[/]]", + "[l21] [[t:text:NaNaNa]]pla~sss[[/]]", + "[l22] [[t:text:NaNaNa]]pla~ssar[[/]]", + "[l23] [[t:text:NaNaNa]]pla~sssar[[/]]"] + + expectedOutputs = [ + "[l1] xyz ejemplo [[t:i:123456; t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:iopmnb]]nombre[[/]].", + "[l2] xyz ejemplo [[t:b:poim230]]u ho[[/]] [[t:i:mnbj203]]nombre[[/]].", + "[l3] xyz ejemplo [[t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:iopmnb]]nombre[[/]].", + "[l4] xyz ejemplo [[t:b:abc123; t:i:123456]]u ho[[/]] se la [[t:b:iopmnb]]nombre[[/]].", + "[l5] xyz ejemplo [[t:i:1235gb; t:b:abc123; t:i:123456]]u ho[[/]] 
[[t:b:i4x56fb]]se la[[/]] nombre.", + "[l6] xyz [[t:i:123456; t:b:123gfv]]se la[[/]] pelota.", + "[l7] xyz [[t:b:123gfv]]se la[[/]] pelota.", + "[l8] xyz ejemplo [[t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:io1245b]]se la[[/]] [[t:b:iopmnb]]nombre[[/]].", + "[l9] [[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", + "[l10] [[t:b:h5lVhA]]El[[/]] [[t:b:Z9eiLA; t:i:4_tPUA]]perro[[/]] [[t:b:Z9eiLA; t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:npAFwg]]amigo[[/]][]", + "[l11] [[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:i:4_tPUA]]del[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", + "[l12] [[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", + "[l13] [[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] del [[t:i:wSM6RQ]]amigo[[/]][]", + "[l14] [[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]le pe test[[/]] [[t:b:abc123; t:i:123456]]testword[[/]]", + "[l15] [[t:b:Z9eiLA]]abc[[/]] le pe test [[t:b:abc123; t:i:123456]]testword[[/]]", + "[l16] [[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]le pe test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[l17] [[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[l18] [[t:b:Z9eiLA]]abc[[/]] [[t:i:123456; t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[l19] [[t:b:Z9eiLA]]abc[[/]] [[t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[l20] [[t:text:NaNaNa]]plass[[/]]", + "[l21] [[t:text:NaNaNa]]plass[[/]]", + "[l22] [[t:text:NaNaNa]]plassar[[/]]", + "[l23] [[t:text:NaNaNa]]plassar[[/]]"] class PostgenerationWordboundBlankEscapingTest(ProcTest): @@ -224,7 +225,7 @@ procdix = "data/space-eof-incond.dix" inputs = ['. 
'] expectedOutputs = ['^./.$ '] - procflags = [] # type: List[str] + procflags = [] flushing = False @@ -327,5 +328,117 @@ procflags = ['-d', '-b', '-z'] procdir = "rl" +class WordboundBlankNoNestingPostgenTest(ProcTest): + procdix = "data/postgen.dix" + procflags = ["-p", "-z"] + inputs = [ + "[[t:text:SyTAKg]]xyz~le[[/]][[t:text:SyTAKg]]pqr[[/]]", + "[[t:text:SyTAKg]]xyz~les[[/]][[t:text:SyTAKg]]pqr[[/]]", + ] + expectedOutputs = [ + "[[t:text:SyTAKg]]xyzle[[/]][[t:text:SyTAKg]]pqr[[/]]", + "[[t:text:SyTAKg]]xyzle pe test[[/]][[t:text:SyTAKg]]pqr[[/]]", + ] + +class PostgenShort(ProcTest): + # test for https://github.com/apertium/lttoolbox/issues/123 + procdix = "data/postgen-short.dix" + inputs = ["~e aga", "~E aga"] + expectedOutputs = ["aga", "Aga"] + procflags = ['-p', '-z'] + +class PostgenBacktrack(ProcTest): + procdix = "data/postgen-overlap.dix" + inputs = ["abc"] + expectedOutputs = ["xyz"] + procflags = ['-p', '-z'] + +class PostgenOciBacktrack(ProcTest): + # data from https://github.com/apertium/lttoolbox/issues/123#issuecomment-1152352856 + procdix = "data/oci-pgen.dix" + inputs = ['[1] ~detlo lièch', + '[2] ~Detlo lièch.', + '[3] Cap ~a ~detlo lièch.', + '[4] Ostal ~de ~detlo lièch.', + '[5] ~detlo amic, ~Detlo amic.', + '[6] Cap ~a ~detlo amic.', + '[7] Ostal ~de ~detlo amic.', + '[8] ~detla fin, ~Detla fin.', + '[9] Cap ~a ~detla fin.', + '[10] Ostal ~de ~detla fin.', + '[11] ~detla amiga, ~Detla amiga.', + '[12] Cap ~a ~detla amiga.', + '[13] Ostal ~de ~detla amiga.', + '[14] ~detlos lièchs, ~Detlos lièchs.', + '[15] Cap ~a ~detlos lièchs.', + '[16] Ostal ~de ~detlos lièchs.', + '[17] ~detlos amics, ~Detlos amics.', + '[18] Cap ~a ~detlos amics.', + '[19] Ostal ~de ~detlos amics.', + '[20] ~detlas fins, ~Detlas fins.', + '[21] Cap ~a ~detlas fins.', + '[22] Ostal ~de ~detlas fins.', + '[23] ~detlas amigas, ~Detlas amigas.', + '[24] Cap ~a ~detlas amigas.', + '[25] Ostal ~de ~detlas amigas.'] + expectedOutputs = ["[1] lo lièch", + "[2] Lo lièch.", 
+ "[3] Cap al lièch.", + "[4] Ostal del lièch.", + "[5] l'amic, L'amic.", + "[6] Cap a l'amic.", + "[7] Ostal de l'amic.", + "[8] la fin, La fin.", + "[9] Cap a la fin.", + "[10] Ostal de la fin.", + "[11] l'amiga, L'amiga.", + "[12] Cap a l'amiga.", + "[13] Ostal de l'amiga.", + "[14] los lièchs, Los lièchs.", + "[15] Cap als lièchs.", + "[16] Ostal dels lièchs.", + "[17] los amics, Los amics.", + "[18] Cap als amics.", + "[19] Ostal dels amics.", + "[20] las fins, Las fins.", + "[21] Cap a las fins.", + "[22] Ostal de las fins.", + "[23] las amigas, Las amigas.", + "[24] Cap a las amigas.", + "[25] Ostal de las amigas."] + procflags = ['-p', '-z'] + +class PostgenRetainCaps(ProcTest): + procdix = "data/oci-pgen.dix" + procflags = ['-p', '-z'] + inputs = ['[01] ~detlo ostal', + '[02] ~detlo Ostal'] + expectedOutputs = ["[01] l'ostal", + "[02] l'Ostal"] + +class BufferIndex(ProcTest): + procdix = "data/underscore.dix" + inputs = ["_a", + "_n", + "_𐐔", + "x|x", + "a­b", + ] + expectedOutputs = ["_^a/*a$", + "^_n/_n$", + "_^𐐔/*𐐔$", + "^x/*x$|^x/*x$", + "^ab/*ab$", + ] + + +class Bigen(ProcTest): + """Test that we can run -b with -g before, and -b should override it.""" + procdix = "data/minimal-mono.dix" + procflags = ['-g', '-b', '-z'] + procdir = "rl" + inputs = ["^ab$"] + expectedOutputs = ["^ab/abc$"] + # These fail on some systems: #from null_flush_invalid_stream_format import * diff -Nru lttoolbox-3.6.6/tests/lt_trim/__init__.py lttoolbox-3.7.1/tests/lt_trim/__init__.py --- lttoolbox-3.6.6/tests/lt_trim/__init__.py 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/tests/lt_trim/__init__.py 2022-11-01 08:36:47.000000000 +0000 @@ -1,6 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - # If you have HFST installed, you can diff lttoolbox binaries like this: # $ lt-print -H full.bin | hfst-txt2fst | hfst-fst2strings -c1 > full.strings # $ lt-print -H trim.bin | hfst-txt2fst | hfst-fst2strings -c1 > trim.strings @@ -8,15 +6,11 @@ # This 
is similar to diffing the lt-expand of uncompiled XML dictionaries. # See also `man hfst-fst2strings'. -import os -from proctest import ProcTest +from basictest import ProcTest, TempDir import unittest -from subprocess import Popen, PIPE, call -from tempfile import mkdtemp -from shutil import rmtree -class TrimProcTest(ProcTest): +class TrimProcTest(unittest.TestCase, ProcTest): monodix = "data/minimal-mono.dix" monodir = "lr" bidix = "data/minimal-bi.dix" @@ -24,21 +18,14 @@ procflags = ["-z"] def compileTest(self, tmpd): - self.assertEqual(0, call([os.environ['LTTOOLBOX_PATH']+"/lt-comp", - self.monodir, - self.monodix, - tmpd+"/mono.bin"], - stdout=PIPE)) - self.assertEqual(0, call([os.environ['LTTOOLBOX_PATH']+"/lt-comp", - self.bidir, - self.bidix, - tmpd+"/bi.bin"], - stdout=PIPE)) - self.assertEqual(0, call([os.environ['LTTOOLBOX_PATH']+"/lt-trim", - tmpd+"/mono.bin", + self.compileDix(self.monodir, self.monodix, binName=tmpd+'/mono.bin') + self.compileDix(self.bidir, self.bidix, binName=tmpd+'/bi.bin') + self.callProc('lt-trim', [tmpd+"/mono.bin", tmpd+"/bi.bin", - tmpd+"/compiled.bin"], - stdout=PIPE)) + tmpd+"/compiled.bin"]) + # The above already asserts retcode, so if we got this far we know it + # compiled fine: + return True class TrimNormalAndJoin(TrimProcTest): @@ -110,7 +97,6 @@ monodix = "data/group-mono.dix" bidix = "data/group-bi.dix" - class FinalEpsilons(TrimProcTest): inputs = ["ea"] expectedOutputs = ["^ea/e#a$"] @@ -138,7 +124,6 @@ bidix = "data/double-clitics-bi.dix" bidir = "lr" - class GroupAfterJoin(TrimProcTest): "https://sourceforge.net/p/apertium/tickets/117/" inputs = ["notG a"] @@ -147,27 +132,23 @@ bidix = "data/group-after-join-bi.dix" bidir = "lr" - class Empty(TrimProcTest): def runTest(self): - tmpd = mkdtemp() - try: - self.assertEqual(0, call([os.environ['LTTOOLBOX_PATH']+"/lt-comp", - "lr", - "data/empty-mono.dix", - tmpd+"/empty-mono.bin"], - stdout=PIPE)) - self.assertEqual(0, 
call([os.environ['LTTOOLBOX_PATH']+"/lt-comp", - "rl", # rl! - "data/empty-bi.dix", - tmpd+"/empty-bi.bin"], - stdout=PIPE)) - self.assertEqual(1, call([os.environ['LTTOOLBOX_PATH']+"/lt-trim", - tmpd+"/empty-mono.bin", + with TempDir() as tmpd: + self.compileDix('lr', 'data/empty-mono.dix', + binName=tmpd+'/empty-mono.bin') + self.compileDix('rl', 'data/empty-bi.dix', + binName=tmpd+'/empty-bi.bin') + self.callProc('lt-trim', [tmpd+"/empty-mono.bin", tmpd+"/empty-bi.bin", tmpd+"/empty-trimmed.bin"], - stdout=PIPE, - stderr=PIPE)) + expectFail=True) - finally: - rmtree(tmpd) +class PlusLemma(TrimProcTest): + monodix = 'data/plus-lemma-mono.dix' + bidix = 'data/plus-lemma-bi.dix' + bidir = 'lr' + inputs = ['abc', 'I+D', 'jg'] + expectedOutputs = ['^abc/ab$', + '^I+D/I+D$', + '^jg/j+g$'] diff -Nru lttoolbox-3.6.6/tests/printtest.py lttoolbox-3.7.1/tests/printtest.py --- lttoolbox-3.6.6/tests/printtest.py 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/tests/printtest.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- - -import os -from subprocess import Popen, PIPE, call -from tempfile import mkdtemp -from shutil import rmtree -from basictest import BasicTest - - -class PrintTest(BasicTest): - """See lt_print test for how to use this. 
Override runTest if you don't - want to use NUL flushing.""" - - printdix = "data/minimal-mono.dix" - printdir = "lr" - expectedOutput = "" - expectedRetCodeFail = False - printflags = [] - - def compileTest(self, tmpd): - self.assertEqual(0, call([os.environ['LTTOOLBOX_PATH']+"/lt-comp", - self.printdir, - self.printdix, - tmpd+"/compiled.bin"], - stdout=PIPE)) - - def runTest(self): - tmpd = mkdtemp() - try: - self.compileTest(tmpd) - self.printresult = Popen([os.environ['LTTOOLBOX_PATH']+"/lt-print"] + self.printflags + [tmpd+"/compiled.bin"], - stdout=PIPE, - stderr=PIPE) - - self.assertEqual(self.communicateFlush(None, self.printresult), self.expectedOutput) - - self.printresult.communicate() # let it terminate - self.printresult.stdout.close() - self.printresult.stderr.close() - retCode = self.printresult.poll() - if self.expectedRetCodeFail: - self.assertNotEqual(retCode, 0) - else: - self.assertEqual(retCode, 0) - - finally: - rmtree(tmpd) diff -Nru lttoolbox-3.6.6/tests/proctest.py lttoolbox-3.7.1/tests/proctest.py --- lttoolbox-3.6.6/tests/proctest.py 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/tests/proctest.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,84 +0,0 @@ -# -*- coding: utf-8 -*- - -import os -from subprocess import Popen, PIPE, call -from tempfile import mkdtemp -from shutil import rmtree -from basictest import BasicTest -import unittest - -from typing import List - - -class ProcTest(unittest.TestCase, BasicTest): - """See lt_proc test for how to use this. 
Override runTest if you don't - want to use NUL flushing.""" - - procdix = "data/minimal-mono.dix" - procdir = "lr" - procflags = ["-z"] - inputs = [""] - expectedOutputs = [""] - expectedRetCodeFail = False - expectedCompRetCodeFail = False - flushing = True - - def compileTest(self, tmpd): - retCode = call([os.environ['LTTOOLBOX_PATH']+"/lt-comp", - self.procdir, - self.procdix, - tmpd+"/compiled.bin"], - stdout=PIPE, stderr=PIPE) - if self.expectedCompRetCodeFail: - self.assertNotEqual(retCode, 0) - else: - self.assertEqual(retCode, 0) - return retCode == 0 - - def runTest(self): - tmpd = mkdtemp() - try: - if not self.compileTest(tmpd): - return - if self.flushing: - self.runTestFlush(tmpd) - else: - self.runTestNoFlush(tmpd) - finally: - rmtree(tmpd) - - def openProc(self, tmpd): - return Popen([os.environ['LTTOOLBOX_PATH']+"/lt-proc"] - + self.procflags - + [tmpd+"/compiled.bin"], - stdin=PIPE, - stdout=PIPE, - stderr=PIPE) - - def runTestFlush(self, tmpd): - proc = self.openProc(tmpd) - self.assertEqual(len(self.inputs), - len(self.expectedOutputs)) - for inp, exp in zip(self.inputs, self.expectedOutputs): - self.assertEqual(self.communicateFlush(inp+"[][\n]", proc), - exp+"[][\n]") - proc.communicate() # let it terminate - proc.stdin.close() - proc.stdout.close() - proc.stderr.close() - retCode = proc.poll() - if self.expectedRetCodeFail: - self.assertNotEqual(retCode, 0) - else: - self.assertEqual(retCode, 0) - - def runTestNoFlush(self, tmpd): - for inp, exp in zip(self.inputs, self.expectedOutputs): - proc = self.openProc(tmpd) - self.assertEqual(proc.communicate(input=inp.encode('utf-8'))[0], - exp.encode('utf-8')) - retCode = proc.poll() - if self.expectedRetCodeFail: - self.assertNotEqual(retCode, 0) - else: - self.assertEqual(retCode, 0) diff -Nru lttoolbox-3.6.6/tests/run_tests.py lttoolbox-3.7.1/tests/run_tests.py --- lttoolbox-3.6.6/tests/run_tests.py 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/tests/run_tests.py 2022-11-01 
08:36:47.000000000 +0000 @@ -3,23 +3,20 @@ import sys import os sys.path.append(os.path.realpath(".")) - import unittest -import lt_proc -import lt_trim -import lt_print -import lt_comp -import lt_append os.environ['LTTOOLBOX_PATH'] = '../lttoolbox' if len(sys.argv) > 1: - os.environ['LTTOOLBOX_PATH'] = sys.argv[1] + os.environ['LTTOOLBOX_PATH'] = sys.argv[1] + +modules = ['lt_proc', 'lt_trim', 'lt_print', 'lt_comp', 'lt_append', + 'lt_paradigm', 'lt_expand', 'lt_apply_acx', 'lt_compose'] if __name__ == "__main__": os.chdir(os.path.dirname(__file__)) failures = 0 - for module in [lt_trim, lt_proc, lt_print, lt_comp, lt_append]: - suite = unittest.TestLoader().loadTestsFromModule(module) + for module in modules: + suite = unittest.TestLoader().loadTestsFromName(module) res = unittest.TextTestRunner(verbosity = 2).run(suite) failures += len(res.failures) sys.exit(min(failures, 255))