? + */ + bool entry_debugging = false; + /** * Identifier of all the symbols during the compilation @@ -138,42 +141,41 @@ /** * List of named transducers-paradigms */ - map paradigms; + std::map> paradigms; /** * List of named dictionary sections */ - map sections; + std::map sections; /** * List of named prefix copy of a paradigm */ - map > prefix_paradigms; + std::map > prefix_paradigms; /** * List of named suffix copy of a paradigm */ - map > suffix_paradigms; + std::map, std::less<>> suffix_paradigms; /** * List of named endings of a suffix copy of a paradgim */ - map > postsuffix_paradigms; + std::map > postsuffix_paradigms; /** * Mapping of aliases of characters specified in ACX files */ - map > acx_map; + std::map > acx_map; /** - * Original char being mapped + * LSX symbols */ - int acx_current_char = 0; - - /* - static string range(char const a, char const b); - string readAlphabet(); - */ + int32_t any_tag = 0; + int32_t any_char = 0; + int32_t word_boundary = 0; + int32_t word_boundary_s = 0; + int32_t word_boundary_ns = 0; /** * Method to parse an XML Node @@ -181,12 +183,6 @@ void procNode(); /** - * Method to parse an XML Node in ACX files - */ - void procNodeACX(); - - - /** * Parse the <alphabet> element */ void procAlphabet(); @@ -207,6 +203,13 @@ void procEntry(); /** + * Return true if the filter (command line) is consistent with + * the value (attribute) and false otherwise + */ + bool filterEntry(UStringView value, UStringView filter, bool keep_on_empty_filter); + void symbolFilters(UStringView value, UStringView prefix, std::vector>& symbols); + + /** * Parse the <re> element * @return a list of tokens from the dictionary's entry */ @@ -222,7 +225,7 @@ * @param name the name of the attribute * @return the value of the attribute */ - UString attrib(UString const &name); + UString attrib(UStringView name); /** * Construct symbol pairs by align left side of both parts and insert @@ -233,7 +236,7 @@ * @param t the transducer * @return the last 
state of the inserted transduction */ - int matchTransduction(vector const &lp, vector const &rp, + int matchTransduction(std::vector const &lp, std::vector const &rp, int state, Transducer &t, double const &entry_weight); /** * Parse the element @@ -257,14 +260,7 @@ * Insert a list of tokens into the paradigm / section being processed * @param elements the list */ - void insertEntryTokens(vector const &elements); - - /** - * Skip all document #text nodes before "elem" - * @param name the name of the node - * @param elem the name of the expected node - */ - void skip(UString &name, UString const &elem); + void insertEntryTokens(std::vector const &elements); /** * Skip all document #text nodes before "elem" @@ -272,7 +268,7 @@ * @param elem the name of the expected node * @param open true for open element, false for closed */ - void skip(UString &name, UString const &elem, bool open); + void skip(UString &name, UStringView elem, bool open = true); /** * Skip all blank #text nodes before "name" @@ -281,13 +277,13 @@ void skipBlanks(UString &name); - void readString(vector &result, UString const &name); + void readString(std::vector &result, UStringView name); /** * Force an element to be empty, and check for it * @param name the element */ - void requireEmptyError(UString const &name); + void requireEmptyError(UStringView name); /** * Force an attribute to be specified, amd check for it @@ -295,8 +291,7 @@ * @param attrname the name of the attribute * @param elemname the parent of the attribute */ - void requireAttribute(UString const &value, UString const &attrname, - UString const &elemname); + void requireAttribute(UStringView value, UStringView attrname, UStringView elemname); /** * True if all the elements in the current node are blanks @@ -304,7 +299,7 @@ */ bool allBlanks(); - bool valid(UString const& dir) const; + bool valid(UStringView dir) const; public: @@ -312,50 +307,56 @@ * Constants to represent the element and the attributes of * dictionaries */ - 
LTTOOLBOX_IMPORTS static UString const COMPILER_DICTIONARY_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_ALPHABET_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_SDEFS_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_SDEF_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_N_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_PARDEFS_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_PARDEF_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_PAR_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_ENTRY_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_LR_VAL; - LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_RL_VAL; - LTTOOLBOX_IMPORTS static UString const COMPILER_PAIR_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_LEFT_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_RIGHT_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_S_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_M_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_REGEXP_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_SECTION_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_ID_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_TYPE_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_IDENTITY_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_IDENTITYGROUP_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_JOIN_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_BLANK_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_POSTGENERATOR_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_GROUP_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_LEMMA_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_IGNORE_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_IGNORE_YES_VAL; - LTTOOLBOX_IMPORTS static UString const COMPILER_ALT_ATTR; - LTTOOLBOX_IMPORTS static UString const 
COMPILER_V_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_VL_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_VR_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_WEIGHT_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_TEXT_NODE; - LTTOOLBOX_IMPORTS static UString const COMPILER_COMMENT_NODE; - LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_ANALYSIS_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_CHAR_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_EQUIV_CHAR_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_VALUE_ATTR; - LTTOOLBOX_IMPORTS static UString const COMPILER_LSX_WB_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_LSX_CHAR_ELEM; - LTTOOLBOX_IMPORTS static UString const COMPILER_LSX_TAG_ELEM; + static constexpr UStringView COMPILER_DICTIONARY_ELEM = u"dictionary"; + static constexpr UStringView COMPILER_ALPHABET_ELEM = u"alphabet"; + static constexpr UStringView COMPILER_SDEFS_ELEM = u"sdefs"; + static constexpr UStringView COMPILER_SDEF_ELEM = u"sdef"; + static constexpr UStringView COMPILER_N_ATTR = u"n"; + static constexpr UStringView COMPILER_PARDEFS_ELEM = u"pardefs"; + static constexpr UStringView COMPILER_PARDEF_ELEM = u"pardef"; + static constexpr UStringView COMPILER_PAR_ELEM = u"par"; + static constexpr UStringView COMPILER_ENTRY_ELEM = u"e"; + static constexpr UStringView COMPILER_RESTRICTION_ATTR = u"r"; + static constexpr UStringView COMPILER_RESTRICTION_LR_VAL = u"LR"; + static constexpr UStringView COMPILER_RESTRICTION_RL_VAL = u"RL"; + static constexpr UStringView COMPILER_RESTRICTION_U_VAL = u"U"; + static constexpr UStringView COMPILER_PAIR_ELEM = u"p"; + static constexpr UStringView COMPILER_LEFT_ELEM = u"l"; + static constexpr UStringView COMPILER_RIGHT_ELEM = u"r"; + static constexpr UStringView COMPILER_S_ELEM = u"s"; + static constexpr UStringView COMPILER_M_ELEM = u"m"; + static constexpr UStringView COMPILER_REGEXP_ELEM = u"re"; + static constexpr 
UStringView COMPILER_SECTION_ELEM = u"section"; + static constexpr UStringView COMPILER_ID_ATTR = u"id"; + static constexpr UStringView COMPILER_TYPE_ATTR = u"type"; + static constexpr UStringView COMPILER_SEQUENTIAL_VAL = u"sequential"; + static constexpr UStringView COMPILER_SEPARABLE_VAL = u"separable"; + static constexpr UStringView COMPILER_IDENTITY_ELEM = u"i"; + static constexpr UStringView COMPILER_IDENTITYGROUP_ELEM = u"ig"; + static constexpr UStringView COMPILER_JOIN_ELEM = u"j"; + static constexpr UStringView COMPILER_BLANK_ELEM = u"b"; + static constexpr UStringView COMPILER_POSTGENERATOR_ELEM = u"a"; + static constexpr UStringView COMPILER_GROUP_ELEM = u"g"; + static constexpr UStringView COMPILER_LEMMA_ATTR = u"lm"; + static constexpr UStringView COMPILER_IGNORE_ATTR = u"i"; + static constexpr UStringView COMPILER_IGNORE_YES_VAL = u"yes"; + static constexpr UStringView COMPILER_ALT_ATTR = u"alt"; + static constexpr UStringView COMPILER_V_ATTR = u"v"; + static constexpr UStringView COMPILER_VL_ATTR = u"vl"; + static constexpr UStringView COMPILER_VR_ATTR = u"vr"; + static constexpr UStringView COMPILER_WEIGHT_ATTR = u"w"; + static constexpr UStringView COMPILER_TEXT_NODE = u"#text"; + static constexpr UStringView COMPILER_COMMENT_NODE = u"#comment"; + static constexpr UStringView COMPILER_ACX_ANALYSIS_ELEM = u"analysis-chars"; + static constexpr UStringView COMPILER_ACX_CHAR_ELEM = u"char"; + static constexpr UStringView COMPILER_ACX_EQUIV_CHAR_ELEM= u"equiv-char"; + static constexpr UStringView COMPILER_ACX_VALUE_ATTR = u"value"; + static constexpr UStringView COMPILER_LSX_WB_ELEM = u"d"; + static constexpr UStringView COMPILER_LSX_CHAR_ELEM = u"w"; + static constexpr UStringView COMPILER_LSX_TAG_ELEM = u"t"; + static constexpr UStringView COMPILER_LSX_SPACE_ATTR = u"space"; + static constexpr UStringView COMPILER_LSX_SPACE_YES_VAL = u"yes"; + static constexpr UStringView COMPILER_LSX_SPACE_NO_VAL = u"no"; /** * Constructor @@ -370,12 +371,12 @@ /** 
* Compile dictionary to letter transducers */ - void parse(string const &file, UString const &dir); + void parse(std::string const &file, UStringView dir); /** * Read ACX file */ - void parseACX(string const &file, UString const &dir); + void parseACX(std::string const &file, UStringView dir); /** @@ -408,25 +409,27 @@ * Set the alt value to use in compilation * @param a the value */ - void setAltValue(UString const &a); + void setAltValue(UStringView a); /** * Set the variant value to use in compilation * @param v the value */ - void setVariantValue(UString const &v); + void setVariantValue(UStringView v); /** * Set the variant_left value to use in compilation * @param v the value */ - void setVariantLeftValue(UString const &v); + void setVariantLeftValue(UStringView v); /** * Set the variant_right value to use in compilation * @param v the value */ - void setVariantRightValue(UString const &v); + void setVariantRightValue(UStringView v); + + void setEntryDebugging(bool b); }; diff -Nru lttoolbox-3.6.6/lttoolbox/compression.cc lttoolbox-3.7.1/lttoolbox/compression.cc --- lttoolbox-3.6.6/lttoolbox/compression.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/compression.cc 2022-11-01 08:36:47.000000000 +0000 @@ -28,7 +28,7 @@ { if(fwrite_unlocked(&byte, 1, 1, output) != 1) { - cerr << "I/O Error writing" << endl; + std::cerr << "I/O Error writing" << std::endl; exit(EXIT_FAILURE); } } @@ -40,7 +40,7 @@ if(fread_unlocked(&value, 1, 1, input) != 1) { // Not uncomment this code since -// cerr << "I/O Error reading" << endl; +// std::cerr << "I/O Error reading" << std::endl; // exit(EXIT_FAILURE); } @@ -87,13 +87,13 @@ } else { - cerr << "Out of range: " << value << endl; + std::cerr << "Out of range: " << value << std::endl; exit(EXIT_FAILURE); } } void -Compression::multibyte_write(unsigned int value, ostream &output) +Compression::multibyte_write(unsigned int value, std::ostream &output) { if(value < 0x00000040) { @@ -134,7 +134,7 @@ } else { - 
cerr << "Out of range: " << value << endl; + std::cerr << "Out of range: " << value << std::endl; exit(EXIT_FAILURE); } } @@ -194,7 +194,7 @@ } unsigned int -Compression::multibyte_read(istream &input) +Compression::multibyte_read(std::istream &input) { unsigned char up; unsigned int result = 0; @@ -255,9 +255,9 @@ void -Compression::string_write(UString const &str, FILE *output) +Compression::string_write(UStringView str, FILE *output) { - vector vec; + std::vector vec; ustring_to_vec32(str, vec); Compression::multibyte_write(vec.size(), output); for(auto c : vec) @@ -289,6 +289,16 @@ unsigned int mantissa = static_cast(static_cast(0x40000000 * frexp(value, &exp))); unsigned int exponent = static_cast(static_cast(exp)); + if (std::isinf(value)) { + mantissa = std::numeric_limits::max(); + if (value < 0) { + exponent = std::numeric_limits::max() - 1; + } + else { + exponent = std::numeric_limits::max(); + } + } + if(mantissa < 0x04000000) { multibyte_write(mantissa, output); @@ -319,13 +329,23 @@ } void -Compression::long_multibyte_write(const double& value, ostream &output) +Compression::long_multibyte_write(const double& value, std::ostream &output) { int exp = 0; unsigned int mantissa = static_cast(static_cast(0x40000000 * frexp(value, &exp))); unsigned int exponent = static_cast(static_cast(exp)); + if (std::isinf(value)) { + mantissa = std::numeric_limits::max(); + if (value < 0) { + exponent = std::numeric_limits::max() - 1; + } + else { + exponent = std::numeric_limits::max(); + } + } + if(mantissa < 0x04000000) { multibyte_write(mantissa, output); @@ -393,13 +413,23 @@ } double value = static_cast(static_cast(mantissa)) / 0x40000000; - result = ldexp(value, static_cast(exponent)); + if (mantissa == std::numeric_limits::max() && exponent >= std::numeric_limits::max() - 1) { + if (exponent == std::numeric_limits::max() - 1) { + result = -1.0*std::numeric_limits::infinity(); + } + else { + result = std::numeric_limits::infinity(); + } + } + else { + result = 
ldexp(value, static_cast(exponent)); + } return result; } double -Compression::long_multibyte_read(istream &input) +Compression::long_multibyte_read(std::istream &input) { double result = 0.0; @@ -436,7 +466,17 @@ } double value = static_cast(static_cast(mantissa)) / 0x40000000; - result = ldexp(value, static_cast(exponent)); + if (mantissa == std::numeric_limits::max() && exponent >= std::numeric_limits::max() - 1) { + if (exponent == std::numeric_limits::max() - 1) { + result = -1.0*std::numeric_limits::infinity(); + } + else { + result = std::numeric_limits::infinity(); + } + } + else { + result = ldexp(value, static_cast(exponent)); + } return result; } diff -Nru lttoolbox-3.6.6/lttoolbox/compression.h lttoolbox-3.7.1/lttoolbox/compression.h --- lttoolbox-3.6.6/lttoolbox/compression.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/compression.h 2022-11-01 08:36:47.000000000 +0000 @@ -24,8 +24,6 @@ #include #include -using namespace std; - // Global lttoolbox features constexpr char HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'}; enum LT_FEATURES : uint64_t { @@ -157,7 +155,7 @@ * @param value integer to write. * @param output output stream. */ - static void multibyte_write(unsigned int value, ostream &os); + static void multibyte_write(unsigned int value, std::ostream &os); /** * Read and decode an integer from the input stream. @@ -173,7 +171,7 @@ * @param input input stream. * @return the integer value readed. */ - static unsigned int multibyte_read(istream &is); + static unsigned int multibyte_read(std::istream &is); /** * This method allows to write a plain string to an output stream @@ -182,7 +180,7 @@ * @param str the string to write. * @param output the output stream. */ - static void string_write(UString const &str, FILE *output); + static void string_write(UStringView str, FILE *output); /** * This method reads a plain string from the input stream. @@ -206,7 +204,7 @@ * @param value double to write. * @param output output stream. 
*/ - static void long_multibyte_write(const double& value, ostream &os); + static void long_multibyte_write(const double& value, std::ostream &os); /** * Read and decode a double from the input stream. @@ -222,7 +220,7 @@ * @param input input stream. * @return the double value read. */ - static double long_multibyte_read(istream &is); + static double long_multibyte_read(std::istream &is); }; #endif diff -Nru lttoolbox-3.6.6/lttoolbox/deserialiser.h lttoolbox-3.7.1/lttoolbox/deserialiser.h --- lttoolbox-3.6.6/lttoolbox/deserialiser.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/deserialiser.h 2022-11-01 08:36:47.000000000 +0000 @@ -119,7 +119,7 @@ std::istream &Stream_) { first_type a = Deserialiser::type>::deserialise(Stream_); second_type b = Deserialiser::type>::deserialise(Stream_); - return std::make_pair(a, b); + return {a, b}; } template diff -Nru lttoolbox-3.6.6/lttoolbox/dix.dtd lttoolbox-3.7.1/lttoolbox/dix.dtd --- lttoolbox-3.6.6/lttoolbox/dix.dtd 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/dix.dtd 2022-11-01 08:36:47.000000000 +0000 @@ -18,7 +18,13 @@ --> - + + + + @@ -99,7 +105,7 @@ > - + @@ -108,13 +114,13 @@ - + - + - + - + + + + + + + + + + + + + diff -Nru lttoolbox-3.6.6/lttoolbox/entry_token.cc lttoolbox-3.7.1/lttoolbox/entry_token.cc --- lttoolbox-3.6.6/lttoolbox/entry_token.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/entry_token.cc 2022-11-01 08:36:47.000000000 +0000 @@ -61,14 +61,14 @@ } void -EntryToken::setParadigm(UString const &np) +EntryToken::setParadigm(UStringView np) { parName = np; type = paradigm; } void -EntryToken::setSingleTransduction(vector const &pi, vector const &pd, double const ew) +EntryToken::setSingleTransduction(std::vector const &pi, std::vector const &pd, double const ew) { weight = ew; leftSide = pi; @@ -77,7 +77,7 @@ } void -EntryToken::setRegexp(UString const &r) +EntryToken::setRegexp(UStringView r) { myregexp.clear(); ustring_to_vec32(r, myregexp); 
@@ -85,6 +85,13 @@ } void +EntryToken::setRegexp(const std::vector& r) +{ + myregexp = r; + type = regexp; +} + +void EntryToken::readRegexp(xmlTextReaderPtr reader) { XMLParseUtil::readValueInto32(reader, myregexp); @@ -115,19 +122,19 @@ return parName; } -vector const & +std::vector const & EntryToken::left() const { return leftSide; } -vector const & +std::vector const & EntryToken::right() const { return rightSide; } -vector const & +std::vector const & EntryToken::regExp() const { return myregexp; diff -Nru lttoolbox-3.6.6/lttoolbox/entry_token.h lttoolbox-3.7.1/lttoolbox/entry_token.h --- lttoolbox-3.6.6/lttoolbox/entry_token.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/entry_token.h 2022-11-01 08:36:47.000000000 +0000 @@ -23,8 +23,6 @@ #include #include -using namespace std; - /** * This is a "Compiler" helper class, to store the parts of each entry * before combining it to build the transducer being "compiled". @@ -54,17 +52,17 @@ /** * Left side of transduction (if 'single_transduction') */ - vector leftSide; + std::vector leftSide; /** * Right side of transduction (if 'single_transduction') */ - vector rightSide; + std::vector rightSide; /** * Regular expression (if 'regexp') */ - vector myregexp; + std::vector myregexp; /** * copy method @@ -101,7 +99,7 @@ * Sets the name of the paradigm. * @param np the paradigm name */ - void setParadigm(UString const &np); + void setParadigm(UStringView np); /** * Set both parts of a single transduction. @@ -109,13 +107,14 @@ * @param pd right part * @param ew entry weight */ - void setSingleTransduction(vector const &pi, vector const &pd, double const ew = 0); + void setSingleTransduction(std::vector const &pi, std::vector const &pd, double const ew = 0); /** * Set regular expression. * @param r the regular expression specification. 
*/ - void setRegexp(UString const &r); + void setRegexp(UStringView r); + void setRegexp(const std::vector& r); /** * More efficient version of setRegexp() @@ -151,19 +150,19 @@ * Retrieve the left part of the paradigm. * @return the left part of the paradigm. */ - vector const & left() const; + std::vector const & left() const; /** * Retrieve the right part of the paradigm. * @return the right part of the paradigm. */ - vector const & right() const; + std::vector const & right() const; /** * Retrieve the regular expression specification. * @return the regular expression specification. */ - vector const & regExp() const; + std::vector const & regExp() const; /** * Retrieve the weight value of the entry. diff -Nru lttoolbox-3.6.6/lttoolbox/expander.cc lttoolbox-3.7.1/lttoolbox/expander.cc --- lttoolbox-3.6.6/lttoolbox/expander.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/expander.cc 2022-11-01 08:36:47.000000000 +0000 @@ -17,21 +17,15 @@ #include #include -#include #include -#include #include #include #include -using namespace std; - -Expander::Expander() : -reader(0) +Expander::Expander() { - LtLocale::tryToSetLocale(); } Expander::~Expander() @@ -39,14 +33,9 @@ } void -Expander::expand(string const &file, UFILE* output) +Expander::expand(std::string const &file, UFILE* output) { - reader = xmlReaderForFile(file.c_str(), NULL, 0); - if(reader == NULL) - { - cerr << "Error: Cannot open '" << file << "'." << endl; - exit(EXIT_FAILURE); - } + reader = XMLParseUtil::open_or_exit(file.c_str()); int ret = xmlTextReaderRead(reader); while(ret == 1) @@ -57,7 +46,7 @@ if(ret != 0) { - cerr << "Error: Parse error at the end of input." << endl; + std::cerr << "Error: Parse error at the end of input." 
<< std::endl; } xmlFreeTextReader(reader); @@ -80,12 +69,12 @@ } void -Expander::requireEmptyError(UString const &name) +Expander::requireEmptyError(UStringView name) { if(!xmlTextReaderIsEmptyElement(reader)) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Non-empty element '<" << name << ">' should be empty." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Non-empty element '<" << name << ">' should be empty." << std::endl; exit(EXIT_FAILURE); } } @@ -93,19 +82,11 @@ bool Expander::allBlanks() { - bool flag = true; - UString text = XMLParseUtil::readValue(reader); - - for(auto c : text) - { - flag = flag && isspace(c); - } - - return flag; + return XMLParseUtil::allBlanks(reader); } void -Expander::readString(UString &result, UString const &name) +Expander::readString(UString &result, UStringView name) { if(name == Compiler::COMPILER_TEXT_NODE) { @@ -168,9 +149,9 @@ } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid specification of element '<" << name; - cerr << ">' in this context." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid specification of element '<" << name; + std::cerr << ">' in this context." << std::endl; exit(EXIT_FAILURE); } } @@ -182,8 +163,8 @@ { if(!allBlanks()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid construction." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid construction." 
<< std::endl; exit(EXIT_FAILURE); } xmlTextReaderRead(reader); @@ -192,7 +173,7 @@ } void -Expander::skip(UString &name, UString const &elem) +Expander::skip(UString &name, UStringView elem) { xmlTextReaderRead(reader); name = XMLParseUtil::readName(reader); @@ -201,8 +182,8 @@ { if(!allBlanks()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid construction." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid construction." << std::endl; exit(EXIT_FAILURE); } xmlTextReaderRead(reader); @@ -211,8 +192,8 @@ if(name != elem) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Expected '<" << elem << ">'." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Expected '<" << elem << ">'." << std::endl; exit(EXIT_FAILURE); } } @@ -240,7 +221,7 @@ return both_sides; } -pair +std::pair Expander::procIdentityGroup() { UString lhs; @@ -265,11 +246,11 @@ lhs += both_sides; rhs += both_sides; - pair e(lhs, rhs); + std::pair e(lhs, rhs); return e; } -pair +std::pair Expander::procTransduction() { UString lhs, rhs; @@ -311,12 +292,12 @@ skip(name, Compiler::COMPILER_PAIR_ELEM); - pair e(lhs, rhs); + std::pair e(lhs, rhs); return e; } UString -Expander::attrib(UString const &name) +Expander::attrib(UStringView name) { return XMLParseUtil::attrib(reader, name); } @@ -330,15 +311,14 @@ } void -Expander::requireAttribute(UString const &value, UString const &attrname, - UString const &elemname) +Expander::requireAttribute(UStringView value, UStringView attrname, UStringView elemname) { if(value.empty()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): '<" << elemname; - cerr << "' element must specify non-void '"; - cerr<< attrname << "' attribute." 
<< endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): '<" << elemname; + std::cerr << "' element must specify non-void '"; + std::cerr<< attrname << "' attribute." << std::endl; exit(EXIT_FAILURE); } } @@ -367,8 +347,8 @@ int ret = xmlTextReaderRead(reader); if(ret != 1) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Parse error." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Parse error." << std::endl; exit(EXIT_FAILURE); } myname = XMLParseUtil::readName(reader); @@ -399,8 +379,8 @@ int ret = xmlTextReaderRead(reader); if(ret != 1) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Parse error." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Parse error." << std::endl; exit(EXIT_FAILURE); } UString name = XMLParseUtil::readName(reader); @@ -409,7 +389,7 @@ int type = xmlTextReaderNodeType(reader); if(name == Compiler::COMPILER_PAIR_ELEM) { - pair p = procTransduction(); + std::pair p = procTransduction(); append(items, p); append(items_lr, p); append(items_rl, p); @@ -423,7 +403,7 @@ } else if(name == Compiler::COMPILER_IDENTITYGROUP_ELEM) { - pair p = procIdentityGroup(); + std::pair p = procIdentityGroup(); append(items, p); append(items_lr, p); append(items_rl, p); @@ -444,8 +424,8 @@ paradigm_lr.find(p) == paradigm_lr.end() && paradigm_rl.find(p) == paradigm_rl.end()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Undefined paradigm '" <'." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid inclusion of '<" << name << ">' into '<" << Compiler::COMPILER_ENTRY_ELEM; + std::cerr << ">'." 
<< std::endl; exit(EXIT_FAILURE); } } @@ -539,7 +519,7 @@ { UString name = XMLParseUtil::readName(reader); - // DO: optimize the execution order of this string "ifs" + // TODO: optimize the execution order of this string "ifs" if(name == Compiler::COMPILER_TEXT_NODE) { @@ -583,8 +563,8 @@ } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid node '<" << name << ">'." << endl; + std::cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid node '<" << name << ">'." << std::endl; exit(EXIT_FAILURE); } } @@ -593,9 +573,25 @@ Expander::procRegexp() { xmlTextReaderRead(reader); - UString re = XMLParseUtil::readValue(reader); + UString val = XMLParseUtil::readValue(reader); + UString escaped = "^$/<>{}*@#+~:"_u; + UString ret; + bool esc = false; + for (auto& c : val) { + if (esc) { + ret += c; + esc = false; + continue; + } + if (escaped.find(c) != UString::npos) { + ret += '\\'; + } else if (c == '\\') { + esc = true; + } + ret += c; + } xmlTextReaderRead(reader); - return re; + return ret; } void @@ -608,7 +604,7 @@ { for(auto& it2 : endings) { - temp.push_back(pair(it.first + it2.first, + temp.push_back(std::pair(it.first + it2.first, it.second + it2.second)); } } @@ -617,7 +613,7 @@ } void -Expander::append(EntList &result, UString const &endings) +Expander::append(EntList &result, UStringView endings) { for(auto& it : result) { @@ -628,7 +624,7 @@ void Expander::append(EntList &result, - pair const &endings) + std::pair const &endings) { for(auto& it : result) { @@ -638,25 +634,25 @@ } void -Expander::setAltValue(UString const &a) +Expander::setAltValue(UStringView a) { alt = a; } void -Expander::setVariantValue(UString const &v) +Expander::setVariantValue(UStringView v) { variant = v; } void -Expander::setVariantLeftValue(UString const &v) +Expander::setVariantLeftValue(UStringView v) { variant_left = v; } void -Expander::setVariantRightValue(UString const &v) 
+Expander::setVariantRightValue(UStringView v) { variant_right = v; } diff -Nru lttoolbox-3.6.6/lttoolbox/expander.h lttoolbox-3.7.1/lttoolbox/expander.h --- lttoolbox-3.6.6/lttoolbox/expander.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/expander.h 2022-11-01 08:36:47.000000000 +0000 @@ -19,14 +19,11 @@ #include -#include #include #include #include -using namespace std; - -typedef list > EntList; +typedef std::vector > EntList; /** * An expander of dictionaries @@ -37,7 +34,7 @@ /** * The libxml2's XML reader */ - xmlTextReaderPtr reader; + xmlTextReaderPtr reader = nullptr; /** * The alt value @@ -78,11 +75,11 @@ /** * Paradigms */ - map paradigm; + std::map paradigm; - map paradigm_lr; + std::map paradigm_lr; - map paradigm_rl; + std::map paradigm_rl; /** * Method to parse an XML Node @@ -110,13 +107,13 @@ * @param name the name of the attribute * @return the value of the attribute */ - UString attrib(UString const &name); + UString attrib(UStringView name); /** * Parse the element * @return a pair of strings, left part and right part of a transduction */ - pair procTransduction(); + std::pair procTransduction(); /** * Parse the element @@ -129,7 +126,7 @@ * @return a pair of strings, whose right part begins with '#' * but are otherwise identical */ - pair procIdentityGroup(); + std::pair procIdentityGroup(); /** * Parse the <par> element @@ -142,7 +139,7 @@ * @param name the name of the node * @param elem the name of the expected node */ - void skip(UString &name, UString const &elem); + void skip(UString &name, UStringView elem); /** * Skip all blank #text nodes before "name" @@ -151,13 +148,13 @@ void skipBlanks(UString &name); - void readString(UString &result, UString const &name); + void readString(UString &result, UStringView name); /** * Force an element to be empty, and check for it * @param name the element */ - void requireEmptyError(UString const &name); + void requireEmptyError(UStringView name); /** * Force an attribute to be 
specified, amd check for it @@ -165,8 +162,7 @@ * @param attrname the name of the attribute * @param elemname the parent of the attribute */ - void requireAttribute(UString const &value, UString const &attrname, - UString const &elemname); + void requireAttribute(UStringView value, UStringView attrname, UStringView elemname); /** * True if all the elements in the current node are blanks @@ -180,8 +176,8 @@ * this method, the result of concatenations. * @param endings the endings to be appended. */ - static void append(list > &result, - list > const &endings); + static void append(EntList &result, + EntList const &endings); /** * Append a list of endings to a list of current transductions. @@ -189,8 +185,7 @@ * this method, the result of concatenations. * @param endings the endings to be appended. */ - static void append(list > &result, - UString const &endings); + static void append(EntList &result, UStringView endings); /** * Append a list of endings to a list of current transductions. @@ -198,8 +193,8 @@ * this method, the result of concatenations. * @param endings the endings to be appended. 
*/ - static void append(list > &result, - pair const &endings); + static void append(EntList &result, + std::pair const &endings); public: /** @@ -215,31 +210,31 @@ /** * Compile dictionary to letter transducers */ - void expand(string const &file, UFILE* output); + void expand(std::string const &file, UFILE* output); /** * Set the alt value to use in compilation * @param a the value */ - void setAltValue(UString const &a); + void setAltValue(UStringView a); /** * Set the variant value to use in expansion * @param v the value */ - void setVariantValue(UString const &v); + void setVariantValue(UStringView v); /** * Set the variant_left value to use in expansion * @param v the value */ - void setVariantLeftValue(UString const &v); + void setVariantLeftValue(UStringView v); /** * Set the variant_right value to use in expansion * @param v the value */ - void setVariantRightValue(UString const &v); + void setVariantRightValue(UStringView v); /** * Set if we are going to keep morpheme boundaries diff -Nru lttoolbox-3.6.6/lttoolbox/file_utils.cc lttoolbox-3.7.1/lttoolbox/file_utils.cc --- lttoolbox-3.6.6/lttoolbox/file_utils.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/file_utils.cc 2022-11-01 08:36:47.000000000 +0000 @@ -21,14 +21,14 @@ #include UFILE* -openOutTextFile(const string& fname) +openOutTextFile(const std::string& fname) { if (fname.empty() || fname == "-") { return u_finit(stdout, NULL, NULL); } else { UFILE* ret = u_fopen(fname.c_str(), "wb", NULL, NULL); if (!ret) { - cerr << "Error: Cannot open file '" << fname << "' for writing." << endl; + std::cerr << "Error: Cannot open file '" << fname << "' for writing." 
<< std::endl; exit(EXIT_FAILURE); } return ret; @@ -36,14 +36,14 @@ } FILE* -openOutBinFile(const string& fname) +openOutBinFile(const std::string& fname) { if (fname.empty() || fname == "-") { return stdout; } else { FILE* ret = fopen(fname.c_str(), "wb"); if (!ret) { - cerr << "Error: Cannot open file '" << fname << "' for writing." << endl; + std::cerr << "Error: Cannot open file '" << fname << "' for writing." << std::endl; exit(EXIT_FAILURE); } return ret; @@ -51,14 +51,14 @@ } FILE* -openInBinFile(const string& fname) +openInBinFile(const std::string& fname) { if (fname.empty() || fname == "-") { return stdin; } else { FILE* ret = fopen(fname.c_str(), "rb"); if (!ret) { - cerr << "Error: Cannot open file '" << fname << "' for reading." << endl; + std::cerr << "Error: Cannot open file '" << fname << "' for reading." << std::endl; exit(EXIT_FAILURE); } return ret; @@ -66,9 +66,9 @@ } void -writeTransducerSet(FILE* output, const UString& letters, +writeTransducerSet(FILE* output, UStringView letters, Alphabet& alpha, - map& trans) + std::map& trans) { fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); uint64_t features = 0; @@ -80,13 +80,21 @@ for (auto& it : trans) { Compression::string_write(it.first, output); it.second.write(output); - cout << it.first << " " << it.second.size(); - cout << " " << it.second.numberOfTransitions() << endl; + std::cout << it.first << " " << it.second.size(); + std::cout << " " << it.second.numberOfTransitions() << std::endl; } } void -readShared(FILE* input, set& letters, Alphabet& alpha) +writeTransducerSet(FILE* output, const std::set& letters, + Alphabet& alpha, + std::map& trans) +{ + writeTransducerSet(output, UString(letters.begin(), letters.end()), alpha, trans); +} + +void +readShared(FILE* input, std::set& letters, Alphabet& alpha) { fpos_t pos; if (fgetpos(input, &pos) == 0) { @@ -111,9 +119,9 @@ } void -readTransducerSet(FILE* input, set& letters, +readTransducerSet(FILE* input, std::set& letters, Alphabet& alpha, - map& 
trans) + std::map& trans) { readShared(input, letters, alpha); @@ -124,9 +132,9 @@ } void -readTransducerSet(FILE* input, set& letters, +readTransducerSet(FILE* input, std::set& letters, Alphabet& alpha, - map& trans) + std::map& trans) { readShared(input, letters, alpha); diff -Nru lttoolbox-3.6.6/lttoolbox/file_utils.h lttoolbox-3.7.1/lttoolbox/file_utils.h --- lttoolbox-3.6.6/lttoolbox/file_utils.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/file_utils.h 2022-11-01 08:36:47.000000000 +0000 @@ -23,18 +23,21 @@ #include -UFILE* openOutTextFile(const string& fname); -FILE* openOutBinFile(const string& fname); -FILE* openInBinFile(const string& fname); +UFILE* openOutTextFile(const std::string& fname); +FILE* openOutBinFile(const std::string& fname); +FILE* openInBinFile(const std::string& fname); -void writeTransducerSet(FILE* output, const UString& letters, +void writeTransducerSet(FILE* output, UStringView letters, Alphabet& alpha, - map& trans); -void readTransducerSet(FILE* input, set& letters, + std::map& trans); +void writeTransducerSet(FILE* output, const std::set& letters, + Alphabet& alpha, + std::map& trans); +void readTransducerSet(FILE* input, std::set& letters, Alphabet& alpha, - map& trans); -void readTransducerSet(FILE* input, set& letters, + std::map& trans); +void readTransducerSet(FILE* input, std::set& letters, Alphabet& alpha, - map& trans); + std::map& trans); #endif // __FILE_UTILS_H__ diff -Nru lttoolbox-3.6.6/lttoolbox/fst_processor.cc lttoolbox-3.7.1/lttoolbox/fst_processor.cc --- lttoolbox-3.6.6/lttoolbox/fst_processor.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/fst_processor.cc 2022-11-01 08:36:47.000000000 +0000 @@ -19,27 +19,13 @@ #include #include #include +#include #include #include #include -using namespace std; - - -UString const FSTProcessor::XML_TEXT_NODE = "#text"_u; -UString const FSTProcessor::XML_COMMENT_NODE = "#comment"_u; -UString const FSTProcessor::XML_IGNORED_CHARS_ELEM = 
"ignored-chars"_u; -UString const FSTProcessor::XML_RESTORE_CHAR_ELEM = "restore-char"_u; -UString const FSTProcessor::XML_RESTORE_CHARS_ELEM = "restore-chars"_u; -UString const FSTProcessor::XML_VALUE_ATTR = "value"_u; -UString const FSTProcessor::XML_CHAR_ELEM = "char"_u; -UString const FSTProcessor::WBLANK_START = "[["_u; -UString const FSTProcessor::WBLANK_END = "]]"_u; -UString const FSTProcessor::WBLANK_FINAL = "[[/]]"_u; - - FSTProcessor::FSTProcessor() { // escaped_chars chars @@ -68,14 +54,14 @@ } void -FSTProcessor::parseICX(string const &file) +FSTProcessor::parseICX(std::string const &file) { if(useIgnoredChars) { reader = xmlReaderForFile(file.c_str(), NULL, 0); if(reader == NULL) { - cerr << "Error: cannot open '" << file << "'." << endl; + std::cerr << "Error: cannot open '" << file << "'." << std::endl; exit(EXIT_FAILURE); } int ret = xmlTextReaderRead(reader); @@ -93,14 +79,14 @@ } void -FSTProcessor::parseRCX(string const &file) +FSTProcessor::parseRCX(std::string const &file) { if(useRestoreChars) { reader = xmlReaderForFile(file.c_str(), NULL, 0); if(reader == NULL) { - cerr << "Error: cannot open '" << file << "'." << endl; + std::cerr << "Error: cannot open '" << file << "'." << std::endl; exit(EXIT_FAILURE); } int ret = xmlTextReaderRead(reader); @@ -134,8 +120,8 @@ } else { - cerr << "Error in ICX file (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid node '<" << name << ">'." << endl; + std::cerr << "Error in ICX file (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid node '<" << name << ">'." << std::endl; exit(EXIT_FAILURE); } } @@ -172,82 +158,18 @@ } else { - cerr << "Error in RCX file (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid node '<" << name << ">'." << endl; + std::cerr << "Error in RCX file (" << xmlTextReaderGetParserLineNumber(reader); + std::cerr << "): Invalid node '<" << name << ">'." 
<< std::endl; exit(EXIT_FAILURE); } } -bool -FSTProcessor::wblankPostGen(InputFile& input, UFILE *output) -{ - UString result = WBLANK_START; - UChar32 c = 0; - bool in_content = false; - - while(!input.eof()) - { - c = input.get(); - if(in_content && c == '~') - { - if(result[result.size()-1] == ']') { - // We just saw the end of a wblank, may want to merge - wblankqueue.push(result); - } - else { - // wake-up-mark happened some characters into the wblanked word - write(result, output); - } - return true; - } - else - { - result += c; - } - - if(c == '\\') - { - if (input.eof()) streamError(); - result += input.get(); - } - else if(c == ']') - { - c = input.get(); - result += c; - - if(c == ']') - { - int resultlen = result.size(); - if(result[resultlen-5] == '[' && result[resultlen-4] == '[' && result[resultlen-3] == '/') //ending blank [[/]] - { - write(result, output); - break; - } - else - { - in_content = true; // Assumption: No nested wblanks, always balanced - } - } - } - } - - if(c != ']') - { - streamError(); - } - - return false; -} - int FSTProcessor::readAnalysis(InputFile& input) { if (!input_buffer.isEmpty()) { UChar32 val = input_buffer.next(); - while ((useIgnoredChars || useDefaultIgnoredChars) && ignored_chars.find(val) != ignored_chars.end()) { - val = input_buffer.next(); - } return val; } @@ -263,7 +185,6 @@ while ((useIgnoredChars || useDefaultIgnoredChars) && ignored_chars.find(val) != ignored_chars.end()) { - input_buffer.add(val); val = input.get(); } @@ -373,9 +294,9 @@ val = input.get(); } while(u_isdigit(val)); input.unget(val); - input_buffer.add(alphabet(""_u)); + input_buffer.add(alphabet(u"")); numbers.push_back(ws); - return alphabet(""_u); + return alphabet(u""); } break; @@ -388,68 +309,93 @@ return val; } -int32_t -FSTProcessor::readPostgeneration(InputFile& input, UFILE *output) +bool +FSTProcessor::readTransliterationBlank(InputFile& input) { - if(!input_buffer.isEmpty()) - { - return input_buffer.next(); + UString blank; + 
while (!input.eof()) { + UChar32 c = input.get(); + if (u_isspace(c)) { + blank += c; + } else if (c == '[') { + if (input.peek() == '[') { + break; + } + blank += input.readBlock('[', ']'); + } else { + input.unget(c); + break; + } } - - UChar32 val = input.get(); - int32_t altval = 0; - is_wblank = false; - if(input.eof()) - { - return 0; + if (!blank.empty()) { + blankqueue.push(blank); } + return !blank.empty(); +} - switch(val) - { - case '<': - altval = alphabet(input.readBlock('<', '>')); - input_buffer.add(altval); - return altval; +bool +FSTProcessor::readTransliterationWord(InputFile& input) +{ + if (input.eof() || input.peek() == '\0') { + return false; + } - case '[': - val = input.get(); + if (!readTransliterationBlank(input)) { + blankqueue.push(""_u); + } - if(val == '[') - { - if(collect_wblanks) - { - wblankqueue.push(input.finishWBlank()); - is_wblank = true; - return static_cast(' '); - } - else if(wblankPostGen(input, output)) - { - return static_cast('~'); - } - else - { - is_wblank = true; - return static_cast(' '); + UString wblank; + std::vector word; + if (input.peek() == '[') { + input.get(); + wblank = input.finishWBlank(); + while (!input.eof()) { + if (readTransliterationBlank(input)) { + word.push_back(static_cast(' ')); + if (input.peek() == '[') break; + } else { + UChar32 c = input.get(); + if (c == '[') { + input.unget(c); + break; + } else if (c == '\\') { + word.push_back(static_cast(input.get())); + } else if (c == '<') { + word.push_back(alphabet(input.readBlock('<', '>'))); + } else if (c == '\0') { + input.unget(c); + break; + } else { + word.push_back(static_cast(c)); } } - else - { - input.unget(val); - blankqueue.push(input.readBlock('[', ']')); - - input_buffer.add(static_cast(' ')); - return static_cast(' '); + } + if (input.peek() == '[') { + input.get(); + input.finishWBlank(); + } + } else { + while (!input.eof()) { + UChar32 c = input.get(); + if (u_isspace(c) || c == '[' || c == '\0') { + input.unget(c); + break; + 
} else if (c == '\\') { + word.push_back(static_cast(input.get())); + } else if (c == '<') { + word.push_back(alphabet(input.readBlock('<', '>'))); + } else { + word.push_back(static_cast(c)); } - - case '\\': - val = input.get(); - input_buffer.add(static_cast(val)); - return val; - - default: - input_buffer.add(val); - return val; + } } + if (word.empty()) { + return false; + } + wblankqueue.push_back(wblank); + transliteration_queue.push_back(word); + + return true; } void @@ -583,7 +529,7 @@ return 0x7fffffff; } -pair +std::pair FSTProcessor::readBilingual(InputFile& input, UFILE *output) { UChar32 val = input.get(); @@ -591,7 +537,7 @@ if(input.eof()) { - return pair(symbol, 0x7fffffff); + return std::pair(symbol, 0x7fffffff); } if(outOfWord) @@ -601,7 +547,7 @@ val = input.get(); if(input.eof()) { - return pair(symbol, 0x7fffffff); + return std::pair(symbol, 0x7fffffff); } } else if(val == '\\') @@ -610,14 +556,14 @@ val = input.get(); if(input.eof()) { - return pair(symbol, 0x7fffffff); + return std::pair(symbol, 0x7fffffff); } u_fputc(val,output); skipUntil(input, output, '^'); val = input.get(); if(input.eof()) { - return pair(symbol, 0x7fffffff); + return std::pair(symbol, 0x7fffffff); } } else @@ -627,7 +573,7 @@ val = input.get(); if(input.eof()) { - return pair(symbol, 0x7fffffff); + return std::pair(symbol, 0x7fffffff); } } outOfWord = false; @@ -636,12 +582,12 @@ if(val == '\\') { val = input.get(); - return pair(symbol, val); + return std::pair(symbol, val); } else if(val == '$') { outOfWord = true; - return pair(symbol, static_cast('$')); + return std::pair(symbol, static_cast('$')); } else if(val == '<') { @@ -653,7 +599,7 @@ { symbol = cad; } - return pair(symbol, res); + return std::pair(symbol, res); } else if(val == '[') { @@ -671,7 +617,7 @@ return readBilingual(input, output); } - return pair(symbol, val); + return std::pair(symbol, val); } void @@ -685,65 +631,6 @@ } void -FSTProcessor::flushWblanks(UFILE *output) -{ - 
while(wblankqueue.size() > 0) - { - write(wblankqueue.front(), output); - wblankqueue.pop(); - } -} - -UString -FSTProcessor::combineWblanks() -{ - UString final_wblank; - UString last_wblank; - bool seen_wblank = false; - - while(wblankqueue.size() > 0) - { - if(wblankqueue.front().compare(WBLANK_FINAL) == 0) - { - if(seen_wblank) { - if(final_wblank.empty()) - { - final_wblank += WBLANK_START; - } - else if(final_wblank.size() > 2) - { - final_wblank += "; "_u; - } - - final_wblank += last_wblank.substr(2,last_wblank.size()-4); //add wblank without brackets [[..]] - } - else { - need_end_wblank = true; - } - last_wblank.clear(); - } - else - { - seen_wblank = true; - last_wblank = wblankqueue.front(); - } - wblankqueue.pop(); - } - - if(!last_wblank.empty()) - { - wblankqueue.push(last_wblank); - } - - if(!final_wblank.empty()) - { - final_wblank += WBLANK_END; - need_end_wblank = true; - } - return final_wblank; -} - -void FSTProcessor::calcInitial() { for(auto& it : transducers) { @@ -753,54 +640,41 @@ initial_state.init(&root); } -bool -FSTProcessor::endsWith(UString const &str, UString const &suffix) -{ - if(str.size() < suffix.size()) - { - return false; - } - else - { - return str.substr(str.size()-suffix.size()) == suffix; - } -} - void FSTProcessor::classifyFinals() { for(auto& it : transducers) { - if(endsWith(it.first, "@inconditional"_u)) + if(StringUtils::endswith(it.first, u"@inconditional")) { inconditional.insert(it.second.getFinals().begin(), it.second.getFinals().end()); } - else if(endsWith(it.first, "@standard"_u)) + else if(StringUtils::endswith(it.first, u"@standard")) { standard.insert(it.second.getFinals().begin(), it.second.getFinals().end()); } - else if(endsWith(it.first, "@postblank"_u)) + else if(StringUtils::endswith(it.first, u"@postblank")) { postblank.insert(it.second.getFinals().begin(), it.second.getFinals().end()); } - else if(endsWith(it.first, "@preblank"_u)) + else if(StringUtils::endswith(it.first, u"@preblank")) { 
preblank.insert(it.second.getFinals().begin(), it.second.getFinals().end()); } else { - cerr << "Error: Unsupported transducer type for '"; - cerr << it.first << "'." << endl; + std::cerr << "Error: Unsupported transducer type for '"; + std::cerr << it.first << "'." << std::endl; exit(EXIT_FAILURE); } } } UString -FSTProcessor::filterFinals(const State& state, const UString& casefrom) +FSTProcessor::filterFinals(const State& state, UStringView casefrom) { bool firstupper = false, uppercase = false; if (!dictionaryCase) { @@ -814,7 +688,7 @@ } void -FSTProcessor::writeEscaped(UString const &str, UFILE *output) +FSTProcessor::writeEscaped(UStringView str, UFILE *output) { for(unsigned int i = 0, limit = str.size(); i < limit; i++) { @@ -827,7 +701,7 @@ } size_t -FSTProcessor::writeEscapedPopBlanks(UString const &str, UFILE *output) +FSTProcessor::writeEscapedPopBlanks(UStringView str, UFILE *output) { size_t postpop = 0; for (unsigned int i = 0, limit = str.size(); i < limit; i++) @@ -848,7 +722,7 @@ } void -FSTProcessor::writeEscapedWithTags(UString const &str, UFILE *output) +FSTProcessor::writeEscapedWithTags(UStringView str, UFILE *output) { for(unsigned int i = 0, limit = str.size(); i < limit; i++) { @@ -869,7 +743,7 @@ void -FSTProcessor::printWord(UString const &sf, UString const &lf, UFILE *output) +FSTProcessor::printWord(UStringView sf, UStringView lf, UFILE *output) { u_fputc('^', output); writeEscaped(sf, output); @@ -878,11 +752,11 @@ } void -FSTProcessor::printWordPopBlank(UString const &sf, UString const &lf, UFILE *output) +FSTProcessor::printWordPopBlank(UStringView sf, UStringView lf, UFILE *output) { u_fputc('^', output); size_t postpop = writeEscapedPopBlanks(sf, output); - u_fprintf(output, "%S$", lf.c_str()); + u_fprintf(output, "%.*S$", lf.size(), lf.data()); while (postpop-- && blankqueue.size() > 0) { write(blankqueue.front(), output); @@ -891,13 +765,13 @@ } void -FSTProcessor::printWordBilingual(UString const &sf, UString const &lf, UFILE 
*output) +FSTProcessor::printWordBilingual(UStringView sf, UStringView lf, UFILE *output) { - u_fprintf(output, "^%S%S$", sf.c_str(), lf.c_str()); + u_fprintf(output, "^%.*S%.*S$", sf.size(), sf.data(), lf.size(), lf.data()); } void -FSTProcessor::printUnknownWord(UString const &sf, UFILE *output) +FSTProcessor::printUnknownWord(UStringView sf, UFILE *output) { u_fputc('^', output); writeEscaped(sf, output); @@ -908,7 +782,7 @@ } unsigned int -FSTProcessor::lastBlank(UString const &str) +FSTProcessor::lastBlank(UStringView str) { for(int i = static_cast(str.size())-1; i >= 0; i--) { @@ -922,7 +796,7 @@ } void -FSTProcessor::printSpace(UChar32 const val, UFILE *output) +FSTProcessor::printSpace(UChar32 val, UFILE *output) { if(blankqueue.size() > 0) { @@ -935,7 +809,7 @@ } void -FSTProcessor::printChar(const UChar32 val, UFILE* output) +FSTProcessor::printChar(UChar32 val, UFILE* output) { if (u_isspace(val)) { if (blankqueue.size() > 0) { @@ -955,13 +829,13 @@ } bool -FSTProcessor::isEscaped(UChar32 const c) const +FSTProcessor::isEscaped(UChar32 c) const { return escaped_chars.find(c) != escaped_chars.end(); } bool -FSTProcessor::isAlphabetic(UChar32 const c) const +FSTProcessor::isAlphabetic(UChar32 c) const { return u_isalnum(c) || alphabetic_chars.find(c) != alphabetic_chars.end(); } @@ -1006,7 +880,7 @@ } void -FSTProcessor::initPostgeneration() +FSTProcessor::initTransliteration() { initGeneration(); } @@ -1029,12 +903,12 @@ { UChar val=input_word[i]; - current_state.step_case(val, caseSensitive); + current_state.step_case(val, beCaseSensitive(current_state)); if(current_state.size() > MAX_COMBINATIONS) { - cerr << "Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << "'" << endl; - cerr << " gave up at char " << i << " '" << val << "'." 
<< endl; + std::cerr << "Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << "'" << std::endl; + std::cerr << " gave up at char " <"_u)) == 0 - && (compoundOnlyLSymbol=alphabet("<:compound:only-L>"_u)) == 0 - && (compoundOnlyLSymbol=alphabet("<@co:only-L>"_u)) == 0 - && (compoundOnlyLSymbol=alphabet("<@compound:only-L>"_u)) == 0 - && (compoundOnlyLSymbol=alphabet(""_u)) == 0) + if((compoundOnlyLSymbol=alphabet(u"<:co:only-L>")) == 0 + && (compoundOnlyLSymbol=alphabet(u"<:compound:only-L>")) == 0 + && (compoundOnlyLSymbol=alphabet(u"<@co:only-L>")) == 0 + && (compoundOnlyLSymbol=alphabet(u"<@compound:only-L>")) == 0 + && (compoundOnlyLSymbol=alphabet(u"")) == 0) { - cerr << "Warning: Decomposition symbol <:compound:only-L> not found" << endl; + std::cerr << "Warning: Decomposition symbol <:compound:only-L> not found" << std::endl; } else if(!showControlSymbols) { - alphabet.setSymbol(compoundOnlyLSymbol, ""_u); + alphabet.setSymbol(compoundOnlyLSymbol, u""); } - if((compoundRSymbol=alphabet("<:co:R>"_u)) == 0 - && (compoundRSymbol=alphabet("<:compound:R>"_u)) == 0 - && (compoundRSymbol=alphabet("<@co:R>"_u)) == 0 - && (compoundRSymbol=alphabet("<@compound:R>"_u)) == 0 - && (compoundRSymbol=alphabet(""_u)) == 0) + if((compoundRSymbol=alphabet(u"<:co:R>")) == 0 + && (compoundRSymbol=alphabet(u"<:compound:R>")) == 0 + && (compoundRSymbol=alphabet(u"<@co:R>")) == 0 + && (compoundRSymbol=alphabet(u"<@compound:R>")) == 0 + && (compoundRSymbol=alphabet(u"")) == 0) { - cerr << "Warning: Decomposition symbol <:compound:R> not found" << endl; + std::cerr << "Warning: Decomposition symbol <:compound:R> not found" << std::endl; } else if(!showControlSymbols) { - alphabet.setSymbol(compoundRSymbol, ""_u); + alphabet.setSymbol(compoundRSymbol, u""); } } @@ -1113,9 +987,10 @@ UString sf; // surface form UString lf_spcmp; // space compound analysis bool seen_cpL = false; // have we seen a tag so far + size_t last_start = input_buffer.getPos(); // position 
in input_buffer when sf was last cleared size_t last = 0; // position in input_buffer after last analysis size_t last_size = 0; // size of sf at last analysis - map >::iterator rcx_map_ptr; + std::map >::iterator rcx_map_ptr; UChar32 val; do @@ -1192,8 +1067,8 @@ if(useRestoreChars && rcx_map.find(val) != rcx_map.end()) { rcx_map_ptr = rcx_map.find(val); - set tmpset = rcx_map_ptr->second; - if(!u_isupper(val) || caseSensitive) + std::set tmpset = rcx_map_ptr->second; + if(!u_isupper(val) || beCaseSensitive(current_state)) { current_state.step(val, tmpset); } @@ -1212,7 +1087,7 @@ } else { - current_state.step_case(val, caseSensitive); + current_state.step_case(val, beCaseSensitive(current_state)); } if(current_state.size() != 0) @@ -1291,18 +1166,16 @@ } while((val = readAnalysis(input)) && isAlphabetic(val)); - unsigned int limit = firstNotAlpha(sf); - unsigned int size = sf.size(); - limit = (limit == static_cast(UString::npos)?size:limit); - if(limit == 0) + auto limit = firstNotAlpha(sf); + if(limit.i_codepoint == 0) { - input_buffer.back(sf.size()); + input_buffer.setPos(1 + last_start); writeEscaped(sf.substr(0,1), output); } else { - input_buffer.back(1+(size-limit)); - UString unknown_word = sf.substr(0, limit); + input_buffer.setPos(last_start + limit.i_codepoint); + UString unknown_word = sf.substr(0, limit.i_utf16); if(do_decomposition) { UString compound = compoundAnalysis(unknown_word); @@ -1323,18 +1196,16 @@ } else if(lf.empty()) { - unsigned int limit = firstNotAlpha(sf); - unsigned int size = sf.size(); - limit = (limit == static_cast(UString::npos)?size:limit); - if(limit == 0) + auto limit = firstNotAlpha(sf); + if(limit.i_codepoint == 0) { - input_buffer.back(sf.size()); + input_buffer.setPos(1 + last_start); writeEscaped(sf.substr(0,1), output); } else { - input_buffer.back(1+(size-limit)); - UString unknown_word = sf.substr(0, limit); + input_buffer.setPos(last_start + limit.i_codepoint); + UString unknown_word = sf.substr(0, limit.i_utf16); 
if(do_decomposition) { UString compound = compoundAnalysis(unknown_word); @@ -1369,6 +1240,7 @@ current_state = initial_state; lf.clear(); sf.clear(); + last_start = input_buffer.getPos(); last_incond = false; last_postblank = false; last_preblank = false; @@ -1412,42 +1284,6 @@ } void -FSTProcessor::postgeneration_wrapper_null_flush(InputFile& input, UFILE *output) -{ - setNullFlush(false); - while(!input.eof()) - { - postgeneration(input, output); - u_fputc('\0', output); - u_fflush(output); - } -} - -void -FSTProcessor::intergeneration_wrapper_null_flush(InputFile& input, UFILE *output) -{ - setNullFlush(false); - while (!input.eof()) - { - intergeneration(input, output); - u_fputc('\0', output); - u_fflush(output); - } -} - -void -FSTProcessor::transliteration_wrapper_null_flush(InputFile& input, UFILE *output) -{ - setNullFlush(false); - while(!input.eof()) - { - transliteration(input, output); - u_fputc('\0', output); - u_fflush(output); - } -} - -void FSTProcessor::tm_analysis(InputFile& input, UFILE *output) { State current_state = initial_state; @@ -1744,7 +1580,7 @@ alphabet.getSymbol(sf,val); if(current_state.size() > 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !(beCaseSensitive(current_state))) { if(mode == gm_carefulcase) { @@ -1767,426 +1603,211 @@ void FSTProcessor::postgeneration(InputFile& input, UFILE *output) { - if(getNullFlush()) - { - postgeneration_wrapper_null_flush(input, output); - } - - bool skip_mode = true; - collect_wblanks = false; - need_end_wblank = false; - State current_state = initial_state; - UString lf; - UString sf; - int last = 0; - set empty_escaped_chars; - - while(UChar32 val = readPostgeneration(input, output)) - { - if(val == '~') - { - skip_mode = false; - collect_wblanks = true; - } - - if(is_wblank && skip_mode) - { - //do nothing - } - else if(skip_mode) - { - if(u_isspace(val)) - { - if(need_end_wblank) - { - write(WBLANK_FINAL, output); - 
need_end_wblank = false; - } + transliteration_drop_tilde = true; + transliteration(input, output); +} - printSpace(val, output); - } - else - { - if(!need_end_wblank) - { - flushWblanks(output); - } +void +FSTProcessor::intergeneration(InputFile& input, UFILE *output) +{ + transliteration_drop_tilde = false; + transliteration(input, output); +} - if(isEscaped(val)) - { - u_fputc('\\', output); - } - u_fputc(val, output); +void +FSTProcessor::transliteration(InputFile& input, UFILE *output) +{ + size_t start_pos = 0; + size_t cur_word = 0; + size_t cur_pos = 0; + size_t match_pos = 0; + State current_state = initial_state; + UString last_match; + int space_diff = 0; - if(need_end_wblank) - { - write(WBLANK_FINAL, output); - need_end_wblank = false; + bool firstupper = false; + bool uppercase = false; + bool have_first = false; + bool have_second = false; + + while (true) { + if (transliteration_queue.empty()) { + if (!blankqueue.empty()) { + flushBlanks(output); + } + if (!readTransliterationWord(input)) { + flushBlanks(output); + if (input.eof()) { + break; + } else { + u_fputc(input.get(), output); + u_fflush(output); + continue; } } } - else - { - if(is_wblank) - { - continue; - } - - // test for final states - if(current_state.isFinal(all_finals)) - { - bool firstupper = u_isupper(sf[1]); - bool uppercase = sf.size() > 1 && firstupper && u_isupper(sf[2]); - lf = current_state.filterFinals(all_finals, alphabet, - empty_escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - // case of the beggining of the next word - - UString mybuf; - for(size_t i = sf.size(); i > 0; --i) - { - if(!isalpha(sf[i-1])) - { + if (current_state.isFinal(all_finals)) { + last_match = current_state.filterFinals(all_finals, alphabet, + escaped_chars, displayWeightsMode, + 1, maxWeightClasses, + uppercase, firstupper); + while (cur_word > 0) { + if (cur_word == 1) { + if (cur_pos == 0 && last_match[last_match.size()-1] == ' ') { + match_pos = 
transliteration_queue.front().size(); + last_match = last_match.substr(0, last_match.size()-1); break; - } - else - { - mybuf = sf[i-1] + mybuf; - } - } - - if(mybuf.size() > 0) - { - bool myfirstupper = u_isupper(mybuf[0]); - bool myuppercase = mybuf.size() > 1 && u_isupper(mybuf[1]); - - for(size_t i = lf.size(); i > 0; --i) - { - if(!isalpha(lf[i-1])) - { - if(myfirstupper && i != lf.size()) - { - lf[i] = u_toupper(lf[i]); - } - else - { - lf[i] = u_tolower(lf[i]); - } - break; - } - else - { - if(myuppercase) - { - lf[i-1] = u_toupper(lf[i-1]); - } - else - { - lf[i-1] = u_tolower(lf[i-1]); - } - } + } else { + cur_pos += transliteration_queue.front().size() + 1; } } - - last = input_buffer.getPos(); + std::vector word = transliteration_queue.front(); + transliteration_queue.pop_front(); + word.push_back(static_cast(' ')); + word.insert(word.end(), transliteration_queue.front().begin(), + transliteration_queue.front().end()); + transliteration_queue.pop_front(); + transliteration_queue.push_front(word); + UString wblank = wblankqueue.front(); + wblankqueue.pop_front(); + wblank = StringUtils::merge_wblanks(wblank, wblankqueue.front()); + wblankqueue.pop_front(); + wblankqueue.push_front(wblank); + cur_word--; + } + if (cur_word == 0) { + match_pos = cur_pos; } + } - current_state.step_case(val, caseSensitive); - - if(current_state.size() != 0) - { - alphabet.getSymbol(sf, val); + int32_t sym = 0; + bool is_end = false; + if (cur_pos < transliteration_queue[cur_word].size()) { + sym = transliteration_queue[cur_word][cur_pos]; + cur_pos++; + } else { + if (cur_word + 1 == transliteration_queue.size() && + !readTransliterationWord(input)) { + is_end = true; + } else { + sym = static_cast(' '); + cur_word++; + cur_pos = 0; } - else - { - UString final_wblank = combineWblanks(); - write(final_wblank, output); - - if(lf.empty()) - { - unsigned int mark = sf.size(); - unsigned int space_index = sf.size(); - - for(unsigned int i = 1, limit = sf.size(); i < limit; i++) 
- { - if(sf[i] == '~') - { - mark = i; - break; - } - else if(sf[i] == ' ') - { - space_index = i; - } - } - - if(space_index != sf.size()) - { - write(sf.substr(1, space_index-1), output); - - if(need_end_wblank) - { - write(WBLANK_FINAL, output); - need_end_wblank = false; - u_fputc(sf[space_index], output); - flushWblanks(output); - } - else - { - u_fputc(sf[space_index], output); - } - - write(sf.substr(space_index+1, mark-space_index-1), output); - } - else - { - flushWblanks(output); - write(sf.substr(1, mark-1), output); - } + } - if(mark == sf.size()) - { - input_buffer.back(1); - } - else - { - input_buffer.back(sf.size()-mark); - } - } - else - { - write(lf.substr(1,lf.size()-3), output); - input_buffer.setPos(last); - input_buffer.back(2); - val = lf[lf.size()-2]; - if(u_isspace(val)) - { - printSpace(val, output); - } - else - { - if(isEscaped(val)) - { - u_fputc('\\', output); - } - u_fputc(val, output); - } + if (isAlphabetic(sym)) { + if (!have_first) { + have_first = true; + if (u_isupper(sym)) { + firstupper = true; + } else { + firstupper = false; + have_second = true; } - - current_state = initial_state; - lf.clear(); - sf.clear(); - skip_mode = true; - collect_wblanks = false; + } else if (!have_second) { + have_second = true; + uppercase = u_isupper(sym); } } - } - - // print remaining blanks - flushBlanks(output); -} - -void -FSTProcessor::intergeneration(InputFile& input, UFILE *output) -{ - if (getNullFlush()) - { - intergeneration_wrapper_null_flush(input, output); - } - - bool skip_mode = true; - State current_state = initial_state; - UString target; - UString source; - int last = 0; - set empty_escaped_chars; - - while (true) - { - UChar32 val = readPostgeneration(input, output); - if (val == '~') - { - skip_mode = false; - } + current_state.step_case_override(sym, beCaseSensitive(current_state)); - if (skip_mode) - { - if (u_isspace(val)) - { - printSpace(val, output); - } - else - { - if(val != '\0') - { - if (isEscaped(val)) - { - 
u_fputc('\\', output); + if (current_state.size() == 0 || is_end) { + if (last_match.empty()) { + start_pos++; + } else { + std::vector match = alphabet.tokenize(last_match.substr(1)); + last_match.clear(); + std::vector word = transliteration_queue.front(); + transliteration_queue.pop_front(); + size_t i = 0; + for (; i < match.size() && i < match_pos - start_pos; i++) { + if (match[match.size()-i-1] != word[match_pos-i-1]) { + break; } - u_fputc(val, output); } + std::vector new_word; + new_word.insert(new_word.end(), word.begin(), word.begin()+start_pos); + new_word.insert(new_word.end(), match.begin(), match.end()); + new_word.insert(new_word.end(), word.begin()+match_pos, word.end()); + transliteration_queue.push_front(new_word); + int sf_spaces = 0; + int lf_spaces = 0; + for (auto c : word) { + if (c == static_cast(' ')) sf_spaces++; + } + for (auto c : new_word) { + if (c == static_cast(' ')) lf_spaces++; + } + space_diff += (lf_spaces - sf_spaces); + size_t last_start = start_pos; + start_pos = match_pos - i; + if (start_pos == last_start) start_pos++; + cur_pos = start_pos; + cur_word = 0; } - } - else - { - // test for final states - if (current_state.isFinal(all_finals)) - { - bool firstupper = u_isupper(source[1]); - bool uppercase = source.size() > 1 && firstupper && u_isupper(source[2]); - target = current_state.filterFinals(all_finals, alphabet, - empty_escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - - last = input_buffer.getPos(); - } - - if (val != '\0') - { - current_state.step_case(val, caseSensitive); - } - - if (val != '\0' && current_state.size() != 0) - { - alphabet.getSymbol(source, val); - } - else - { - if (target.empty()) // no match - { - if (val == '\0') - { - // flush source - write(source, output); - } - else - { - u_fputc(source[0], output); - - unsigned int mark, limit; - for (mark = 1, limit = source.size(); mark < limit && source[mark] != '~' ; mark++) - { - u_fputc(source[mark], 
output); - } - - if (mark != source.size()) - { - int back = source.size() - mark; - input_buffer.back(back); - } - - if (val == '~') - { - input_buffer.back(1); + if (start_pos >= transliteration_queue.front().size()) { + write(blankqueue.front(), output); + blankqueue.pop(); + bool has_wblank = !wblankqueue.front().empty(); + write(wblankqueue.front(), output); + wblankqueue.pop_front(); + auto word = transliteration_queue.front(); + transliteration_queue.pop_front(); + int space_count = 0; + for (auto c : word) { + if (c == static_cast(' ')) space_count++; + } + int space_out = 0; + UString out; + for (auto c : word) { + if (c == ' ') { + if (space_out + space_diff >= space_count) { + out += ' '; } else { - u_fputc(val, output); - } - } - } - else - { - for(unsigned int i=1; i('~')) { + } else { + if (c > 0 && isEscaped(c)) { + out += '\\'; } - } - - if (val != '\0') - { - input_buffer.setPos(last); - input_buffer.back(1); + alphabet.getSymbol(out, c); } } - - current_state = initial_state; - target.clear(); - source.clear(); - skip_mode = true; - } - } - - if (val == '\0') - { - break; - } - } - - // print remaining blanks - flushBlanks(output); -} - -void -FSTProcessor::transliteration(InputFile& input, UFILE *output) -{ - if(getNullFlush()) - { - transliteration_wrapper_null_flush(input, output); - } - - State current_state = initial_state; - UString lf; - UString sf; - UString last_lf; - int rewind_point = 0; - int last_match = 0; - UChar32 firstchar = 0; - - while(UChar32 val = readPostgeneration(input, output)) { - if (sf.empty()) { - firstchar = val; - rewind_point = input_buffer.getPos(); - } else { - lf = filterFinals(current_state, sf); - if (!lf.empty()) { - last_match = input_buffer.getPos(); - last_lf.swap(lf); - } - } - current_state.step(val); - if (current_state.size() != 0) { - alphabet.getSymbol(sf, val); - } else { - if (last_lf.empty()) { - input_buffer.setPos(rewind_point); - if (u_isspace(firstchar)) { - printSpace(firstchar, output); - } 
else { - if (isEscaped(firstchar)) { - u_fputc('\\', output); + write(out, output); + if (has_wblank) { + write(WBLANK_FINAL, output); + } + while (space_diff < 0) { + if (blankqueue.front() != " "_u) { + write(blankqueue.front(), output); } - u_fputc(firstchar, output); + blankqueue.pop(); + space_diff++; } - } else { - write(last_lf.substr(1), output); - last_lf.clear(); - input_buffer.setPos(last_match); - input_buffer.back(1); + space_diff = 0; + start_pos = 0; } - sf.clear(); + match_pos = 0; + cur_pos = start_pos; + cur_word = 0; + uppercase = false; + firstupper = false; + have_first = false; + have_second = false; current_state = initial_state; } } - // print remaining blanks - flushBlanks(output); } UString -FSTProcessor::biltransfull(UString const &input_word, bool with_delim) +FSTProcessor::biltransfull(UStringView input_word, bool with_delim) { State current_state = initial_state; UString result; @@ -2203,7 +1824,7 @@ if(input_word[start_point] == '*') { - return input_word; + return US(input_word); } if(input_word[start_point] == '=') @@ -2245,7 +1866,7 @@ } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) { current_state.step(val, u_tolower(val)); } @@ -2280,11 +1901,11 @@ // word is not present if(with_delim) { - result = "^@"_u + input_word.substr(1); + result = "^@"_u + US(input_word.substr(1)); } else { - result = "@"_u + input_word; + result = "@"_u + US(input_word); } return result; } @@ -2339,7 +1960,7 @@ UString -FSTProcessor::biltrans(UString const &input_word, bool with_delim) +FSTProcessor::biltrans(UStringView input_word, bool with_delim) { State current_state = initial_state; UString result; @@ -2356,7 +1977,7 @@ if(input_word[start_point] == '*') { - return input_word; + return US(input_word); } if(input_word[start_point] == '=') @@ -2398,7 +2019,7 @@ } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && 
u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) { current_state.step(val, u_tolower(val)); } @@ -2433,11 +2054,11 @@ // word is not present if(with_delim) { - result = "^@"_u + input_word.substr(1); + result = "^@"_u + US(input_word.substr(1)); } else { - result = "@"_u + input_word; + result = "@"_u + US(input_word); } return result; } @@ -2500,7 +2121,7 @@ } UString -FSTProcessor::compose(UString const &lexforms, UString const &queue) const +FSTProcessor::compose(UStringView lexforms, UStringView queue) const { UString result; result.reserve(lexforms.size() + 2 * queue.size()); @@ -2540,7 +2161,7 @@ outOfWord = false; skipUntil(input, output, '^'); - pair tr; // readBilingual return value, containing: + std::pair tr; // readBilingual return value, containing: int val; // the alphabet value of current symbol, and UString symbol; // the current symbol as a string bool seentags = false; // have we seen any tags at all in the analysis? 
@@ -2656,7 +2277,7 @@ } if(current_state.size() != 0) { - current_state.step_case(val, caseSensitive); + current_state.step_case(val, beCaseSensitive(current_state)); } if(current_state.isFinal(all_finals)) { @@ -2693,8 +2314,8 @@ } } -pair -FSTProcessor::biltransWithQueue(UString const &input_word, bool with_delim) +std::pair +FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim) { State current_state = initial_state; UString result; @@ -2712,7 +2333,7 @@ if(input_word[start_point] == '*') { - return pair(input_word, 0); + return {US(input_word), 0}; } if(input_word[start_point] == '=') @@ -2755,7 +2376,7 @@ } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) { current_state.step(val, u_tolower(val)); } @@ -2790,13 +2411,13 @@ // word is not present if(with_delim) { - result = "^@"_u + input_word.substr(1); + result = "^@"_u + US(input_word.substr(1)); } else { - result = "@"_u + input_word; + result = "@"_u + US(input_word); } - return pair(result, 0); + return std::pair(result, 0); } } } @@ -2809,13 +2430,13 @@ // word is not present if(with_delim) { - result = "^@"_u + input_word.substr(1); + result = "^@"_u + US(input_word.substr(1)); } else { - result = "@"_u + input_word; + result = "@"_u + US(input_word); } - return pair(result, 0); + return {result, 0}; } @@ -2849,7 +2470,7 @@ { result_with_queue += '$'; } - return pair(result_with_queue, queue.size()); + return {result_with_queue, queue.size()}; } else { @@ -2857,12 +2478,12 @@ { result += '$'; } - return pair(result, 0); + return {result, 0}; } } UString -FSTProcessor::biltransWithoutQueue(UString const &input_word, bool with_delim) +FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim) { State current_state = initial_state; UString result; @@ -2878,7 +2499,7 @@ if(input_word[start_point] == '*') { - return input_word; + return 
US(input_word); } if(input_word[start_point] == '=') @@ -2920,7 +2541,7 @@ } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) { current_state.step(val, u_tolower(val)); } @@ -2951,11 +2572,11 @@ // word is not present if(with_delim) { - result = "^@"_u + input_word.substr(1); + result = "^@"_u + US(input_word.substr(1)); } else { - result = "@"_u + input_word; + result = "@"_u + US(input_word); } return result; } @@ -2975,7 +2596,7 @@ { if(initial_state.isFinal(all_finals)) { - cerr << "Error: Invalid dictionary (hint: the left side of an entry is empty)" << endl; + std::cerr << "Error: Invalid dictionary (hint: the left side of an entry is empty)" << std::endl; return false; } else @@ -2984,7 +2605,7 @@ s.step(' '); if(s.size() != 0) { - cerr << "Error: Invalid dictionary (hint: entry beginning with whitespace)" << endl; + std::cerr << "Error: Invalid dictionary (hint: entry beginning with whitespace)" << std::endl; return false; } } @@ -3011,9 +2632,9 @@ if(val == '<') { UString str = input.readBlock('<', '>'); - if(str.substr(0, 9) == ""_u) + while(!StringUtils::endswith(str, u"]]>")) { str.append(input.readBlock('<', '>').substr(1)); } @@ -3047,7 +2668,7 @@ } void -FSTProcessor::printSAOWord(UString const &lf, UFILE *output) +FSTProcessor::printSAOWord(UStringView lf, UFILE *output) { for(unsigned int i = 1, limit = lf.size(); i != limit; i++) { @@ -3123,7 +2744,7 @@ last = input_buffer.getPos(); } - current_state.step_case(val, caseSensitive); + current_state.step_case(val, beCaseSensitive(current_state)); if(current_state.size() != 0) { @@ -3169,18 +2790,16 @@ } while((val = readSAO(input)) && isAlphabetic(val)); - unsigned int limit = firstNotAlpha(sf); - unsigned int size = sf.size(); - limit = (limit == static_cast(UString::npos)?size:limit); - input_buffer.back(1+(size-limit)); + auto limit = firstNotAlpha(sf); + unsigned int size 
= sf.size(); // TODO: change these to character counts + input_buffer.back(1+(size-limit.i_utf16)); u_fprintf(output, "%S", sf.c_str()); } else if(lf.empty()) { - unsigned int limit = firstNotAlpha(sf); - unsigned int size = sf.size(); - limit = (limit == static_cast(UString::npos)?size:limit); - input_buffer.back(1+(size-limit)); + auto limit = firstNotAlpha(sf); + unsigned int size = sf.size(); // TODO: change these to character counts + input_buffer.back(1+(size-limit.i_utf16)); u_fprintf(output, "%S", sf.c_str()); } else @@ -3202,8 +2821,8 @@ flushBlanks(output); } -UString -FSTProcessor::removeTags(UString const &str) +UStringView +FSTProcessor::removeTags(UStringView str) { for(unsigned int i = 0; i < str.size(); i++) { @@ -3218,61 +2837,61 @@ void -FSTProcessor::setBiltransSurfaceForms(bool const value) +FSTProcessor::setBiltransSurfaceForms(bool value) { biltransSurfaceForms = value; } void -FSTProcessor::setCaseSensitiveMode(bool const value) +FSTProcessor::setCaseSensitiveMode(bool value) { caseSensitive = value; } void -FSTProcessor::setDictionaryCaseMode(bool const value) +FSTProcessor::setDictionaryCaseMode(bool value) { dictionaryCase = value; } void -FSTProcessor::setNullFlush(bool const value) +FSTProcessor::setNullFlush(bool value) { nullFlush = value; } void -FSTProcessor::setIgnoredChars(bool const value) +FSTProcessor::setIgnoredChars(bool value) { useIgnoredChars = value; } void -FSTProcessor::setRestoreChars(bool const value) +FSTProcessor::setRestoreChars(bool value) { useRestoreChars = value; } void -FSTProcessor::setUseDefaultIgnoredChars(bool const value) +FSTProcessor::setUseDefaultIgnoredChars(bool value) { useDefaultIgnoredChars = value; } void -FSTProcessor::setDisplayWeightsMode(bool const value) +FSTProcessor::setDisplayWeightsMode(bool value) { displayWeightsMode = value; } void -FSTProcessor::setMaxAnalysesValue(int const value) +FSTProcessor::setMaxAnalysesValue(int value) { maxAnalyses = value; } void 
-FSTProcessor::setMaxWeightClassesValue(int const value) +FSTProcessor::setMaxWeightClassesValue(int value) { maxWeightClasses = value; } @@ -3289,18 +2908,22 @@ return nullFlush; } -size_t -FSTProcessor::firstNotAlpha(UString const &sf) +FSTProcessor::Indices +FSTProcessor::firstNotAlpha(UStringView sf) { - UCharCharacterIterator it = UCharCharacterIterator(sf.c_str(), sf.size()); - size_t i = 0; + FSTProcessor::Indices ix = { 0, 0 }; + UCharCharacterIterator it = UCharCharacterIterator(sf.data(), sf.size()); while (it.hasNext()) { UChar32 c = it.next32PostInc(); if(!isAlphabetic(c)) { - return i; + return ix; + } + ix.i_codepoint++; + ix.i_utf16++; + if(c > UINT16_MAX) { + ix.i_utf16++; } - i += c > UINT16_MAX ? 2 : 1; } - return UString::npos; + return ix; } diff -Nru lttoolbox-3.6.6/lttoolbox/fst_processor.h lttoolbox-3.7.1/lttoolbox/fst_processor.h --- lttoolbox-3.6.6/lttoolbox/fst_processor.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/fst_processor.h 2022-11-01 08:36:47.000000000 +0000 @@ -28,14 +28,13 @@ #include #include +#include #include #include #include #include #include -using namespace std; - /** * Kind of output of the generator module */ @@ -58,12 +57,7 @@ /** * Transducers in FSTP */ - map transducers; - - /** - * Current state of lexical analysis - */ - State current_state; + std::map transducers; /** * Initial state of every token @@ -78,57 +72,59 @@ /** * The final states of inconditional sections in the dictionaries */ - map inconditional; + std::map inconditional; /** * The final states of standard sections in the dictionaries */ - map standard; + std::map standard; /** * The final states of postblank sections in the dictionaries */ - map postblank; + std::map postblank; /** * The final states of preblank sections in the dictionaries */ - map preblank; + std::map preblank; /** * Merge of 'inconditional', 'standard', 'postblank' and 'preblank' sets */ - map all_finals; + std::map all_finals; /** * Queue of blanks, used in 
reading methods */ - queue blankqueue; + std::queue blankqueue; /** * Queue of wordbound blanks, used in reading methods */ - queue wblankqueue; + std::deque wblankqueue; + + std::deque> transliteration_queue; /** * Set of characters being considered alphabetics */ - set alphabetic_chars; + std::set alphabetic_chars; /** * Set of characters to escape with a backslash */ - set escaped_chars; + std::set escaped_chars; /** * Set of characters to ignore */ - set ignored_chars; + std::set ignored_chars; /** * Mapping of characters for simplistic diacritic restoration specified in RCX files */ - map > rcx_map; + std::map > rcx_map; /** * Original char being restored @@ -234,20 +230,7 @@ */ int maxAnalyses = INT_MAX; - /** - * True if a wblank block ([[..]]xyz[[/]]) was just read - */ - bool is_wblank; - - /** - * True if skip_mode is false and need to collect wblanks - */ - bool collect_wblanks; - - /** - * True if a wblank has been processed for postgen and we need an ending wblank - */ - bool need_end_wblank; + bool transliteration_drop_tilde = false; /** * Output no more than 'N' best weight classes @@ -260,14 +243,6 @@ void streamError(); /** - * Reads a wordbound blank (opening blank to closing blank) from the stream input -> [[...]]xyz[[/]] - * @param input the stream being read - * @param output the stream to write on - * @return true if the word enclosed by the wordbound blank has a ~ for postgeneration activation - */ - bool wblankPostGen(InputFile& input, UFILE *output); - - /** * Returns true if the character code is identified as alphabetic * @param c the code provided by the user * @return true if it's alphabetic @@ -296,13 +271,8 @@ */ int readDecomposition(InputFile& input, UFILE *output); - /** - * Read text from stream (postgeneration version) - * @param input the stream to read - * @param output the stream to write on - * @return the next symbol in the stream - */ - int readPostgeneration(InputFile& input, UFILE *output); + bool 
readTransliterationBlank(InputFile& input); + bool readTransliterationWord(InputFile& input); /** * Read text from stream (generation version) @@ -318,7 +288,7 @@ * @param output the stream to write on * @return the queue of 0-symbols, and the next symbol in the stream */ - pair readBilingual(InputFile& input, UFILE *output); + std::pair readBilingual(InputFile& input, UFILE *output); /** * Read text from stream (SAO version) @@ -334,26 +304,6 @@ void flushBlanks(UFILE *output); /** - * Flush all the wordbound blanks remaining in the current process - * @param output stream to write blanks - */ - void flushWblanks(UFILE *output); - - /** - * Combine wordbound blanks in the queue and return them. - * - * May pop from 'wblankqueue' and set 'need_end_wblank' to true. - * - * If 'wblankqueue' (see which) is empty, we get an empty string, - * otherwise we return a semicolon-separated combination of opening - * wblanks in the queue. If there is only a closing wblank, we just - * set need_end_wblank. - * - * @return final wblank string - */ - UString combineWblanks(); - - /** * Calculate the initial state of parsing */ void calcInitial(); @@ -367,14 +317,14 @@ * Shortcut for filtering on all final states with current settings * Assumes that casefrom is non-empty */ - UString filterFinals(const State& state, const UString& casefrom); + UString filterFinals(const State& state, UStringView casefrom); /** * Write a string to an output stream, * @param str the string to write, escaping characters * @param output the stream to write in */ - void writeEscaped(UString const &str, UFILE *output); + void writeEscaped(UStringView str, UFILE *output); /** * Write a string to an output stream. 
@@ -385,7 +335,7 @@ * @param output the stream to write in * @return how many blanks to pop and print after printing lu */ - size_t writeEscapedPopBlanks(UString const &str, UFILE *output); + size_t writeEscapedPopBlanks(UStringView str, UFILE *output); /** * Write a string to an output stream, escaping all escapable characters @@ -393,16 +343,7 @@ * @param str the string to write, escaping characters * @param output the stream to write in */ - void writeEscapedWithTags(UString const &str, UFILE *output); - - - /** - * Checks if an string ends with a particular suffix - * @param str the string to test - * @param the searched suffix - * @returns true if 'str' has the suffix 'suffix' - */ - static bool endsWith(UString const &str, UString const &suffix); + void writeEscapedWithTags(UStringView str, UFILE *output); /** * Prints a word @@ -410,7 +351,7 @@ * @param lf lexical form of the word * @param output stream where the word is written */ - void printWord(UString const &sf, UString const &lf, UFILE *output); + void printWord(UStringView sf, UStringView lf, UFILE *output); /** * Prints a word. 
@@ -420,7 +361,7 @@ * @param lf lexical form of the word * @param output stream where the word is written */ - void printWordPopBlank(UString const &sf, UString const &lf, UFILE *output); + void printWordPopBlank(UStringView sf, UStringView lf, UFILE *output); /** * Prints a word (Bilingual version) @@ -428,7 +369,7 @@ * @param lf lexical form of the word * @param output stream where the word is written */ - void printWordBilingual(UString const &sf, UString const &lf, UFILE *output); + void printWordBilingual(UStringView sf, UStringView lf, UFILE *output); /** @@ -436,21 +377,21 @@ * @param lf lexical form * @param output stream where the word is written */ - void printSAOWord(UString const &lf, UFILE *output); + void printSAOWord(UStringView lf, UFILE *output); /** * Prints an unknown word * @param sf surface form of the word * @param output stream where the word is written */ - void printUnknownWord(UString const &sf, UFILE *output); + void printUnknownWord(UStringView sf, UFILE *output); void initDecompositionSymbols(); - vector numbers; + std::vector numbers; int readTMAnalysis(InputFile& input); - unsigned int lastBlank(UString const &str); + unsigned int lastBlank(UStringView str); /** * Print one blankqueue item if there is one, or a given "space" value. 
@@ -458,28 +399,37 @@ * @param val the space character to use if no blank queue * @param output stream where the word is written */ - void printSpace(UChar32 const val, UFILE *output); + void printSpace(UChar32 val, UFILE *output); /** * Print one possibly escaped character * if it's a space and the blank queue is non-empty, * pop the first blank and print that instead */ - void printChar(const UChar32 val, UFILE* output); + void printChar(UChar32 val, UFILE* output); - void skipUntil(InputFile& input, UFILE *output, UChar32 const character); - static UString removeTags(UString const &str); + void skipUntil(InputFile& input, UFILE *output, UChar32 character); + static UStringView removeTags(UStringView str); UString compoundAnalysis(UString str); - size_t firstNotAlpha(UString const &sf); + + struct Indices { + size_t i_codepoint; + size_t i_utf16; // always >= i_codepoint since some codepoints take up 2 UTF-16's + }; + + /* + * Iterates through unicode characters, returns a Unicode character + * index and UTF-16 string index of first non-alphabetic character, + * or size of the string (in characters, string size) + * + * @return index of first non-alpha char, or string size, as a tuple of number of characters and index in string + */ + Indices firstNotAlpha(UStringView sf); void analysis_wrapper_null_flush(InputFile& input, UFILE *output); void bilingual_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown); void generation_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode); - void postgeneration_wrapper_null_flush(InputFile& input, UFILE *output); - void intergeneration_wrapper_null_flush(InputFile& input, UFILE *output); - void transliteration_wrapper_null_flush(InputFile& input, UFILE *output); - - UString compose(UString const &lexforms, UString const &queue) const; + UString compose(UStringView lexforms, UStringView queue) const; void procNodeICX(); void procNodeRCX(); @@ -488,21 +438,32 @@ bool 
isLastBlankTM = false; xmlTextReaderPtr reader; + + static constexpr size_t max_case_insensitive_state_size = 65536; + /* + * Including lowercased versions for every character can potentially create very large states + * (See https://github.com/apertium/lttoolbox/issues/167 ). As a sanity-check we don't do + * case-insensitive matching if the state size exceeds max_case_insensitive_state_size. + * + * @return running with --case-sensitive or state size exceeds max + */ + bool beCaseSensitive(const State& state) { + return caseSensitive || state.size() >= max_case_insensitive_state_size; + } + public: /* * String constants */ - static UString const XML_TEXT_NODE; - static UString const XML_COMMENT_NODE; - static UString const XML_IGNORED_CHARS_ELEM; - static UString const XML_RESTORE_CHAR_ELEM; - static UString const XML_RESTORE_CHARS_ELEM; - static UString const XML_VALUE_ATTR; - static UString const XML_CHAR_ELEM; - static UString const WBLANK_START; - static UString const WBLANK_END; - static UString const WBLANK_FINAL; + static constexpr UStringView XML_TEXT_NODE = u"#text"; + static constexpr UStringView XML_COMMENT_NODE = u"#comment"; + static constexpr UStringView XML_IGNORED_CHARS_ELEM = u"ignored-chars"; + static constexpr UStringView XML_RESTORE_CHAR_ELEM = u"restore-char"; + static constexpr UStringView XML_RESTORE_CHARS_ELEM = u"restore-chars"; + static constexpr UStringView XML_VALUE_ATTR = u"value"; + static constexpr UStringView XML_CHAR_ELEM = u"char"; + static constexpr UStringView WBLANK_FINAL = u"[[/]]"; FSTProcessor(); @@ -510,7 +471,8 @@ void initTMAnalysis(); void initSAO(){initAnalysis();}; void initGeneration(); - void initPostgeneration(); + void initPostgeneration(){initTransliteration();}; + void initTransliteration(); void initBiltrans(); void initDecomposition(); @@ -520,29 +482,29 @@ void postgeneration(InputFile& input, UFILE *output); void intergeneration(InputFile& input, UFILE *output); void transliteration(InputFile& input, UFILE 
*output); - UString biltrans(UString const &input_word, bool with_delim = true); - UString biltransfull(UString const &input_word, bool with_delim = true); + UString biltrans(UStringView input_word, bool with_delim = true); + UString biltransfull(UStringView input_word, bool with_delim = true); void bilingual(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown); - pair biltransWithQueue(UString const &input_word, bool with_delim = true); - UString biltransWithoutQueue(UString const &input_word, bool with_delim = true); + std::pair biltransWithQueue(UStringView input_word, bool with_delim = true); + UString biltransWithoutQueue(UStringView input_word, bool with_delim = true); void SAO(InputFile& input, UFILE *output); - void parseICX(string const &file); - void parseRCX(string const &file); + void parseICX(std::string const &file); + void parseRCX(std::string const &file); void load(FILE *input); bool valid() const; - void setCaseSensitiveMode(bool const value); - void setDictionaryCaseMode(bool const value); - void setBiltransSurfaceForms(bool const value); - void setIgnoredChars(bool const value); - void setRestoreChars(bool const value); - void setNullFlush(bool const value); - void setUseDefaultIgnoredChars(bool const value); - void setDisplayWeightsMode(bool const value); - void setMaxAnalysesValue(int const value); - void setMaxWeightClassesValue(int const value); + void setCaseSensitiveMode(bool value); + void setDictionaryCaseMode(bool value); + void setBiltransSurfaceForms(bool value); + void setIgnoredChars(bool value); + void setRestoreChars(bool value); + void setNullFlush(bool value); + void setUseDefaultIgnoredChars(bool value); + void setDisplayWeightsMode(bool value); + void setMaxAnalysesValue(int value); + void setMaxWeightClassesValue(int value); bool getNullFlush(); bool getDecompoundingMode(); }; diff -Nru lttoolbox-3.6.6/lttoolbox/lsx-comp.1 lttoolbox-3.7.1/lttoolbox/lsx-comp.1 --- lttoolbox-3.6.6/lttoolbox/lsx-comp.1 1970-01-01 
00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lsx-comp.1 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,101 @@ +.Dd March 8, 2006 +.Dt LT-COMP 1 +.Os Apertium +.Sh NAME +.Nm lt-comp +.Nd augmented letter transducer compiler for Apertium +.Sh SYNOPSIS +.Nm lt-comp +.Op Fl a | v | l | r | m | h +.Cm lr | rl +.Ar dictionary_file +.Ar output_file +.Op Ar acx_file +.Sh DESCRIPTION +.Nm lt-comp +is the application responsible for compiling dictionaries used by +.Xr lt-proc 1 +in Apertium into a compact and efficient representation +(a class of finite-state transducers called augmented letter transducers). +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl a , Fl Fl alt +Sets the value of the +.Sy alt +attribute to use in compilation. +.Pp +Note that if no value is set, all entries containing an \fIalt\fR +attribute are omitted. +.It Fl v , Fl Fl var +Sets the value of the +.Sy v +attribute to use in compilation. +This should only be used with monodixes; for bidixes, see +.Fl l +and +.Fl r . +.Pp +Note that if no value is set, all entries containing a +.Sy v +attribute are considered to be +.Em left-to-right . +.It Fl l , Fl Fl var-left +Sets the value of the +.Sy vl +attribute for use in compilation of bidixes. +.Dq Left +here refers to the side of the dictionary, so this option is only valid in +.Cm rl +mode. +.It Fl r , Fl Fl var-right +Sets the value of the +.Sy vr +attribute for use in compilation of bidixes. +.Dq Right +here refers to the side of the dictionary, so this option is only valid in +.Cm lr +mode. +.It Fl m , Fl Fl keep-boundaries +Keep any morpheme boundaries defined by the '' symbol +.It Fl H , Fl Fl hfst +expect HFST symbols +.It Fl S , Fl Fl no-split +don't attempt to split into word and punctuation transducers +.It Fl j , Fl Fl jobs +Parallelise minimisation by using one cpu core per section. By +default, this also creates a new section after 50.000 entries. 
You can +override this number by setting the environment variable +LT_MAX_SECTION_ENTRIES to some number. If set to 0, sections are never +split (but kept exactly as in the dix file). You can also set the +environment variable LT_JOBS=true if you always want parallel +minimisation even if lt-comp was called without this option. +.It Fl h , Fl Fl help +Prints a short help message. +.It Cm lr +The resulting transducer will process dictionary entries +.Em left-to-right . +.It Cm rl +The resulting transducer will process dictionary entries +.Em right-to-left . +.El +.Sh FILES +.Bl -tag -width Ds +.It Ar dictionary_file +The input dictionary. +.It Ar output_file +The compiled dictionary (a finite state transducer). +.It Ar acx_file +Optional XML file of equivalent characters in monodices. +.El +.Sh SEE ALSO +.Xr apertium 1 , +.Xr apertium-tagger 1 , +.Xr lt-expand 1 , +.Xr lt-proc 1 +.Sh COPYRIGHT +Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. +You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . +.Sh BUGS +Many... lurking in the dark and waiting for you! diff -Nru lttoolbox-3.6.6/lttoolbox/lt_append.cc lttoolbox-3.7.1/lttoolbox/lt_append.cc --- lttoolbox-3.6.6/lttoolbox/lt_append.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_append.cc 2022-11-01 08:36:47.000000000 +0000 @@ -15,121 +15,33 @@ * along with this program; if not, see . 
*/ #include -#include #include - -#include +#include #include - -#include #include -#include -#include -#include -#include - -#ifdef _MSC_VER -#include -#include -#endif - -using namespace std; - -void endProgram(char *name) -{ - if(name != NULL) - { - cout << basename(name) << " v" << PACKAGE_VERSION <<": add sections to a compiled transducer" << endl; - cout << "USAGE: " << basename(name) << " [-ksh] bin_file1 bin_file2 output_file" << endl; - cout << " -k, --keep: in case of section name conflicts, keep the one from the first transducer" << endl; - cout << " -s, --single: treat input transducers as one-sided" << endl; - cout << " -h, --help: print this message and exit" << endl; - } - exit(EXIT_FAILURE); -} - int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); - - bool pairs = true; - bool keep = false; - -#ifdef _MSC_VER - _setmode(_fileno(output), _O_U8TEXT); -#endif - -#if HAVE_GETOPT_LONG - int option_index=0; -#endif - - while (true) { -#if HAVE_GETOPT_LONG - static struct option long_options[] = - { - {"keep", no_argument, 0, 'k'}, - {"single", no_argument, 0, 's'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; - - int cnt=getopt_long(argc, argv, "ksh", long_options, &option_index); -#else - int cnt=getopt(argc, argv, "ksh"); -#endif - if (cnt==-1) - break; - - switch (cnt) - { - case 'k': - keep = true; - break; - - case 's': - pairs = false; - break; - - case 'h': - default: - endProgram(argv[0]); - break; - } - } - - string infile1; - string infile2; - string outfile; - switch(argc - optind) - { - case 1: - infile1 = argv[argc-1]; - break; - - case 2: - infile1 = argv[argc-2]; - infile2 = argv[argc-1]; - break; - - case 3: - infile1 = argv[argc-3]; - infile2 = argv[argc-2]; - outfile = argv[argc-1]; - break; - - default: - endProgram(argv[0]); - break; - } - - FILE* input1 = openInBinFile(infile1); - FILE* input2 = openInBinFile(infile2); - FILE* output = openOutBinFile(outfile); + CLI cli("add sections to a compiled transducer", 
PACKAGE_VERSION); + cli.add_bool_arg('k', "keep", "in case of section name conflicts, keep the one from the first transducer"); + cli.add_bool_arg('s', "single", "treat input transducers as one-sided"); + cli.add_bool_arg('h', "help", "print this message and exit"); + cli.add_file_arg("bin_file1", false); + cli.add_file_arg("bin_file2"); + cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); + + bool pairs = !cli.get_bools()["single"]; + bool keep = cli.get_bools()["keep"]; + + FILE* input1 = openInBinFile(cli.get_files()[0]); + FILE* input2 = openInBinFile(cli.get_files()[1]); + FILE* output = openOutBinFile(cli.get_files()[2]); Alphabet alpha1, alpha2; - set chars1, chars2; - map trans1, trans2; + std::set chars1, chars2; + std::map trans1, trans2; readTransducerSet(input1, chars1, alpha1, trans1); readTransducerSet(input2, chars2, alpha2, trans2); @@ -144,7 +56,7 @@ if (keep) { continue; } else { - cerr << "WARNING: section '" << it.first << "' appears in both transducers and will be overwritten!" << endl; + std::cerr << "WARNING: section '" << it.first << "' appears in both transducers and will be overwritten!" << std::endl; } } it.second.updateAlphabet(alpha2, alpha1, pairs); diff -Nru lttoolbox-3.6.6/lttoolbox/lt_apply_acx.cc lttoolbox-3.7.1/lttoolbox/lt_apply_acx.cc --- lttoolbox-3.6.6/lttoolbox/lt_apply_acx.cc 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_apply_acx.cc 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + LtLocale::tryToSetLocale(); + CLI cli("apply an ACX file to a compiled transducer", PACKAGE_VERSION); + cli.add_file_arg("input_file", false); + cli.add_file_arg("acx_file"); + cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); + + FILE* input = openInBinFile(cli.get_files()[0]); + auto acx = readACX(cli.get_files()[1].c_str()); + FILE* output = openOutBinFile(cli.get_files()[2]); + + Alphabet alpha; + std::set letters; + std::map trans; + readTransducerSet(input, letters, alpha, trans); + + for (auto& it : trans) { + it.second.applyACX(alpha, acx); + } + + writeTransducerSet(output, letters, alpha, trans); + + fclose(input); + fclose(output); + return 0; +} diff -Nru lttoolbox-3.6.6/lttoolbox/lt_comp.cc lttoolbox-3.7.1/lttoolbox/lt_comp.cc --- lttoolbox-3.6.6/lttoolbox/lt_comp.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_comp.cc 2022-11-01 08:36:47.000000000 +0000 @@ -17,14 +17,10 @@ #include #include #include +#include +#include -#include #include -#include -#include -#include - -using namespace std; /* * Error function that does nothing so that when we fallback from @@ -36,160 +32,73 @@ return; } -void endProgram(char *name) -{ - if(name != NULL) - { - cout << basename(name) << " v" << PACKAGE_VERSION <<": build a letter transducer from a dictionary" << endl; - cout << "USAGE: " << basename(name) << " [-hmvalrHSj] lr | rl dictionary_file output_file [acx_file]" << endl; -#if HAVE_GETOPT_LONG - cout << " -m, --keep-boundaries: keep morpheme boundaries" << endl; - cout << " -v, --var: set language variant" << endl; - cout << " -a, --alt: set alternative (monodix)" << endl; - cout << " -l, --var-left: set left language variant (bidix)" << endl; - cout << " -r, 
--var-right: set right language variant (bidix)" << endl; - cout << " -H, --hfst: expect HFST symbols" << endl; - cout << " -S, --no-split: don't attempt to split into word and punctuation transducers" << endl; - cout << " -j, --jobs: use one cpu core per section when minimising, new section after 50k entries" << endl; -#else - cout << " -m: keep morpheme boundaries" << endl; - cout << " -v: set language variant" << endl; - cout << " -a: set alternative (monodix)" << endl; - cout << " -l: set left language variant (bidix)" << endl; - cout << " -r: set right language variant (bidix)" << endl; - cout << " -H: expect HFST symbols" << endl; - cout << " -S: don't attempt to split into word and punctuation transducers" << endl; - cout << " -j: use one cpu core per section when minimising, new section after 50k entries" << endl; -#endif - cout << "Modes:" << endl; - cout << " lr: left-to-right compilation" << endl; - cout << " rl: right-to-left compilation" << endl; - } - exit(EXIT_FAILURE); -} - - int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); + CLI cli("build a letter transducer from a dictionary", PACKAGE_VERSION); + cli.add_bool_arg('d', "debug", "insert line numbers before each entry"); + cli.add_bool_arg('m', "keep-boundaries", "keep morpheme boundaries"); + cli.add_str_arg('v', "var", "set language variant", "VAR"); + cli.add_str_arg('a', "alt", "set alternative (monodix)", "ALT"); + cli.add_str_arg('l', "var-left", "set left language variant (bidix)", "VAR"); + cli.add_str_arg('r', "var-right", "set right language variant (bidix)", "VAR"); + cli.add_bool_arg('H', "hfst", "expect HFST symbols"); + cli.add_bool_arg('S', "no-split", "don't attempt to split into word and punctuation sections"); + cli.add_bool_arg('j', "jobs", "use one cpu core per section when minimising, new section after 50k entries"); + cli.add_bool_arg('V', "verbose", "compile verbosely"); + cli.add_bool_arg('h', "help", "print this message and exit"); + cli.add_file_arg("lr | rl | 
u", false); + cli.add_file_arg("dictionary_file", false); + cli.add_file_arg("output_file", false); + cli.add_file_arg("acx_file", true); + cli.parse_args(argc, argv); char ttype = 'x'; Compiler c; AttCompiler a; - c.setKeepBoundaries(false); - c.setVerbose(false); - -#if HAVE_GETOPT_LONG - int option_index=0; -#endif - - string vl; - string vr; - - while (true) { -#if HAVE_GETOPT_LONG - static struct option long_options[] = - { - {"alt", required_argument, 0, 'a'}, - {"var", required_argument, 0, 'v'}, - {"var-left", required_argument, 0, 'l'}, - {"var-right", required_argument, 0, 'r'}, - {"keep-boundaries", no_argument, 0, 'm'}, - {"hfst", no_argument, 0, 'H'}, - {"no-split", no_argument, 0, 'S'}, - {"help", no_argument, 0, 'h'}, - {"verbose", no_argument, 0, 'V'}, - {"jobs", no_argument, 0, 'j'}, - {0, 0, 0, 0} - }; - - int cnt=getopt_long(argc, argv, "a:v:l:r:mHShVj", long_options, &option_index); -#else - int cnt=getopt(argc, argv, "a:v:l:r:mHShV"); -#endif - if (cnt==-1) - break; - - switch (cnt) - { - case 'a': - c.setAltValue(to_ustring(optarg)); - break; - - case 'v': - c.setVariantValue(to_ustring(optarg)); - break; - - case 'l': - vl = optarg; - c.setVariantLeftValue(to_ustring(optarg)); - break; - - case 'r': - vr = optarg; - c.setVariantRightValue(to_ustring(optarg)); - break; - - case 'm': - c.setKeepBoundaries(true); - break; - - case 'H': - a.setHfstSymbols(true); - break; - - case 'S': - a.setSplitting(false); - break; - case 'j': - c.setJobs(true); - c.setMaxSectionEntries(50000); - break; + bool have_vl = false; + bool have_vr = false; + auto args = cli.get_strs(); + if (args.find("var") != args.end()) { + c.setVariantValue(to_ustring(args["var"][0].c_str())); + } + if (args.find("alt") != args.end()) { + c.setAltValue(to_ustring(args["alt"][0].c_str())); + } + if (args.find("var-left") != args.end()) { + have_vl = true; + c.setVariantLeftValue(to_ustring(args["var-left"][0].c_str())); + } + if (args.find("var-right") != args.end()) { + have_vr 
= true; + c.setVariantRightValue(to_ustring(args["var-right"][0].c_str())); + } - case 'V': - c.setVerbose(true); - break; + c.setEntryDebugging(cli.get_bools()["debug"]); + c.setKeepBoundaries(cli.get_bools()["keep-boundaries"]); + c.setVerbose(cli.get_bools()["verbose"]); - case 'h': - default: - endProgram(argv[0]); - break; - } - } + a.setHfstSymbols(cli.get_bools()["hfst"]); + a.setSplitting(!cli.get_bools()["no-split"]); - if(std::getenv("LT_JOBS")) { + auto LT_JOBS = std::getenv("LT_JOBS"); + if(cli.get_bools()["jobs"] || (LT_JOBS != NULL && LT_JOBS[0] != 'n')) { c.setJobs(true); c.setMaxSectionEntries(50000); } + else { + c.setJobs(false); + c.setMaxSectionEntries(0); + } if(const char* max_section_entries = std::getenv("LT_MAX_SECTION_ENTRIES")) { - c.setMaxSectionEntries(stol(max_section_entries)); + c.setMaxSectionEntries(std::stol(max_section_entries)); } - string opc; - string infile; - string outfile; - string acxfile; - - switch(argc - optind + 1) - { - case 5: - opc = argv[argc-4]; - infile = argv[argc-3]; - outfile = argv[argc-2]; - acxfile = argv[argc-1]; - break; - - case 4: - opc = argv[argc-3]; - infile = argv[argc-2]; - outfile = argv[argc-1]; - break; - - default: - endProgram(argv[0]); - break; - } + std::string opc = cli.get_files()[0]; + std::string infile = cli.get_files()[1]; + std::string outfile = cli.get_files()[2]; + std::string acxfile = cli.get_files()[3]; xmlTextReaderPtr reader; reader = xmlReaderForFile(infile.c_str(), NULL, 0); @@ -207,7 +116,7 @@ } else { - cerr << "Error: Cannot not open file '" << infile << "'." << endl << endl; + std::cerr << "Error: Cannot not open file '" << infile << "'." 
<< std::endl << std::endl; exit(EXIT_FAILURE); } initGenericErrorDefaultFunc(NULL); @@ -215,10 +124,9 @@ if(opc == "lr") { - if(vr == "" && vl != "") - { - cout << "Error: -l specified, but mode is lr" << endl; - endProgram(argv[0]); + if (have_vl) { + std::cout << "Error: -l specified, but mode is lr" << std::endl; + cli.print_usage(); } if(ttype == 'a') { @@ -226,7 +134,7 @@ } else { - if(acxfile != "") + if(!acxfile.empty()) { c.parseACX(acxfile, Compiler::COMPILER_RESTRICTION_LR_VAL); } @@ -235,10 +143,9 @@ } else if(opc == "rl") { - if(vl == "" && vr != "") - { - cout << "Error: -r specified, but mode is rl" << endl; - endProgram(argv[0]); + if (have_vr) { + std::cout << "Error: -r specified, but mode is rl" << std::endl; + cli.print_usage(); } if(ttype == 'a') { @@ -249,17 +156,19 @@ c.parse(infile, Compiler::COMPILER_RESTRICTION_RL_VAL); } } + else if (opc == "u") { + if (ttype == 'a') { + a.parse(infile, false); + } else { + c.parse(infile, Compiler::COMPILER_RESTRICTION_U_VAL); + } + } else { - endProgram(argv[0]); + cli.print_usage(); } - FILE *output = fopen(outfile.c_str(), "wb"); - if(!output) - { - cerr << "Error: Cannot open file '" << outfile << "'." << endl; - exit(EXIT_FAILURE); - } + FILE* output = openOutBinFile(outfile); if(ttype == 'a') { a.write(output); diff -Nru lttoolbox-3.6.6/lttoolbox/lt-compose.1 lttoolbox-3.7.1/lttoolbox/lt-compose.1 --- lttoolbox-3.6.6/lttoolbox/lt-compose.1 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt-compose.1 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,60 @@ +.Dd September 25, 2022 +.Dt LT-COMPOSE 1 +.Os Apertium +.Sh NAME +.Nm lt-compose +.Nd compiled dictionary composition for Apertium +.Sh SYNOPSIS +.Nm lt-compose +.Ar transducer1_binary +.Ar transducer2_binary +.Ar composed_binary +.Sh DESCRIPTION +.Nm lt-compose +is the application responsible for composing two compiled +dictionaries, matching the output-side of transducer1 with the +input-side of transducer2. 
By default, matches are anchored to +initial/final states, so the transducer2 has to match full paths (in +regex terms, transducer2 is implicitly surrounded by ^ and $). But +there is also support for letting transducer2 match sub-paths of +transducer1 (in which matches become optional, making the composition +a superset of transducer1). Matching sub-paths means that transducer2 +can start matching in the midst of paths of transducer1 (in regex +terms, transducer2 is implicitly surrounded by .* on both sides). +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl i , Fl Fl inverted +Apply transducer2 to the input-side (left) of transducer1 instead of +the output-side. You would do this when altering the forms of an +analyser. +.It Fl a , Fl Fl anywhere +Allow transducer2 to match sub-paths instead of requiring matching +initial/final states. Matches then become optional. +.It Fl j , Fl Fl jobs +Parallelise composition by using one cpu core per section of +transducer1. You can also set the environment variable LT_JOBS=true if +you always want parallelisation where available in lttoolbox. +.Sh FILES +.Bl -tag -width Ds +.It Ar transducer1_binary +a finite state transducer +.It Ar transducer2_binary +a finite state transducer +.It Ar composed_binary +a finite state transducer +.El +.Sh SEE ALSO +.Xr apertium 1 , +.Xr apertium-tagger 1 , +.Xr lt-comp 1 , +.Xr lt-expand 1 , +.Xr lt-print 1 , +.Xr lt-trim 1 , +.Xr lt-proc 1 +.Sh AUTHOR +Copyright \(co 2005-2022 Universitat d'Alacant / Universidad de Alicante. +This is free software. +You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . +.Sh BUGS +Many... lurking in the dark and waiting for you! 
diff -Nru lttoolbox-3.6.6/lttoolbox/lt_compose.cc lttoolbox-3.7.1/lttoolbox/lt_compose.cc --- lttoolbox-3.6.6/lttoolbox/lt_compose.cc 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_compose.cc 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include +#include +#include +#include +#include +#include + +void +compose(FILE* file_f, FILE* file_g, FILE* file_gf, bool f_inverted, bool g_anywhere, bool jobs) +{ + Alphabet alph_f; + std::set letters_f; + std::map trans_f; + readTransducerSet(file_f, letters_f, alph_f, trans_f); + Alphabet alph_g; + std::set letters_g; + std::map trans_g; + readTransducerSet(file_g, letters_g, alph_g, trans_g); + + std::map trans_gf; + + Transducer union_g; + for (auto& it : trans_g) { + if (union_g.isEmpty()) { + union_g = it.second; + } else { + union_g.unionWith(alph_g, it.second); + } + } + union_g.minimize(); + + std::vector>> compositions; + for (auto& it : trans_f) { + if (it.second.numberOfTransitions() == 0) { + std::cerr << "Warning: section " << it.first << " is empty! Skipping it..." 
<< std::endl; + continue; + } + if(jobs) { + compositions.push_back(std::async( + [](Transducer &f, Transducer &g, Alphabet &alph_f, Alphabet &alph_g, + bool f_inverted, bool g_anywhere, UString name) { + Transducer gf = f.compose(g, alph_f, alph_g, f_inverted, g_anywhere); + if (gf.hasNoFinals()) { + std::cerr << "Warning: section " << name + << " had no final state after composing! Skipping it..." + << std::endl; + ; + } else { + gf.minimize(); + } + return std::make_pair(name, gf); + }, + std::ref(it.second), std::ref(union_g), std::ref(alph_f), + std::ref(alph_g), f_inverted, g_anywhere, it.first)); + } else { + Transducer gf = it.second.compose(union_g, alph_f, alph_g, f_inverted, g_anywhere); + if (gf.hasNoFinals()) { + std::cerr << "Warning: section " << it.first + << " had no final state after composing! Skipping it..." + << std::endl; + continue; + } + gf.minimize(); + trans_gf[it.first] = gf; + } + } + for (auto &thr : compositions) { + auto it = thr.get(); + if (!it.second.hasNoFinals()) { + trans_gf[it.first] = it.second; + } + } + + if (trans_gf.empty()) { + std::cerr << "Error: Composition gave empty transducer!" 
<< std::endl; + exit(EXIT_FAILURE); + } + + writeTransducerSet(file_gf, letters_f, alph_f, trans_gf); +} + + +int main(int argc, char *argv[]) +{ + LtLocale::tryToSetLocale(); + CLI cli("compose transducer1 with transducer2", PACKAGE_VERSION); + cli.add_bool_arg('i', "inverted", "run composition right-to-left on transducer1"); + cli.add_bool_arg('a', "anywhere", "don't require anchored matches, let transducer2 optionally compose at any sub-path"); + cli.add_file_arg("transducer1_bin_file", false); + cli.add_file_arg("transducer2_bin_file"); + cli.add_file_arg("trimmed_bin_file"); + cli.parse_args(argc, argv); + + FILE* transducer1 = openInBinFile(cli.get_files()[0]); + FILE* transducer2 = openInBinFile(cli.get_files()[1]); + FILE* composition = openOutBinFile(cli.get_files()[2]); + + bool jobs = false; + auto LT_JOBS = std::getenv("LT_JOBS"); + if(cli.get_bools()["jobs"] || (LT_JOBS != NULL && LT_JOBS[0] != 'n')) { + jobs = true; + } + compose(transducer1, transducer2, composition, + cli.get_bools()["inverted"], + cli.get_bools()["anywhere"], + jobs); + + fclose(transducer1); + fclose(transducer2); + fclose(composition); + + return 0; +} diff -Nru lttoolbox-3.6.6/lttoolbox/lt_expand.cc lttoolbox-3.7.1/lttoolbox/lt_expand.cc --- lttoolbox-3.6.6/lttoolbox/lt_expand.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_expand.cc 2022-11-01 08:36:47.000000000 +0000 @@ -18,133 +18,40 @@ #include #include #include - -#include -#include -#include -#include -#include - -#ifdef _MSC_VER -#include -#include -#endif - -using namespace std; - -void endProgram(char *name) -{ - if(name != NULL) - { - cout << basename(name) << " v" << PACKAGE_VERSION <<": expand the contents of a dictionary file" << endl; - cout << "USAGE: " << basename(name) << " [-mavlrh] dictionary_file [output_file]" << endl; -#if HAVE_GETOPT_LONG - cout << " -m, --keep-boundaries: keep morpheme boundaries" << endl; - cout << " -v, --var: set language variant" << endl; - cout << " -a, --alt: 
set alternative (monodix)" << endl; - cout << " -l, --var-left: set left language variant (bidix)" << endl; - cout << " -r, --var-right: set right language variant (bidix)" << endl; -#else - cout << " -m: keep morpheme boundaries" << endl; - cout << " -v: set language variant" << endl; - cout << " -a: set alternative (monodix)" << endl; - cout << " -l: set left language variant (bidix)" << endl; - cout << " -r: set right language variant (bidix)" << endl; -#endif - } - exit(EXIT_FAILURE); -} +#include int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); + CLI cli("expand the contents of a dictionary file", PACKAGE_VERSION); + cli.add_bool_arg('m', "keep-boundaries", "keep morpheme boundaries"); + cli.add_str_arg('v', "var", "set language variant", "VAR"); + cli.add_str_arg('a', "alt", "set alternative (monodix)", "ALT"); + cli.add_str_arg('l', "var-left", "set left language variant (bidix)", "VAR"); + cli.add_str_arg('r', "var-right", "set right language variant (bidix)", "VAR"); + cli.add_file_arg("dictionary_file", false); + cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); - FILE* input = NULL; - UFILE* output = NULL; Expander e; - e.setKeepBoundaries(false); - -#if HAVE_GETOPT_LONG - int option_index=0; -#endif - - while (true) { -#if HAVE_GETOPT_LONG - static struct option long_options[] = - { - {"keep-boundaries", no_argument, 0, 'm'}, - {"alt", required_argument, 0, 'a'}, - {"var", required_argument, 0, 'v'}, - {"var-left", required_argument, 0, 'l'}, - {"var-right", required_argument, 0, 'r'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; - - int cnt=getopt_long(argc, argv, "a:v:l:r:mh", long_options, &option_index); -#else - int cnt=getopt(argc, argv, "a:v:l:r:mh"); -#endif - if (cnt==-1) - break; - - switch (cnt) - { - case 'a': - e.setAltValue(to_ustring(optarg)); - break; - - case 'v': - e.setVariantValue(to_ustring(optarg)); - break; - - case 'l': - e.setVariantLeftValue(to_ustring(optarg)); - break; - - case 'm': - 
e.setKeepBoundaries(true); - break; - - case 'r': - e.setVariantRightValue(to_ustring(optarg)); - break; - - case 'h': - default: - endProgram(argv[0]); - break; - } + e.setKeepBoundaries(cli.get_bools()["keep-boundaries"]); + auto args = cli.get_strs(); + if (args.find("var") != args.end()) { + e.setVariantValue(to_ustring(args["var"][0].c_str())); } - - string infile; - string outfile; - - switch(argc - optind + 1) - { - case 2: - infile = argv[argc-1]; - break; - - case 3: - infile = argv[argc-2]; - outfile = argv[argc-1]; - break; - - default: - endProgram(argv[0]); - break; + if (args.find("alt") != args.end()) { + e.setAltValue(to_ustring(args["alt"][0].c_str())); + } + if (args.find("var-left") != args.end()) { + e.setVariantLeftValue(to_ustring(args["var-left"][0].c_str())); + } + if (args.find("var-right") != args.end()) { + e.setVariantRightValue(to_ustring(args["var-right"][0].c_str())); } -#ifdef _MSC_VER - _setmode(_fileno(output), _O_U8TEXT); -#endif - - input = openInBinFile(infile); - fclose(input); - output = openOutTextFile(outfile); + UFILE* output = openOutTextFile(cli.get_files()[1]); - e.expand(infile, output); + e.expand(cli.get_files()[0], output); u_fclose(output); return EXIT_SUCCESS; diff -Nru lttoolbox-3.6.6/lttoolbox/lt_invert.cc lttoolbox-3.7.1/lttoolbox/lt_invert.cc --- lttoolbox-3.6.6/lttoolbox/lt_invert.cc 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_invert.cc 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + LtLocale::tryToSetLocale(); + + CLI cli("reverse the direction of a compiled transducer", PACKAGE_VERSION); + cli.add_bool_arg('h', "help", "print this message and exit"); + cli.add_file_arg("in_bin"); + cli.add_file_arg("out_bin"); + cli.parse_args(argc, argv); + + FILE* input = openInBinFile(cli.get_files()[0]); + FILE* output = openOutBinFile(cli.get_files()[1]); + + Alphabet alphabet; + std::set alphabetic_chars; + std::map transducers; + readTransducerSet(input, alphabetic_chars, alphabet, transducers); + + for (auto& it : transducers) { + it.second.invert(alphabet); + } + + writeTransducerSet(output, alphabetic_chars, alphabet, transducers); + + fclose(input); + fclose(output); + return EXIT_SUCCESS; +} diff -Nru lttoolbox-3.6.6/lttoolbox/lt_locale.cc lttoolbox-3.7.1/lttoolbox/lt_locale.cc --- lttoolbox-3.6.6/lttoolbox/lt_locale.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_locale.cc 2022-11-01 08:36:47.000000000 +0000 @@ -16,6 +16,7 @@ */ #include #include +#include #include #include @@ -23,14 +24,12 @@ #include #endif -using namespace std; - void LtLocale::tryToSetLocale() { try { - locale::global(locale(locale::classic(), "", locale::ctype)); + std::locale::global(std::locale(std::locale::classic(), "", std::locale::ctype)); } catch (...) 
{ // Nothing @@ -38,6 +37,7 @@ UErrorCode status = U_ZERO_ERROR; uloc_setDefault("en_US_POSIX", &status); + ucnv_setDefaultName("UTF-8"); #if !defined(__CYGWIN__) && !defined (__MINGW32__) if(setlocale(LC_CTYPE, "") != NULL) @@ -45,7 +45,7 @@ return; } - cerr << "Warning: unsupported locale, fallback to \"C\"" << endl; + std::cerr << "Warning: unsupported locale, fallback to \"C\"" << std::endl; setlocale(LC_ALL, "C"); #endif diff -Nru lttoolbox-3.6.6/lttoolbox/lt_locale.h lttoolbox-3.7.1/lttoolbox/lt_locale.h --- lttoolbox-3.6.6/lttoolbox/lt_locale.h 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_locale.h 2022-11-01 08:36:47.000000000 +0000 @@ -20,8 +20,6 @@ #include -using namespace std; - class LtLocale { public: diff -Nru lttoolbox-3.6.6/lttoolbox/lt-paradigm.1 lttoolbox-3.7.1/lttoolbox/lt-paradigm.1 --- lttoolbox-3.6.6/lttoolbox/lt-paradigm.1 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt-paradigm.1 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,47 @@ +.Dd June 30, 2022 +.Dt LT-PARADIGM 1 +.Os Apertium +.Sh NAME +.Nm lt-paradigm +.Nd generate listings from a compiled transducer +.Sh SYNOPSIS +.Nm lt-paradigm +.Op Fl a | s | z | h +.Op Fl e Ar TAG +.Ar fst_file +.Op Ar input_file Op Ar output_file +.Sh DESCRIPTION +.Nm lt-paradigm +prints paths matching input patterns from a transducer +.Bl -tag -width Ds +.It Ar fst_file +The compiled transducer +.It Ar input_file +A list of patterns to be extracted, separated by newlines or nulls +.It Ar output_file +All paths matching the patterns in input_file. Each path is terminated by a newline and groups are separated by the separator used in the input. +.El +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl a Fl Fl analyser +Match patterns on the right side of the transducer rather than the left. +.It Fl e Ar TAG Fl Fl exclude Ar TAG +When expanding <*>, do not use +.Ar TAG +.It Fl s Fl Fl sort +Sort the output for each pattern. 
+.It Fl z Fl Fl null-flush +No-op, included for compatibility. +.It Fl h Fl Fl help +Prints a short help message. +.El +.Sh SEE ALSO +.Xr lt-expand 1 , +.Xr hfst-expand 1 , +.Sh COPYRIGHT +Copyright \(co 2022 Apertium +This is free software. +You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . +.Sh BUGS +Many... lurking in the dark and waiting for you! diff -Nru lttoolbox-3.6.6/lttoolbox/lt_paradigm.cc lttoolbox-3.7.1/lttoolbox/lt_paradigm.cc --- lttoolbox-3.6.6/lttoolbox/lt_paradigm.cc 1970-01-01 00:00:00.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_paradigm.cc 2022-11-01 08:36:47.000000000 +0000 @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include +#include +#include +#include +#include +#include + +#include + +void expand(Transducer& inter, int state, const std::set& past_states, + const std::vector& syms, const Alphabet& alpha, UFILE* out, + std::set>& outset) +{ + if (inter.isFinal(state)) { + UString l, r; + for (auto& it : syms) { + auto pr = alpha.decode(it); + alpha.getSymbol(l, pr.first); + alpha.getSymbol(r, pr.second); + } + if (!l.empty() && !r.empty()) { + if (out != nullptr) { + u_fprintf(out, "%S:%S\n", r.c_str(), l.c_str()); + } else { + outset.insert({r, l}); + } + } + } + std::set new_states = past_states; + new_states.insert(state); + for (auto& it : inter.getTransitions()[state]) { + if (past_states.find(it.second.first) != past_states.end()) { + continue; + } + std::vector new_syms = syms; + new_syms.push_back(it.first); + expand(inter, it.second.first, new_states, new_syms, alpha, out, outset); + } +} + +void process(UStringView pattern, std::map& trans, + Alphabet& alpha, + const std::set& letters, const std::set& tags, + UFILE* output, bool sort) +{ + int32_t any_char = static_cast('*'); + int32_t any_tag = alpha(u"<*>"); + std::vector pat = alpha.tokenize(pattern); + Transducer other; + int state = other.getInitial(); + for (auto& it : pat) { + if (it == any_char) { + state = other.insertNewSingleTransduction(0, state); + for (auto& sym : letters) { + other.linkStates(state, state, alpha(sym, sym)); + } + } else if (it == any_tag) { + state = other.insertNewSingleTransduction(0, state); + for (auto& sym : tags) { + other.linkStates(state, state, alpha(sym, sym)); + } + } else { + state = other.insertNewSingleTransduction(alpha(it, it), state); + } + } + other.setFinal(state); + std::set> outset; + for (auto& it : trans) { + Transducer inter = it.second.trim(other, alpha, alpha); + if (!inter.getFinals().empty()) { + std::set states; + std::vector syms; + expand(inter, inter.getInitial(), states, syms, alpha, + (sort ? 
nullptr : output), outset); + } + } + if (sort) { + for (auto& it : outset) { + u_fprintf(output, "%S:%S\n", it.first.c_str(), it.second.c_str()); + } + } +} + +int main(int argc, char* argv[]) +{ + LtLocale::tryToSetLocale(); + CLI cli("generate listings from a compiled transducer", PACKAGE_VERSION); + cli.add_bool_arg('a', "analyser", "FST is an analyser (tags on the right)"); + cli.add_str_arg('e', "exclude", "disregard paths containing TAG", "TAG"); + cli.add_bool_arg('s', "sort", "alphabetize the paths for each pattern"); + cli.add_bool_arg('z', "null-flush", "flush output on \\0"); + cli.add_bool_arg('h', "help", "show this help and exit"); + cli.add_file_arg("FST", false); + cli.add_file_arg("input"); + cli.add_file_arg("output"); + cli.parse_args(argc, argv); + + bool should_invert = !cli.get_bools()["analyser"]; + bool sort = cli.get_bools()["sort"]; + std::set skip_tags; + for (auto& it : cli.get_strs()["exclude"]) { + skip_tags.insert(to_ustring(it.c_str())); + } + + FILE* fst = openInBinFile(cli.get_files()[0]); + + std::set letters; + Alphabet alpha; + std::map trans; + readTransducerSet(fst, letters, alpha, trans); + fclose(fst); + + alpha.includeSymbol(u"<*>"); + std::set tags; + for (int32_t i = 1; i <= alpha.size(); i++) { + if (!skip_tags.empty()) { + UString t; + alpha.getSymbol(t, -i); + if (skip_tags.find(t) != skip_tags.end()) continue; + } + tags.insert(-i); + } + + if (should_invert) { + for (auto& it : trans) { + it.second.invert(alpha); + } + } + + InputFile input; + if (!cli.get_files()[1].empty()) { + input.open_or_exit(cli.get_files()[1].c_str()); + } + UFILE* output = openOutTextFile(cli.get_files()[2]); + + UString cur; + do { + UChar32 c = input.get(); + if (c == '\n' || c == '\0' || c == U_EOF) { + process(cur, trans, alpha, letters, tags, output, sort); + if (c != U_EOF) { + u_fputc(c, output); + u_fflush(output); + } + cur.clear(); + } else { + cur += c; + } + } while (!input.eof()); + + u_fclose(output); + return 0; +} diff -Nru 
lttoolbox-3.6.6/lttoolbox/lt_print.cc lttoolbox-3.7.1/lttoolbox/lt_print.cc --- lttoolbox-3.6.6/lttoolbox/lt_print.cc 2022-05-30 19:01:36.000000000 +0000 +++ lttoolbox-3.7.1/lttoolbox/lt_print.cc 2022-11-01 08:36:47.000000000 +0000 @@ -15,113 +15,30 @@ * along with this program; if not, see . */ #include -#include #include - -#include +#include #include -#include -#include -#include -#include -#include