diff -Nru libvoikko-4.0.1/configure libvoikko-4.0.2/configure --- libvoikko-4.0.1/configure 2016-01-31 17:30:44.000000000 +0000 +++ libvoikko-4.0.2/configure 2016-02-19 13:54:23.000000000 +0000 @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for libvoikko 4.0.1. +# Generated by GNU Autoconf 2.69 for libvoikko 4.0.2. # # Report bugs to . # @@ -590,8 +590,8 @@ # Identity of this package. PACKAGE_NAME='libvoikko' PACKAGE_TARNAME='libvoikko' -PACKAGE_VERSION='4.0.1' -PACKAGE_STRING='libvoikko 4.0.1' +PACKAGE_VERSION='4.0.2' +PACKAGE_STRING='libvoikko 4.0.2' PACKAGE_BUGREPORT='hatapitk@iki.fi' PACKAGE_URL='' @@ -1400,7 +1400,7 @@ # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures libvoikko 4.0.1 to adapt to many kinds of systems. +\`configure' configures libvoikko 4.0.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1471,7 +1471,7 @@ if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of libvoikko 4.0.1:";; + short | recursive ) echo "Configuration of libvoikko 4.0.2:";; esac cat <<\_ACEOF @@ -1617,7 +1617,7 @@ test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -libvoikko configure 4.0.1 +libvoikko configure 4.0.2 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2270,7 +2270,7 @@ This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by libvoikko $as_me 4.0.1, which was +It was created by libvoikko $as_me 4.0.2, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -3138,7 +3138,7 @@ # Define the identity of the package. PACKAGE='libvoikko' - VERSION='4.0.1' + VERSION='4.0.2' cat >>confdefs.h <<_ACEOF @@ -18888,7 +18888,7 @@ # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by libvoikko $as_me 4.0.1, which was +This file was extended by libvoikko $as_me 4.0.2, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -18954,7 +18954,7 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -libvoikko config.status 4.0.1 +libvoikko config.status 4.0.2 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff -Nru libvoikko-4.0.1/configure.ac libvoikko-4.0.2/configure.ac --- libvoikko-4.0.1/configure.ac 2016-01-31 17:29:42.000000000 +0000 +++ libvoikko-4.0.2/configure.ac 2016-02-19 13:54:23.000000000 +0000 @@ -28,7 +28,7 @@ dnl General options for autoconf AC_PREREQ(2.60) -AC_INIT([libvoikko],[4.0.1],[hatapitk@iki.fi]) +AC_INIT([libvoikko],[4.0.2],[hatapitk@iki.fi]) LT_PREREQ([2.2.6]) diff -Nru libvoikko-4.0.1/debian/changelog libvoikko-4.0.2/debian/changelog --- libvoikko-4.0.1/debian/changelog 2016-02-17 08:48:23.000000000 +0000 +++ libvoikko-4.0.2/debian/changelog 2016-08-11 14:22:30.000000000 +0000 @@ -1,10 +1,38 @@ -libvoikko (4.0.1-3ubuntu1) xenial; urgency=medium +libvoikko (4.0.2-2ubuntu1) yakkety; urgency=medium - * Merge from Debian unstable. (LP: #1546404) Remaining changes: - - Add Conflicts/Replaces/Provides for g++5 ABI library rename; can be - dropped after xenial release. + * Fork packaging for Ubuntu to keep HFST disabled for now. + * Since it's past 16.04 LTS, the Ubuntu's former C/R/P for g++5 ABI + library renaming can be however dropped. + * Remaining changes in Ubuntu: + - Don't build depend on hfst-ospell-dev + - Modify description to state that only VFST is enabled by default. + - Configure with --disable-hfst - -- Timo Jyrinki Wed, 17 Feb 2016 09:01:57 +0200 + -- Timo Jyrinki Thu, 11 Aug 2016 16:52:59 +0300 + +libvoikko (4.0.2-2) unstable; urgency=medium + + * Release to unstable. + + -- Timo Jyrinki Thu, 03 Mar 2016 17:31:28 +0200 + +libvoikko (4.0.2-1) experimental; urgency=medium + + [ Timo Jyrinki ] + * Run wrap-and-sort -a -t + * New upstream version. + * Enable HFST support and add /usr/share/voikko to dictionary paths. + - Build depend on hfst-ospell-dev + * Replace Conflicts with Breaks for the python package transition. + * Use https vcs urls. + * Remove use of hardening-wrapper but add hardening=+all to keep status quo. + * Update package description for the backend changes. + + [ Tino Didriksen ] + * Clean up debian/rules and debian/control. + * Remove .la files. + + -- Timo Jyrinki Thu, 25 Feb 2016 18:23:16 +0200 libvoikko (4.0.1-3) unstable; urgency=medium @@ -14,14 +42,6 @@ -- Timo Jyrinki Mon, 15 Feb 2016 10:19:37 +0200 -libvoikko (4.0.1-2ubuntu1) xenial; urgency=low - - * Merge from Debian unstable. Remaining changes: - - Add Conflicts/Replaces/Provides for g++5 ABI library rename; can be - dropped after xenial release. - - -- Steve Langasek Thu, 11 Feb 2016 09:06:14 -0800 - libvoikko (4.0.1-2) unstable; urgency=medium * Fix python install location. @@ -38,20 +58,6 @@ -- Timo Jyrinki Tue, 09 Feb 2016 09:10:36 +0200 -libvoikko (3.8-1ubuntu2) xenial; urgency=medium - - * Fix wrong stray dependency on libvoikko1v5 from libvoikko-dev. - - -- Steve Langasek Fri, 30 Oct 2015 11:05:49 -0700 - -libvoikko (3.8-1ubuntu1) xenial; urgency=low - - * Sync from Debian unstable. - * Add Conflicts/Replaces/Provides for g++5 ABI library rename; this - library's ABI is C, not C++, per bug #791181. - - -- Steve Langasek Thu, 29 Oct 2015 21:47:28 -0700 - libvoikko (3.8-1) unstable; urgency=medium * Imported Upstream version 3.8 @@ -61,12 +67,6 @@ -- Timo Jyrinki Mon, 28 Sep 2015 17:33:36 +0300 -libvoikko (3.7.1-1ubuntu1) wily; urgency=medium - - * Rename library packages for g++5 ABI transition. - - -- Steve Langasek Tue, 04 Aug 2015 08:09:58 +0000 - libvoikko (3.7.1-1) unstable; urgency=medium * Imported Upstream version 3.7.1 diff -Nru libvoikko-4.0.1/debian/control libvoikko-4.0.2/debian/control --- libvoikko-4.0.1/debian/control 2016-02-17 08:48:23.000000000 +0000 +++ libvoikko-4.0.2/debian/control 2016-08-11 14:04:42.000000000 +0000 @@ -2,48 +2,55 @@ Priority: optional Maintainer: Ubuntu Developers XSBC-Original-Maintainer: Timo Jyrinki -Build-Depends: debhelper (>= 9.0.0), dpkg-dev (>= 1.13.19), autotools-dev, python3, hardening-wrapper, dh-autoreconf, pkg-config, dh-python -Standards-Version: 3.9.6 +Build-Depends: autotools-dev, + debhelper (>= 9.0.0), + dh-autoreconf, + dh-python, + pkg-config, + python3, +Standards-Version: 3.9.7 Section: libs Homepage: http://voikko.puimula.org/ -Vcs-Git: git://anonscm.debian.org/collab-maint/libvoikko.git -Vcs-Browser: http://anonscm.debian.org/gitweb/?p=collab-maint/libvoikko.git;a=summary +Vcs-Git: https://anonscm.debian.org/git/collab-maint/libvoikko.git +Vcs-Browser: https://anonscm.debian.org/gitweb/?p=collab-maint/libvoikko.git;a=summary Package: libvoikko-dev Section: libdevel Architecture: any -Multi-Arch: foreign -Depends: ${shlibs:Depends}, libvoikko1 (= ${binary:Version}), ${misc:Depends} +Multi-Arch: foreign +Depends: libvoikko1 (= ${binary:Version}), + ${misc:Depends}, + ${shlibs:Depends}, Description: Development files for libvoikko - Libvoikko is a library of free natural language processing tools. It - aims to provide support for languages that are not well served by + Libvoikko is a library of free natural language processing tools. It + aims to provide support for languages that are not well served by other existing free linguistic tools. . - This package contains the files needed to build or develop applications + This package contains the files needed to build or develop applications that use Voikko. Package: libvoikko1 Architecture: any Multi-Arch: same -Depends: ${shlibs:Depends}, ${misc:Depends} -Suggests: voikko-fi -Breaks: voikko-fi (<< 2.0-1) -Conflicts: libvoikko1v5 -Replaces: libvoikko1v5 -Provides: libvoikko1v5 +Depends: ${misc:Depends}, + ${shlibs:Depends}, +Suggests: voikko-fi, +Breaks: voikko-fi (<< 2.0-1), Description: Library of free natural language processing tools - Libvoikko is a library of free natural language processing tools. It - aims to provide support for languages that are not well served by + Libvoikko is a library of free natural language processing tools. It + aims to provide support for languages that are not well served by other existing free linguistic tools. . - The library supports multiple backends, currently of which only Malaga - is enabled in this packaging: + The library supports multiple backends, of which VFST is + enabled in the Ubuntu's default build: . + - VFST: Finite state transducer format used for Finnish morphology + and as an experimental language independent backend. + - HFST (Helsinki Finite-State Transducer Technology): Supports ZHFST + speller archives for various languages. - Malaga: Left associative grammar for describing the morphology of Finnish language. - - HFST (Helsinki Finite-State Transducer Technology): Supports ZHFST - speller archives for various languages. - - Experimental backends: Lttoolbox, VFST and vislcg3. + - Experimental backends: Weighted VFST and Lttoolbox. . Libvoikko provides spell checking, hyphenation, grammar checking and morphological analysis for Finnish language. Spell checking is @@ -55,10 +62,11 @@ Priority: extra Section: oldlibs Architecture: all -Depends: python3-libvoikko, ${misc:Depends} +Depends: python3-libvoikko, + ${misc:Depends}, Description: transitional dummy package for Python bindings for libvoikko - Libvoikko is a library of free natural language processing tools. It - aims to provide support for languages that are not well served by + Libvoikko is a library of free natural language processing tools. It + aims to provide support for languages that are not well served by other existing free linguistic tools. . This package is a transitional dummy package which can be safely removed. @@ -66,14 +74,16 @@ Package: python3-libvoikko Section: python Architecture: all -Depends: python3, libvoikko1 (>= ${source:Version}), ${python3:Depends}, ${misc:Depends} -Conflicts: python-libvoikko (<< 4.0.1-3) -Replaces: python-libvoikko -Provides: python-libvoikko +Depends: libvoikko1 (>= ${source:Version}), + python3, + ${misc:Depends}, + ${python3:Depends}, +Breaks: python-libvoikko (<< 4.0.1-3), +Replaces: python-libvoikko, +Provides: python-libvoikko, Description: Python bindings for libvoikko - Libvoikko is a library of free natural language processing tools. It - aims to provide support for languages that are not well served by + Libvoikko is a library of free natural language processing tools. It + aims to provide support for languages that are not well served by other existing free linguistic tools. . This package contains the Python bindings. - diff -Nru libvoikko-4.0.1/debian/libvoikko1.docs libvoikko-4.0.2/debian/libvoikko1.docs --- libvoikko-4.0.1/debian/libvoikko1.docs 2016-02-10 23:46:58.000000000 +0000 +++ libvoikko-4.0.2/debian/libvoikko1.docs 2016-08-11 14:00:42.000000000 +0000 @@ -1,2 +1 @@ README - diff -Nru libvoikko-4.0.1/debian/libvoikko-dev.install libvoikko-4.0.2/debian/libvoikko-dev.install --- libvoikko-4.0.1/debian/libvoikko-dev.install 2016-02-10 23:46:58.000000000 +0000 +++ libvoikko-4.0.2/debian/libvoikko-dev.install 2016-08-11 14:00:42.000000000 +0000 @@ -3,5 +3,4 @@ usr/lib/*/lib*.a usr/lib/*/lib*.so usr/lib/*/pkgconfig/* -usr/lib/*/*.la usr/share/man/man1/* diff -Nru libvoikko-4.0.1/debian/manpages libvoikko-4.0.2/debian/manpages --- libvoikko-4.0.1/debian/manpages 2016-02-10 23:46:58.000000000 +0000 +++ libvoikko-4.0.2/debian/manpages 2016-08-11 14:00:42.000000000 +0000 @@ -1,3 +1,3 @@ -src/tools/voikkospell.1 -src/tools/voikkohyphenate.1 src/tools/voikkogc.1 +src/tools/voikkohyphenate.1 +src/tools/voikkospell.1 diff -Nru libvoikko-4.0.1/debian/rules libvoikko-4.0.2/debian/rules --- libvoikko-4.0.1/debian/rules 2016-02-17 08:48:23.000000000 +0000 +++ libvoikko-4.0.2/debian/rules 2016-08-11 14:12:12.000000000 +0000 @@ -1,24 +1,18 @@ #!/usr/bin/make -f -# Uncomment this to turn on verbose mode. -#export DH_VERBOSE=1 - -DEB_HOST_GNU_TYPE ?= $(shell dpkg-architecture -qDEB_HOST_GNU_TYPE) -DEB_BUILD_GNU_TYPE ?= $(shell dpkg-architecture -qDEB_BUILD_GNU_TYPE) -DEB_HOST_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) - -# http://wiki.debian.org/Hardening -export DEB_BUILD_HARDENING=1 +export DEB_BUILD_MAINT_OPTIONS = hardening=+all %: dh $@ --fail-missing --with python3,autoreconf override_dh_auto_configure: - dh_auto_configure -- --host=$(DEB_HOST_GNU_TYPE) --build=$(DEB_BUILD_GNU_TYPE) --prefix=/usr --mandir=\$${prefix}/share/man --infodir=\$${prefix}/share/info --with-dictionary-path=/usr/lib/voikko --libdir=/usr/lib/$(DEB_HOST_MULTIARCH) --disable-hfst + dh_auto_configure -- --with-dictionary-path=/usr/lib/voikko:/usr/share/voikko --disable-hfst override_dh_auto_install: dh_auto_install install -m 644 -D python/libvoikko.py debian/python3-libvoikko/usr/lib/python3/dist-packages/libvoikko.py + # Remove libtool-like files + find . -name '*.la' -exec rm -f {} ';' override_dh_auto_test: diff -Nru libvoikko-4.0.1/python/libvoikko.py libvoikko-4.0.2/python/libvoikko.py --- libvoikko-4.0.1/python/libvoikko.py 2016-02-03 18:21:13.000000000 +0000 +++ libvoikko-4.0.2/python/libvoikko.py 2016-02-19 13:54:23.000000000 +0000 @@ -51,16 +51,20 @@ # Python 3 without modifications. from __future__ import unicode_literals +from ctypes import addressof from ctypes import byref from ctypes import CDLL from ctypes import c_int from ctypes import c_char from ctypes import c_char_p +from ctypes import c_wchar from ctypes import c_wchar_p from ctypes import c_size_t from ctypes import c_void_p +from ctypes import create_unicode_buffer from ctypes import pointer from ctypes import POINTER +from ctypes import sizeof from ctypes import string_at from ctypes import Structure import os @@ -609,13 +613,15 @@ def __splitTokens(self, text): uniText = unicode_str(text) + uniTextPtr = create_unicode_buffer(uniText) + wcharSize = sizeof(c_wchar) result = [] textLen = len(uniText) tokenLen = c_size_t() position = 0 while textLen > 0: tokenType = self.__lib.voikkoNextTokenUcs4(self.__handle, - uniText[position:], textLen, byref(tokenLen)) + c_wchar_p(addressof(uniTextPtr) + position * wcharSize), textLen, byref(tokenLen)) if tokenType == Token.NONE: break tokenText = uniText[position:position+tokenLen.value] diff -Nru libvoikko-4.0.1/src/character/charset.cpp libvoikko-4.0.2/src/character/charset.cpp --- libvoikko-4.0.1/src/character/charset.cpp 2015-06-08 15:57:36.000000000 +0000 +++ libvoikko-4.0.2/src/character/charset.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -40,6 +40,20 @@ namespace libvoikko { char_type get_char_type(wchar_t c) { + if ((c >= 0x41 && c <= 0x5A) || /* A-Z */ + (c >= 0x61 && c <= 0x7A) || /* a-z */ + (c >= 0xC1 && c <= 0xD6) || /* À-Ö */ + (c >= 0xD8 && c <= 0xF6) || /* Ø-ö */ + (c >= 0x00F8 && c <= 0x02AF) || /* ø-ɏ */ + (c >= 0x0400 && c <= 0x0481) || /* Ѐ-ҁ - Cyrillic */ + (c >= 0x048A && c <= 0x0527) || /* Ҋ-ԧ - Cyrillic + Cyrillic extended */ + (c >= 0x1400 && c <= 0x15C3) || /* ᐀-ᗃ - Canadian syllabics */ + (c >= 0xFB00 && c <= 0xFB04)) { + return CHAR_LETTER; + } + if (SimpleChar::isWhitespace(c)) { + return CHAR_WHITESPACE; + } if (wcschr(L".,;-!?:'()[]{}/&" L"\u00AD" /* SOFT HYPHEN */ L"\u2019" /* RIGHT SINGLE QUOTATION MARK */ @@ -53,20 +67,6 @@ if (isFinnishQuotationMark(c)) { return CHAR_PUNCTUATION; } - if (SimpleChar::isWhitespace(c)) { - return CHAR_WHITESPACE; - } - if ((c >= 0x41 && c <= 0x5A) || /* A-Z */ - (c >= 0x61 && c <= 0x7A) || /* a-z */ - (c >= 0xC1 && c <= 0xD6) || /* À-Ö */ - (c >= 0xD8 && c <= 0xF6) || /* Ø-ö */ - (c >= 0x00F8 && c <= 0x02AF) || /* ø-ɏ */ - (c >= 0x0400 && c <= 0x0481) || /* Ѐ-ҁ - Cyrillic */ - (c >= 0x048A && c <= 0x0527) || /* Ҋ-ԧ - Cyrillic + Cyrillic extended */ - (c >= 0x1400 && c <= 0x15C3) || /* ᐀-ᗃ - Canadian syllabics */ - (c >= 0xFB00 && c <= 0xFB04)) { - return CHAR_LETTER; - } if (wcschr(L"0123456789", c)) { return CHAR_DIGIT; } diff -Nru libvoikko-4.0.1/src/fst/UnweightedTransducer.cpp libvoikko-4.0.2/src/fst/UnweightedTransducer.cpp --- libvoikko-4.0.1/src/fst/UnweightedTransducer.cpp 2016-01-30 15:44:50.000000000 +0000 +++ libvoikko-4.0.2/src/fst/UnweightedTransducer.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -31,6 +31,7 @@ #include "fst/Configuration.hpp" #include "setup/DictionaryException.hpp" #include "utf8/utf8.hpp" +#include "utils/StringUtils.hpp" #include #include @@ -146,22 +147,33 @@ std::map values; values[""] = FlagValueNeutral; values["@"] = FlagValueAny; - symbolToDiacritic.push_back(OpFeatureValue()); // epsilon DEBUG("Reading " << symbolCount << " symbols to symbol table"); for (uint16_t i = 0; i < symbolCount; i++) { - string symbol(filePtr); - if (firstNormalChar == 0 && i > 0 && symbol[0] != '@') { - firstNormalChar = i; + wchar_t * ucs4Symbol = utils::StringUtils::ucs4FromUtf8(filePtr); + symbolToString.push_back(ucs4Symbol); + if (i == 0) { + symbolToDiacritic.push_back(OpFeatureValue()); // epsilon + symbolStringLength.push_back(0); + filePtr += 1; } - if (firstNormalChar != 0 && firstMultiChar == 0 && symbol[0] == '[') { - firstMultiChar = i; - } - symbolToString.push_back(filePtr); - symbolStringLength.push_back(strlen(filePtr)); - filePtr += (symbol.length() + 1); - stringToSymbol.insert(pair(symbol, i)); - if (firstNormalChar == 0 && i > 0) { - symbolToDiacritic.push_back(getDiacriticOperation(symbol, features, values)); + else { + string symbol(filePtr); + if (firstNormalChar == 0) { + if (symbol[0] == '@') { + symbolToDiacritic.push_back(getDiacriticOperation(symbol, features, values)); + } + else { + firstNormalChar = i; + } + } + else if (firstMultiChar == 0 && symbol[0] == '[') { + firstMultiChar = i; + } + symbolStringLength.push_back(wcslen(ucs4Symbol)); + if (firstNormalChar > 0 && firstMultiChar == 0) { + stringToSymbol.insert(pair(ucs4Symbol[0], i)); + } + filePtr += (symbol.length() + 1); } } unknownSymbolOrdinal = symbolCount; @@ -176,19 +188,22 @@ transitionStart = reinterpret_cast(filePtr); } - bool UnweightedTransducer::prepare(Configuration * configuration, const char * input, size_t inputLen) const { + UnweightedTransducer::~UnweightedTransducer() { + for (wchar_t * s : symbolToString) { + delete[] s; + } + } + + bool UnweightedTransducer::prepare(Configuration * configuration, const wchar_t * input, size_t inputLen) const { configuration->stackDepth = 0; configuration->flagDepth = 0; configuration->inputDepth = 0; configuration->stateIndexStack[0] = 0; configuration->currentTransitionStack[0] = 0; configuration->inputLength = 0; - const char * ip = input; bool allKnown = true; - while (ip < input + inputLen) { - const char * prevIp = ip; - utf8::unchecked::next(ip); - std::map::const_iterator it = stringToSymbol.find(string(prevIp, ip - prevIp)); + while ((size_t)configuration->inputLength < inputLen) { + std::map::const_iterator it = stringToSymbol.find(input[configuration->inputLength]); if (it == stringToSymbol.end()) { configuration->inputSymbolStack[configuration->inputLength] = unknownSymbolOrdinal; allKnown = false; @@ -268,11 +283,11 @@ return true; } - bool UnweightedTransducer::next(Configuration * configuration, char * outputBuffer, size_t bufferLen) const { + bool UnweightedTransducer::next(Configuration * configuration, wchar_t * outputBuffer, size_t bufferLen) const { return nextPrefix(configuration, outputBuffer, bufferLen, 0); } - bool UnweightedTransducer::nextPrefix(Configuration * configuration, char * outputBuffer, size_t bufferLen, size_t * prefixLength) const { + bool UnweightedTransducer::nextPrefix(Configuration * configuration, wchar_t * outputBuffer, size_t bufferLen, size_t * prefixLength) const { uint32_t loopCounter = 0; while (loopCounter < MAX_LOOP_COUNT) { Transition * stateHead = transitionStart + configuration->stateIndexStack[configuration->stackDepth]; @@ -289,7 +304,7 @@ if (currentTransition->symIn == 0xFFFF) { // final state if (configuration->inputDepth == configuration->inputLength || prefixLength) { - char * outputBufferPos = outputBuffer; + wchar_t * outputBufferPos = outputBuffer; for (int i = 0; i < configuration->stackDepth; i++) { uint16_t outSymIndex = configuration->outputSymbolStack[i]; size_t symLen = symbolStringLength[outSymIndex]; @@ -297,10 +312,10 @@ // would overflow the output buffer return false; } - strncpy(outputBufferPos, symbolToString[outSymIndex], symLen); + wcsncpy(outputBufferPos, symbolToString[outSymIndex], symLen); outputBufferPos += symLen; } - *outputBufferPos = '\0'; + *outputBufferPos = L'\0'; configuration->currentTransitionStack[configuration->stackDepth] = currentTransition - transitionStart + 1; if (prefixLength) { *prefixLength = configuration->inputDepth; diff -Nru libvoikko-4.0.1/src/fst/UnweightedTransducer.hpp libvoikko-4.0.2/src/fst/UnweightedTransducer.hpp --- libvoikko-4.0.1/src/fst/UnweightedTransducer.hpp 2016-01-26 20:25:16.000000000 +0000 +++ libvoikko-4.0.2/src/fst/UnweightedTransducer.hpp 2016-02-19 13:54:23.000000000 +0000 @@ -40,20 +40,21 @@ static_assert(sizeof(Transition) == 8, "Size of unweighted transition must be 8 bytes"); private: Transition * transitionStart; - std::map stringToSymbol; - std::vector symbolToString; + std::map stringToSymbol; + std::vector symbolToString; std::vector symbolStringLength; uint16_t firstMultiChar; uint16_t unknownSymbolOrdinal; void byteSwapTransducer(void *& mapPtr, size_t fileLength); public: UnweightedTransducer(const char * filePath); + ~UnweightedTransducer(); - bool prepare(Configuration * configuration, const char * input, size_t inputLen) const; + bool prepare(Configuration * configuration, const wchar_t * input, size_t inputLen) const; - bool next(Configuration * configuration, char * outputBuffer, size_t bufferLen) const; + bool next(Configuration * configuration, wchar_t * outputBuffer, size_t bufferLen) const; - bool nextPrefix(Configuration * configuration, char * outputBuffer, size_t bufferLen, size_t * prefixLength) const; + bool nextPrefix(Configuration * configuration, wchar_t * outputBuffer, size_t bufferLen, size_t * prefixLength) const; }; } } diff -Nru libvoikko-4.0.1/src/fst/WeightedConfiguration.hpp libvoikko-4.0.2/src/fst/WeightedConfiguration.hpp --- libvoikko-4.0.1/src/fst/WeightedConfiguration.hpp 2015-07-18 12:55:23.000000000 +0000 +++ libvoikko-4.0.2/src/fst/WeightedConfiguration.hpp 2016-02-19 13:54:23.000000000 +0000 @@ -38,6 +38,7 @@ struct WeightedConfiguration { const int bufferSize; int stackDepth; + int flagDepth; int inputDepth; uint32_t * stateIndexStack; uint32_t * currentTransitionStack; diff -Nru libvoikko-4.0.1/src/fst/WeightedTransducer.cpp libvoikko-4.0.2/src/fst/WeightedTransducer.cpp --- libvoikko-4.0.1/src/fst/WeightedTransducer.cpp 2015-11-01 12:12:53.000000000 +0000 +++ libvoikko-4.0.2/src/fst/WeightedTransducer.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -31,6 +31,7 @@ #include "fst/Configuration.hpp" #include "setup/DictionaryException.hpp" #include "utf8/utf8.hpp" +#include "utils/StringUtils.hpp" #include #include @@ -152,21 +153,33 @@ std::map values; values[""] = FlagValueNeutral; values["@"] = FlagValueAny; - symbolToDiacritic.push_back(OpFeatureValue()); // epsilon DEBUG("Reading " << symbolCount << " symbols to symbol table"); for (uint16_t i = 0; i < symbolCount; i++) { - string symbol(filePtr); - if (firstNormalChar == 0 && i > 0 && symbol[0] != '@') { - firstNormalChar = i; + wchar_t * ucs4Symbol = utils::StringUtils::ucs4FromUtf8(filePtr); + symbolToString.push_back(ucs4Symbol); + if (i == 0) { + symbolToDiacritic.push_back(OpFeatureValue()); // epsilon + symbolStringLength.push_back(0); + filePtr += 1; } - if (firstNormalChar != 0 && firstMultiChar == 0 && symbol[0] == '[') { - firstMultiChar = i; - } - symbolToString.push_back(filePtr); - filePtr += (symbol.length() + 1); - stringToSymbol.insert(pair(symbol, i)); - if (firstNormalChar == 0 && i > 0) { - symbolToDiacritic.push_back(getDiacriticOperation(symbol, features, values)); + else { + string symbol(filePtr); + if (firstNormalChar == 0) { + if (symbol[0] == '@') { + symbolToDiacritic.push_back(getDiacriticOperation(symbol, features, values)); + } + else { + firstNormalChar = i; + } + } + else if (firstMultiChar == 0 && symbol[0] == '[') { + firstMultiChar = i; + } + symbolStringLength.push_back(wcslen(ucs4Symbol)); + if (firstNormalChar > 0 && firstMultiChar == 0) { + stringToSymbol.insert(pair(ucs4Symbol[0], i)); + } + filePtr += (symbol.length() + 1); } } flagDiacriticFeatureCount = features.size(); @@ -180,17 +193,21 @@ transitionStart = reinterpret_cast(filePtr); } - bool WeightedTransducer::prepare(WeightedConfiguration * configuration, const char * input, size_t inputLen) const { + WeightedTransducer::~WeightedTransducer() { + for (wchar_t * s : symbolToString) { + delete[] s; + } + } + + bool WeightedTransducer::prepare(WeightedConfiguration * configuration, const wchar_t * input, size_t inputLen) const { configuration->stackDepth = 0; + configuration->flagDepth = 0; configuration->inputDepth = 0; configuration->stateIndexStack[0] = 0; configuration->currentTransitionStack[0] = 0; configuration->inputLength = 0; - const char * ip = input; - while (ip < input + inputLen) { - const char * prevIp = ip; - utf8::unchecked::next(ip); - std::map::const_iterator it = stringToSymbol.find(string(prevIp, ip - prevIp)); + while ((size_t)configuration->inputLength < inputLen) { + std::map::const_iterator it = stringToSymbol.find(input[configuration->inputLength]); if (it == stringToSymbol.end()) { // Unknown symbol return false; @@ -212,76 +229,73 @@ static bool flagDiacriticCheck(WeightedConfiguration * configuration, const Transducer * transducer, uint16_t symbol) { uint16_t flagDiacriticFeatureCount = transducer->flagDiacriticFeatureCount; - if (!flagDiacriticFeatureCount) { + if (!flagDiacriticFeatureCount || symbol == 0) { return true; } - int stackDepth = configuration->stackDepth; size_t diacriticCell = flagDiacriticFeatureCount * sizeof(uint32_t); uint32_t * flagValueStack = configuration->flagValueStack; - uint32_t * currentFlagArray = flagValueStack + stackDepth * flagDiacriticFeatureCount; + uint32_t * currentFlagArray = flagValueStack + configuration->flagDepth * flagDiacriticFeatureCount; bool update = false; - OpFeatureValue ofv; - if (symbol != 0 && symbol < transducer->firstNormalChar) { - ofv = transducer->symbolToDiacritic[symbol]; - uint32_t currentValue = currentFlagArray[ofv.feature]; - DEBUG("checking op " << ofv.op << " " << ofv.feature << " " << ofv.value << " current value " << currentValue) - switch (ofv.op) { - case Operation_P: - update = true; - break; - case Operation_C: - ofv.value = FlagValueNeutral; - update = true; - break; - case Operation_U: - if (currentValue) { - if (currentValue != ofv.value) { - return false; - } - } - else { - update = true; - } - break; - case Operation_R: - if (ofv.value == FlagValueAny && currentValue == FlagValueNeutral) { - return false; - } - if (ofv.value != FlagValueAny && currentValue != ofv.value) { - return false; - } - break; - case Operation_D: - if ((ofv.value == FlagValueAny && currentValue != FlagValueNeutral) || currentValue == ofv.value) { + OpFeatureValue ofv = transducer->symbolToDiacritic[symbol]; + uint32_t currentValue = currentFlagArray[ofv.feature]; + DEBUG("checking op " << ofv.op << " " << ofv.feature << " " << ofv.value << " current value " << currentValue) + switch (ofv.op) { + case Operation_P: + update = true; + break; + case Operation_C: + ofv.value = FlagValueNeutral; + update = true; + break; + case Operation_U: + if (currentValue) { + if (currentValue != ofv.value) { return false; } - break; - default: - return false;// this would be an error - } - DEBUG("allowed") + } + else { + update = true; + } + break; + case Operation_R: + if (ofv.value == FlagValueAny && currentValue == FlagValueNeutral) { + return false; + } + if (ofv.value != FlagValueAny && currentValue != ofv.value) { + return false; + } + break; + case Operation_D: + if ((ofv.value == FlagValueAny && currentValue != FlagValueNeutral) || currentValue == ofv.value) { + return false; + } + break; + default: + return false;// this would be an error } + DEBUG("allowed") memcpy(currentFlagArray + flagDiacriticFeatureCount, currentFlagArray, diacriticCell); if (update) { DEBUG("updating feature " << ofv.feature << " to " << ofv.value) (currentFlagArray + flagDiacriticFeatureCount)[ofv.feature] = ofv.value; } + configuration->flagDepth++; return true; } - bool WeightedTransducer::next(WeightedConfiguration * configuration, char * outputBuffer, size_t bufferLen) const { + bool WeightedTransducer::next(WeightedConfiguration * configuration, wchar_t * outputBuffer, size_t bufferLen) const { int16_t weight; return next(configuration, outputBuffer, bufferLen, &weight); } - bool WeightedTransducer::next(WeightedConfiguration * configuration, char * outputBuffer, size_t bufferLen, int16_t * weight) const { + bool WeightedTransducer::next(WeightedConfiguration * configuration, wchar_t * outputBuffer, size_t bufferLen, int16_t * weight) const { int firstNotReachedPosition; return next(configuration, outputBuffer, bufferLen, weight, &firstNotReachedPosition); } - bool WeightedTransducer::next(WeightedConfiguration * configuration, char * outputBuffer, size_t bufferLen, int16_t * weight, int * firstNotReachedPosition) const { + bool WeightedTransducer::next(WeightedConfiguration * configuration, wchar_t * outputBuffer, size_t bufferLen, int16_t * weight, int * firstNotReachedPosition) const { uint32_t loopCounter = 0; *firstNotReachedPosition = configuration->inputDepth; while (loopCounter < MAX_LOOP_COUNT) { @@ -289,6 +303,7 @@ WeightedTransition * currentTransition = transitionStart + configuration->currentTransitionStack[configuration->stackDepth]; uint32_t startTransitionIndex = currentTransition - stateHead; uint32_t maxTc = getMaxTc(stateHead); + uint32_t inputSym = (configuration->inputDepth == configuration->inputLength) ? 0 : configuration->inputSymbolStack[configuration->inputDepth]; for (uint32_t tc = startTransitionIndex; tc <= maxTc; tc++) { if (tc == 1 && maxTc >= 255) { // skip overflow cell @@ -299,18 +314,18 @@ if (currentTransition->symIn == 0xFFFFFFFF) { // final state if (configuration->inputDepth == configuration->inputLength) { - char * outputBufferPos = outputBuffer; + wchar_t * outputBufferPos = outputBuffer; for (int i = 0; i < configuration->stackDepth; i++) { - const char * outputSym = symbolToString[configuration->outputSymbolStack[i]]; - size_t symLen = strlen(outputSym); + uint32_t outSymIndex = configuration->outputSymbolStack[i]; + size_t symLen = symbolStringLength[outSymIndex]; if ((outputBufferPos - outputBuffer) + symLen + 1 >= bufferLen) { DEBUG("would overflow the output buffer") return false; } - strncpy(outputBufferPos, outputSym, symLen); + wcsncpy(outputBufferPos, symbolToString[outSymIndex], symLen); outputBufferPos += symLen; } - *outputBufferPos = '\0'; + *outputBufferPos = L'\0'; configuration->currentTransitionStack[configuration->stackDepth] = currentTransition - transitionStart + 1; *weight = currentTransition->weight; for (int i = 0; i < configuration->stackDepth; i++) { @@ -319,10 +334,12 @@ return true; } } - else if (((configuration->inputDepth < configuration->inputLength && - configuration->inputSymbolStack[configuration->inputDepth] == currentTransition->symIn) || - currentTransition->symIn < firstNormalChar) && - flagDiacriticCheck(configuration, this, currentTransition->symIn)) { + else if (inputSym == 0 && currentTransition->symIn >= firstNormalChar) { + // only normal transitions left but we don't have any input + break; + } + else if ((configuration->inputDepth < configuration->inputLength && inputSym == currentTransition->symIn) || + (currentTransition->symIn < firstNormalChar && flagDiacriticCheck(configuration, this, currentTransition->symIn))) { // down DEBUG("down " << tc) if (configuration->stackDepth + 2 == configuration->bufferSize) { @@ -343,6 +360,25 @@ } goto nextInMainLoop; } + else if (currentTransition->symIn > inputSym) { + break; + } + else if (tc >= 1 && currentTransition->symIn >= firstNormalChar && + currentTransition->symIn < inputSym) { + uint32_t min = 0; + uint32_t max = maxTc - tc; + while (min + 1 < max) { + uint32_t middle = (min + max) / 2; + if ((currentTransition + middle)->symIn < inputSym) { + min = middle; + } + else { + max = middle; + } + } + tc += min; + currentTransition += min; + } currentTransition++; } if (configuration->stackDepth == 0) { @@ -357,6 +393,9 @@ if (previousInputSymbol >= firstNormalChar) { configuration->inputDepth--; } + else if (flagDiacriticFeatureCount && previousInputSymbol != 0) { + configuration->flagDepth--; + } } configuration->currentTransitionStack[configuration->stackDepth]++; nextInMainLoop: diff -Nru libvoikko-4.0.1/src/fst/WeightedTransducer.hpp libvoikko-4.0.2/src/fst/WeightedTransducer.hpp --- libvoikko-4.0.1/src/fst/WeightedTransducer.hpp 2015-07-18 12:55:23.000000000 +0000 +++ libvoikko-4.0.2/src/fst/WeightedTransducer.hpp 2016-02-19 13:54:23.000000000 +0000 @@ -39,20 +39,22 @@ class WeightedTransducer : public Transducer { private: WeightedTransition * transitionStart; - std::map stringToSymbol; - std::vector symbolToString; + std::map stringToSymbol; + std::vector symbolToString; + std::vector symbolStringLength; uint16_t firstMultiChar; void byteSwapTransducer(void *& mapPtr, size_t fileLength); public: WeightedTransducer(const char * filePath); + ~WeightedTransducer(); - bool prepare(WeightedConfiguration * configuration, const char * input, size_t inputLen) const; + bool prepare(WeightedConfiguration * configuration, const wchar_t * input, size_t inputLen) const; - bool next(WeightedConfiguration * configuration, char * outputBuffer, size_t bufferLen) const; + bool next(WeightedConfiguration * configuration, wchar_t * outputBuffer, size_t bufferLen) const; - bool next(WeightedConfiguration * configuration, char * outputBuffer, size_t bufferLen, int16_t * weight) const; + bool next(WeightedConfiguration * configuration, wchar_t * outputBuffer, size_t bufferLen, int16_t * weight) const; - bool next(WeightedConfiguration * configuration, char * outputBuffer, size_t bufferLen, int16_t * weight, + bool next(WeightedConfiguration * configuration, wchar_t * outputBuffer, size_t bufferLen, int16_t * weight, int * firstNotReachedPosition) const; void backtrackToOutputDepth(WeightedConfiguration * configuration, int depth); diff -Nru libvoikko-4.0.1/src/grammar/FinnishAnalysis.cpp libvoikko-4.0.2/src/grammar/FinnishAnalysis.cpp --- libvoikko-4.0.1/src/grammar/FinnishAnalysis.cpp 2015-10-20 15:02:53.000000000 +0000 +++ libvoikko-4.0.2/src/grammar/FinnishAnalysis.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -82,18 +82,18 @@ bool verbFollowerTypeSet = false; while (it != analyses->end()) { token->isValidWord = true; - const wchar_t * structure = (*it)->getValue("STRUCTURE"); - const wchar_t * wclass = (*it)->getValue("CLASS"); - const wchar_t * mood = (*it)->getValue("MOOD"); - const wchar_t * person = (*it)->getValue("PERSON"); - const wchar_t * negative = (*it)->getValue("NEGATIVE"); - const wchar_t * possibleGeographicalName = (*it)->getValue("POSSIBLE_GEOGRAPHICAL_NAME"); - const wchar_t * requireFollowingVerb = (*it)->getValue("REQUIRE_FOLLOWING_VERB"); + const wchar_t * structure = (*it)->getValue(morphology::Analysis::Key::STRUCTURE); + const wchar_t * wclass = (*it)->getValue(morphology::Analysis::Key::CLASS); + const wchar_t * mood = (*it)->getValue(morphology::Analysis::Key::MOOD); + const wchar_t * person = (*it)->getValue(morphology::Analysis::Key::PERSON); + const wchar_t * negative = (*it)->getValue(morphology::Analysis::Key::NEGATIVE); + const wchar_t * possibleGeographicalName = (*it)->getValue(morphology::Analysis::Key::POSSIBLE_GEOGRAPHICAL_NAME); + const wchar_t * requireFollowingVerb = (*it)->getValue(morphology::Analysis::Key::REQUIRE_FOLLOWING_VERB); if (wcslen(structure) < 2 || (structure[1] != L'p' && structure[1] != L'q')) { // Word may start with a capital letter anywhere token->firstLetterLcase = false; - const wchar_t * wcase = (*it)->getValue("SIJAMUOTO"); + const wchar_t * wcase = (*it)->getValue(morphology::Analysis::Key::SIJAMUOTO); if (wclass && wcscmp(L"paikannimi", wclass) == 0 && wcase && wcscmp(L"omanto", wcase) == 0) { token->isGeographicalNameInGenitive = true; diff -Nru libvoikko-4.0.1/src/grammar/FinnishRuleEngine/VfstAutocorrectCheck.cpp libvoikko-4.0.2/src/grammar/FinnishRuleEngine/VfstAutocorrectCheck.cpp --- libvoikko-4.0.1/src/grammar/FinnishRuleEngine/VfstAutocorrectCheck.cpp 2015-11-30 16:28:53.000000000 +0000 +++ libvoikko-4.0.2/src/grammar/FinnishRuleEngine/VfstAutocorrectCheck.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -42,8 +42,8 @@ VfstAutocorrectCheck::VfstAutocorrectCheck(const string & fileName) throw(setup::DictionaryException) { transducer = new fst::UnweightedTransducer(fileName.c_str()); configuration = new fst::Configuration(transducer->getFlagDiacriticFeatureCount(), BUFFER_SIZE); - inputBuffer = new char[BUFFER_SIZE + 1]; - outputBuffer = new char[BUFFER_SIZE + 1]; + inputBuffer = new wchar_t[BUFFER_SIZE + 1]; + outputBuffer = new wchar_t[BUFFER_SIZE + 1]; } VfstAutocorrectCheck::~VfstAutocorrectCheck() { @@ -83,7 +83,7 @@ if (sentenceLengthUtf >= BUFFER_SIZE) { return false; // sentence is unreasonably long } - inputBuffer[sentenceLengthUtf] = ' '; + inputBuffer[sentenceLengthUtf] = L' '; ucsNormalizedPositions.push_back(ucsNormalizedPositions[i] + 1); } else { @@ -96,15 +96,21 @@ else { tokenStr = token->str; } - tokenUtfLen = utils::StringUtils::utf8FromUcs4(tokenStr, token->tokenlen, - inputBuffer + sentenceLengthUtf, BUFFER_SIZE - sentenceLengthUtf, - L"\u00AD", &skippedChars); + if (sentenceLengthUtf + token->tokenlen >= BUFFER_SIZE) { + return false; // sentence is unreasonably long + } + for (size_t i = 0; i < token->tokenlen; i++) { + if (wcschr(L"\u00AD", tokenStr[i])) { + skippedChars++; + } + else { + inputBuffer[sentenceLengthUtf + i - skippedChars] = tokenStr[i]; + } + } + tokenUtfLen = token->tokenlen - skippedChars; if (lowerFirst && i == 0) { delete[] tokenStr; } - if (tokenUtfLen == BUFFER_SIZE - sentenceLengthUtf + 1) { - return false; // sentence is unreasonably long - } ucsNormalizedPositions.push_back(ucsNormalizedPositions[i] + token->tokenlen - skippedChars); } sentenceLengthUtf += tokenUtfLen; @@ -150,14 +156,9 @@ } e->error.setErrorLen(prefixLength + lengthCorrection); if (lowerFirst) { - wchar_t * outputUcs = utils::StringUtils::ucs4FromUtf8(outputBuffer); - outputUcs[0] = character::SimpleChar::upper(outputUcs[0]); - e->error.getSuggestions()[0] = utils::StringUtils::utf8FromUcs4(outputUcs); - delete[] outputUcs; - } - else { - e->error.getSuggestions()[0] = utils::StringUtils::copy(outputBuffer); + outputBuffer[0] = character::SimpleChar::upper(outputBuffer[0]); } + e->error.getSuggestions()[0] = utils::StringUtils::utf8FromUcs4(outputBuffer); options->grammarChecker->cache.appendError(e); } else { diff -Nru libvoikko-4.0.1/src/grammar/FinnishRuleEngine/VfstAutocorrectCheck.hpp libvoikko-4.0.2/src/grammar/FinnishRuleEngine/VfstAutocorrectCheck.hpp --- libvoikko-4.0.1/src/grammar/FinnishRuleEngine/VfstAutocorrectCheck.hpp 2015-09-02 13:51:19.000000000 +0000 +++ libvoikko-4.0.2/src/grammar/FinnishRuleEngine/VfstAutocorrectCheck.hpp 2016-02-19 13:54:23.000000000 +0000 @@ -47,8 +47,8 @@ private: fst::UnweightedTransducer * transducer; fst::Configuration * configuration; - char * inputBuffer; - char * outputBuffer; + wchar_t * inputBuffer; + wchar_t * outputBuffer; bool check(voikko_options_t * options, const Sentence * sentence, bool lowerFirst); }; diff -Nru libvoikko-4.0.1/src/hyphenator/AnalyzerToFinnishHyphenatorAdapter.cpp libvoikko-4.0.2/src/hyphenator/AnalyzerToFinnishHyphenatorAdapter.cpp --- libvoikko-4.0.1/src/hyphenator/AnalyzerToFinnishHyphenatorAdapter.cpp 2015-10-20 15:02:53.000000000 +0000 +++ libvoikko-4.0.2/src/hyphenator/AnalyzerToFinnishHyphenatorAdapter.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -287,7 +287,7 @@ void AnalyzerToFinnishHyphenatorAdapter::interpretAnalysis(const Analysis * analysis, char * buffer, size_t len) const { - const wchar_t * structure = analysis->getValue("STRUCTURE"); + const wchar_t * structure = analysis->getValue(Analysis::Key::STRUCTURE); const wchar_t * structurePtr = structure; memset(buffer, ' ', len); if (*structurePtr == L'=') { diff -Nru libvoikko-4.0.1/src/Makefile.am libvoikko-4.0.2/src/Makefile.am --- libvoikko-4.0.1/src/Makefile.am 2016-01-30 14:01:30.000000000 +0000 +++ libvoikko-4.0.2/src/Makefile.am 2016-02-19 13:54:23.000000000 +0000 @@ -80,7 +80,7 @@ grammar/FinnishRuleEngine/SentenceCheck.cpp \ grammar/FinnishRuleEngine/SidesanaCheck.cpp \ compatibility/interface.cpp -libvoikko_la_LDFLAGS = -no-undefined -version-info 15:2:14 @LIBLDFLAGSWIN@ +libvoikko_la_LDFLAGS = -no-undefined -version-info 15:3:14 @LIBLDFLAGSWIN@ pkginclude_HEADERS = voikko.h voikko_enums.h voikko_defines.h voikko_deprecated.h voikko_structs.h noinst_HEADERS = \ diff -Nru libvoikko-4.0.1/src/Makefile.in libvoikko-4.0.2/src/Makefile.in --- libvoikko-4.0.1/src/Makefile.in 2016-01-31 17:30:46.000000000 +0000 +++ libvoikko-4.0.2/src/Makefile.in 2016-02-19 13:54:23.000000000 +0000 @@ -592,7 +592,7 @@ grammar/FinnishRuleEngine/SidesanaCheck.cpp \ compatibility/interface.cpp $(am__append_1) $(am__append_3) \ $(am__append_5) $(am__append_8) $(am__append_11) -libvoikko_la_LDFLAGS = -no-undefined -version-info 15:2:14 \ +libvoikko_la_LDFLAGS = -no-undefined -version-info 15:3:14 \ @LIBLDFLAGSWIN@ $(am__append_6) $(am__append_9) \ $(am__append_12) pkginclude_HEADERS = voikko.h voikko_enums.h voikko_defines.h voikko_deprecated.h voikko_structs.h diff -Nru libvoikko-4.0.1/src/morphology/Analysis.cpp libvoikko-4.0.2/src/morphology/Analysis.cpp --- libvoikko-4.0.1/src/morphology/Analysis.cpp 2014-09-22 15:58:49.000000000 +0000 +++ libvoikko-4.0.2/src/morphology/Analysis.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -33,28 +33,87 @@ namespace libvoikko { namespace morphology { +static constexpr std::array KEY_TO_STRING { + "BASEFORM", + "CLASS", + "COMPARISON", + "FOCUS", + "FSTOUTPUT", + "KYSYMYSLIITE", + "MALAGA_VAPAA_JALKIOSA", + "MOOD", + "NEGATIVE", + "NUMBER", + "PARTICIPLE", + "PERSON", + "POSSESSIVE", + "POSSIBLE_GEOGRAPHICAL_NAME", + "REQUIRE_FOLLOWING_VERB", + "SIJAMUOTO", + "STRUCTURE", + "TENSE", + "WEIGHT", + "WORDBASES", + "WORDIDS" +}; + +static const std::map STRING_TO_KEY = { + {"BASEFORM", Analysis::Key::BASEFORM}, + {"CLASS", Analysis::Key::CLASS}, + {"COMPARISON", Analysis::Key::COMPARISON}, + {"FOCUS", Analysis::Key::FOCUS}, + {"FSTOUTPUT", Analysis::Key::FSTOUTPUT}, + {"KYSYMYSLIITE", Analysis::Key::KYSYMYSLIITE}, + {"MALAGA_VAPAA_JALKIOSA", Analysis::Key::MALAGA_VAPAA_JALKIOSA}, + {"MOOD", Analysis::Key::MOOD}, + {"NEGATIVE", Analysis::Key::NEGATIVE}, + {"NUMBER", Analysis::Key::NUMBER}, + {"PARTICIPLE", Analysis::Key::PARTICIPLE}, + {"PERSON", Analysis::Key::PERSON}, + {"POSSESSIVE", Analysis::Key::POSSESSIVE}, + {"POSSIBLE_GEOGRAPHICAL_NAME", Analysis::Key::POSSIBLE_GEOGRAPHICAL_NAME}, + {"REQUIRE_FOLLOWING_VERB", Analysis::Key::REQUIRE_FOLLOWING_VERB}, + {"SIJAMUOTO", Analysis::Key::SIJAMUOTO}, + {"STRUCTURE", Analysis::Key::STRUCTURE}, + {"TENSE", Analysis::Key::TENSE}, + {"WEIGHT", Analysis::Key::WEIGHT}, + {"WORDBASES", Analysis::Key::WORDBASES}, + {"WORDIDS", Analysis::Key::WORDIDS} +}; + Analysis::Analysis() : keys(0) { } Analysis::~Analysis() { deleteKeys(); - std::map::iterator it = attributes.begin(); + std::map::iterator it = attributes.begin(); while (it != attributes.end()) { - delete[] it++->second; + if (!constAttributes[static_cast(it->first)]) { + delete[] it->second; + } + ++it; } } -void Analysis::addAttribute(const char * key, wchar_t * value) { - attributes.insert(std::make_pair(std::string(key), value)); - recreateKeys(); +void Analysis::addAttribute(Key key, wchar_t * value) { + attributes.insert(std::make_pair(key, value)); +} + +void Analysis::addConstAttribute(Key key, const wchar_t * value) { + attributes.insert(std::make_pair(key, const_cast(value))); + constAttributes.set(static_cast(key)); } -void Analysis::removeAttribute(const char * key) { - std::map::iterator valueI = attributes.find(std::string(key)); +void Analysis::removeAttribute(Key key) { + std::map::iterator valueI = attributes.find(key); if (valueI != attributes.end()) { - delete[] valueI->second; + if (constAttributes[static_cast(valueI->first)]) { + constAttributes[static_cast(valueI->first)] = false; + } + else { + delete[] valueI->second; + } attributes.erase(valueI); - recreateKeys(); } } @@ -62,29 +121,47 @@ return const_cast(keys); } -const wchar_t * Analysis::getValue(const char * key) const { - std::map::const_iterator valueI = - attributes.find(std::string(key)); +std::vector Analysis::getInternalKeys() const { + std::vector keys; + for (auto keyAndValue: attributes) { + keys.push_back(keyAndValue.first); + } + return keys; +} + +const wchar_t * Analysis::getValue(Analysis::Key key) const { + std::map::const_iterator valueI = + attributes.find(key); if (valueI == attributes.end()) { - return 0; + return nullptr; } else { return valueI->second; } } +const wchar_t * Analysis::getValueS(const char * key) const { + std::map::const_iterator keyI = STRING_TO_KEY.find(std::string(key)); + if (keyI == STRING_TO_KEY.end()) { + return nullptr; + } + else { + return this->getValue(keyI->second); + } +} + void Analysis::deleteKeys() { delete[] keys; keys = 0; } -void Analysis::recreateKeys() { +void Analysis::seal() { deleteKeys(); keys = new const char*[attributes.size() + 1]; - std::map::const_iterator it = attributes.begin(); + std::map::const_iterator it = attributes.begin(); size_t i = 0; while (it != attributes.end()) { - keys[i++] = it++->first.c_str(); + keys[i++] = KEY_TO_STRING[(int)it++->first]; } keys[i] = 0; } diff -Nru libvoikko-4.0.1/src/morphology/Analysis.hpp libvoikko-4.0.2/src/morphology/Analysis.hpp --- libvoikko-4.0.1/src/morphology/Analysis.hpp 2015-06-20 14:55:29.000000000 +0000 +++ libvoikko-4.0.2/src/morphology/Analysis.hpp 2016-02-19 13:54:23.000000000 +0000 @@ -31,6 +31,8 @@ #include #include +#include +#include namespace libvoikko { namespace morphology { @@ -41,6 +43,30 @@ */ class Analysis { public: + enum class Key { + BASEFORM, + CLASS, + COMPARISON, + FOCUS, + FSTOUTPUT, + KYSYMYSLIITE, + MALAGA_VAPAA_JALKIOSA, + MOOD, + NEGATIVE, + NUMBER, + PARTICIPLE, + PERSON, + POSSESSIVE, + POSSIBLE_GEOGRAPHICAL_NAME, + REQUIRE_FOLLOWING_VERB, + SIJAMUOTO, + STRUCTURE, + TENSE, + WEIGHT, + WORDBASES, + WORDIDS + }; + Analysis(); ~Analysis(); @@ -48,32 +74,54 @@ * Adds an attribute to analysis. Ownership of value * is transferred to this object. */ - void addAttribute(const char * key, wchar_t * value); + void addAttribute(Key key, wchar_t * value); + + /** + * Adds an attribute to analysis without transferring ownership. + */ + void addConstAttribute(Key key, const wchar_t * value); /** * Deletes an attribute from analysis. */ - void removeAttribute(const char * key); + void removeAttribute(Key key); + + /** + * Seals this analysis for publication through external interface + */ + void seal(); /** * Returns a null terminated array of strings containing * the attribute names in this analysis. */ const char ** getKeys() const; + + /** + * Returns a list of attribute keys in this analysis. + */ + std::vector getInternalKeys() const; /** * Returns the value of given attribute. If no such * attribute exists, returns null. */ - const wchar_t * getValue(const char * key) const; + const wchar_t * getValue(Key key) const; + + /** + * Returns the value of given attribute. If no such + * attribute exists, returns null. + */ + const wchar_t * getValueS(const char * key) const; + private: Analysis(Analysis const & other); Analysis & operator = (const Analysis & other); void deleteKeys(); - void recreateKeys(); const char ** keys; - std::map attributes; + std::map attributes; + std::bitset<21> constAttributes; }; } } diff -Nru libvoikko-4.0.1/src/morphology/FinnishVfstAnalyzer.cpp libvoikko-4.0.2/src/morphology/FinnishVfstAnalyzer.cpp --- libvoikko-4.0.1/src/morphology/FinnishVfstAnalyzer.cpp 2016-01-30 13:11:07.000000000 +0000 +++ libvoikko-4.0.2/src/morphology/FinnishVfstAnalyzer.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -52,7 +52,7 @@ string morFile = directoryName + "/mor.vfst"; transducer = new UnweightedTransducer(morFile.c_str()); configuration = new Configuration(transducer->getFlagDiacriticFeatureCount(), BUFFER_SIZE); - outputBuffer = new char[BUFFER_SIZE]; + outputBuffer = new wchar_t[BUFFER_SIZE]; classMap.insert(std::make_pair(L"n", L"nimisana")); classMap.insert(std::make_pair(L"l", L"laatusana")); @@ -298,23 +298,23 @@ return structure; } -static wchar_t * getAttributeFromMap(map & theMap, const wchar_t * keyStart, size_t keyLen) { - map::const_iterator mapIterator = theMap.find(wstring(keyStart, keyLen)); +static const wchar_t * getAttributeFromMap(map & theMap, const wchar_t * keyStart, size_t keyLen) { + map::const_iterator mapIterator = theMap.find(wstring(keyStart, keyLen)); if (mapIterator == theMap.end()) { - return 0; + return nullptr; } - return StringUtils::copy((*mapIterator).second.c_str()); + return mapIterator->second; } static void parseBasicAttribute(Analysis * analysis, const wchar_t * fstOutput, size_t i, size_t j, - const char * attributeName, map & theMap) { - if (analysis->getValue(attributeName)) { + Analysis::Key key, map & theMap) { + if (analysis->getValue(key)) { return; // already set } size_t sijaLen = i - j - 2; - wchar_t * muoto = getAttributeFromMap(theMap, fstOutput + j + 2, sijaLen); + const wchar_t * muoto = getAttributeFromMap(theMap, fstOutput + j + 2, sijaLen); if (muoto) { - analysis->addAttribute(attributeName, muoto); + analysis->addConstAttribute(key, muoto); } } @@ -431,26 +431,26 @@ } static void addInfoFlag(Analysis * analysis, const wchar_t * outputPosition, const wchar_t * outputBuffer) { - const wchar_t * className = analysis->getValue("CLASS"); + const wchar_t * className = analysis->getValue(Analysis::Key::CLASS); if (wcsncmp(outputPosition, L"vj", 2) == 0) { if (outputBuffer[0] != L'-') { - analysis->addAttribute("MALAGA_VAPAA_JALKIOSA", StringUtils::copy(L"true")); + analysis->addConstAttribute(Analysis::Key::MALAGA_VAPAA_JALKIOSA, L"true"); } } else if (wcsncmp(outputPosition, L"ca", 2) == 0) { if (!wcsstr(outputPosition, L"[Bc]") && !wcsstr(outputPosition, L"[Ll]") && (!className || wcsncmp(className, L"nimisana", 8) == 0)) { - analysis->addAttribute("POSSIBLE_GEOGRAPHICAL_NAME", StringUtils::copy(L"true")); + analysis->addConstAttribute(Analysis::Key::POSSIBLE_GEOGRAPHICAL_NAME, L"true"); } } else { - const wchar_t * mood = analysis->getValue("MOOD"); + const wchar_t * mood = analysis->getValue(Analysis::Key::MOOD); if ((!mood || (wcscmp(mood, L"E-infinitive") != 0 && wcscmp(mood, L"MINEN-infinitive") != 0 && wcscmp(mood, L"MA-infinitive") != 0)) && (!className || wcscmp(className, L"teonsana") == 0)) { if (wcsncmp(outputPosition, L"ra", 2) == 0) { - analysis->addAttribute("REQUIRE_FOLLOWING_VERB", StringUtils::copy(L"A-infinitive")); + analysis->addConstAttribute(Analysis::Key::REQUIRE_FOLLOWING_VERB, L"A-infinitive"); } else if (wcsncmp(outputPosition, L"rm", 2) == 0) { - analysis->addAttribute("REQUIRE_FOLLOWING_VERB", StringUtils::copy(L"MA-infinitive")); + analysis->addConstAttribute(Analysis::Key::REQUIRE_FOLLOWING_VERB, L"MA-infinitive"); } } } @@ -672,7 +672,7 @@ } void FinnishVfstAnalyzer::duplicateOrgName(Analysis * analysis, const wchar_t * fstOutput, std::list * analysisList) { - const wchar_t * oldClass = analysis->getValue("CLASS"); + const wchar_t * oldClass = analysis->getValue(Analysis::Key::CLASS); if (!oldClass || wcscmp(oldClass, L"nimisana") != 0) { return; } @@ -694,14 +694,12 @@ for (size_t j = i - 4; j >= 4; j--) { if (wcsncmp(fstOutput + j, L"[Bc]", 4) == 0) { Analysis * newAnalysis = new Analysis(); - const char ** keys = analysis->getKeys(); wchar_t * newStructure = 0; - for (const char ** keyPtr = keys; *keyPtr; keyPtr++) { - const char * key = *keyPtr; - if (strcmp(key, "CLASS") == 0) { - newAnalysis->addAttribute(key, StringUtils::copy(L"nimi")); + for (Analysis::Key key : analysis->getInternalKeys()) { + if (key == Analysis::Key::CLASS) { + newAnalysis->addConstAttribute(key, L"nimi"); } - else if (strcmp(key, "STRUCTURE") == 0) { + else if (key == Analysis::Key::STRUCTURE) { const wchar_t * oldStructure = analysis->getValue(key); size_t structureLen = wcslen(oldStructure); if (structureLen >= 2) { @@ -710,7 +708,7 @@ newAnalysis->addAttribute(key, newStructure); } } - else if (strcmp(key, "POSSIBLE_GEOGRAPHICAL_NAME") == 0) { + else if (key == Analysis::Key::POSSIBLE_GEOGRAPHICAL_NAME) { // skip } else { @@ -720,7 +718,7 @@ if (newStructure) { wchar_t * baseform = parseBaseform(fstOutput, fstLen, newStructure); if (baseform) { - newAnalysis->addAttribute("BASEFORM", baseform); + newAnalysis->addAttribute(Analysis::Key::BASEFORM, baseform); } } analysisList->push_back(newAnalysis); @@ -884,17 +882,18 @@ delete[] xsBuffer; delete[] xpBuffer; if (anyXs) { - analysis->addAttribute("WORDIDS", wordIds); + analysis->addAttribute(Analysis::Key::WORDIDS, wordIds); } else { delete[] wordIds; } - analysis->addAttribute("WORDBASES", wordBases); + analysis->addAttribute(Analysis::Key::WORDBASES, wordBases); } void FinnishVfstAnalyzer::parseBasicAttributes(Analysis * analysis, const wchar_t * fstOutput, size_t fstLen) { bool convertNimiLaatusanaToLaatusana = false; bool bcPassed = false; + bool classSet = false; for (size_t i = fstLen - 1; i >= 2; i--) { if (fstOutput[i] == L']') { size_t j = i; @@ -902,72 +901,75 @@ j--; if (fstOutput[j] == L'[') { if (fstOutput[j + 1] == L'L') { - if (wcsncmp(fstOutput + (j + 2), L"nl", 2) == 0) { - const wchar_t * comp = analysis->getValue("COMPARISON"); - if (convertNimiLaatusanaToLaatusana || (comp && (wcscmp(comp, L"comparative") == 0 || wcscmp(comp, L"superlative") == 0)) || - wcsncmp(fstOutput, L"[Lu]", 4) == 0) { - analysis->addAttribute("CLASS", StringUtils::copy(L"laatusana")); + if (!classSet || fstOutput[j + 2] == L']') { // TODO check for ']' is for compatibility with voikko-fi 2.0 + if (wcsncmp(fstOutput + (j + 2), L"nl", 2) == 0) { + const wchar_t * comp = analysis->getValue(Analysis::Key::COMPARISON); + if (convertNimiLaatusanaToLaatusana || (comp && (wcscmp(comp, L"comparative") == 0 || wcscmp(comp, L"superlative") == 0)) || + wcsncmp(fstOutput, L"[Lu]", 4) == 0) { + analysis->addConstAttribute(Analysis::Key::CLASS, L"laatusana"); + } + else { + analysis->addConstAttribute(Analysis::Key::CLASS, L"nimisana_laatusana"); + } } else { - analysis->addAttribute("CLASS", StringUtils::copy(L"nimisana_laatusana")); + parseBasicAttribute(analysis, fstOutput, i, j, Analysis::Key::CLASS, classMap); } - } - else { - parseBasicAttribute(analysis, fstOutput, i, j, "CLASS", classMap); + classSet = true; } } else if (fstOutput[j + 1] == L'N') { - const wchar_t * wclass = analysis->getValue("CLASS"); + const wchar_t * wclass = analysis->getValue(Analysis::Key::CLASS); if (!wclass || (wcscmp(wclass, L"etuliite") != 0 && wcscmp(wclass, L"seikkasana") != 0)) { - parseBasicAttribute(analysis, fstOutput, i, j, "NUMBER", numberMap); + parseBasicAttribute(analysis, fstOutput, i, j, Analysis::Key::NUMBER, numberMap); } } else if (fstOutput[j + 1] == L'P') { - parseBasicAttribute(analysis, fstOutput, i, j, "PERSON", personMap); + parseBasicAttribute(analysis, fstOutput, i, j, Analysis::Key::PERSON, personMap); } else if (fstOutput[j + 1] == L'S') { - const wchar_t * wclass = analysis->getValue("CLASS"); + const wchar_t * wclass = analysis->getValue(Analysis::Key::CLASS); if (!wclass || (wcscmp(wclass, L"etuliite") != 0 && wcscmp(wclass, L"seikkasana") != 0)) { - parseBasicAttribute(analysis, fstOutput, i, j, "SIJAMUOTO", sijamuotoMap); + parseBasicAttribute(analysis, fstOutput, i, j, Analysis::Key::SIJAMUOTO, sijamuotoMap); if (j + 5 < fstLen && wcsncmp(fstOutput + (j + 2), L"sti", 3) == 0) { convertNimiLaatusanaToLaatusana = true; } } } else if (fstOutput[j + 1] == L'T') { - if (!analysis->getValue("CLASS")) { - parseBasicAttribute(analysis, fstOutput, i, j, "MOOD", moodMap); + if (!analysis->getValue(Analysis::Key::CLASS)) { + parseBasicAttribute(analysis, fstOutput, i, j, Analysis::Key::MOOD, moodMap); } } else if (fstOutput[j + 1] == L'A') { - parseBasicAttribute(analysis, fstOutput, i, j, "TENSE", tenseMap); + parseBasicAttribute(analysis, fstOutput, i, j, Analysis::Key::TENSE, tenseMap); } else if (fstOutput[j + 1] == L'F') { if (wcsncmp(fstOutput + (j + 2), L"ko", 2) == 0) { - analysis->addAttribute("KYSYMYSLIITE", StringUtils::copy(L"true")); + analysis->addConstAttribute(Analysis::Key::KYSYMYSLIITE, L"true"); } else { - parseBasicAttribute(analysis, fstOutput, i, j, "FOCUS", focusMap); + parseBasicAttribute(analysis, fstOutput, i, j, Analysis::Key::FOCUS, focusMap); } } else if (fstOutput[j + 1] == L'O') { - parseBasicAttribute(analysis, fstOutput, i, j, "POSSESSIVE", possessiveMap); + parseBasicAttribute(analysis, fstOutput, i, j, Analysis::Key::POSSESSIVE, possessiveMap); } else if (fstOutput[j + 1] == L'C') { - if (!analysis->getValue("CLASS")) { - parseBasicAttribute(analysis, fstOutput, i, j, "COMPARISON", comparisonMap); + if (!analysis->getValue(Analysis::Key::CLASS)) { + parseBasicAttribute(analysis, fstOutput, i, j, Analysis::Key::COMPARISON, comparisonMap); } } else if (fstOutput[j + 1] == L'E') { - parseBasicAttribute(analysis, fstOutput, i, j, "NEGATIVE", negativeMap); + parseBasicAttribute(analysis, fstOutput, i, j, Analysis::Key::NEGATIVE, negativeMap); } else if (fstOutput[j + 1] == L'R') { if (!bcPassed) { - const wchar_t * wclass = analysis->getValue("CLASS"); + const wchar_t * wclass = analysis->getValue(Analysis::Key::CLASS); // TODO: Checking the end for [Ln] is done to handle -tUAnne ("kuunneltuanne"). This is for compatibility // with Malaga implementation. See VISK § 543 (temporaalirakenne) for correct analysis. if (!wclass || wcscmp(wclass, L"laatusana") == 0 || wcscmp(fstOutput + (fstLen - 4), L"[Ln]") == 0) { - parseBasicAttribute(analysis, fstOutput, i, j, "PARTICIPLE", participleMap); + parseBasicAttribute(analysis, fstOutput, i, j, Analysis::Key::PARTICIPLE, participleMap); } } } @@ -976,8 +978,10 @@ } else if (fstOutput[j + 1] == L'B') { if (j >= 5 && fstOutput[j + 2] == L'c') { - if (!analysis->getValue("CLASS") && (fstOutput[j - 1] == L'-' || wcsncmp(fstOutput + (j - 5), L"-[Bh]", 5) == 0)) { - analysis->addAttribute("CLASS", StringUtils::copy(L"etuliite")); + if (!classSet && !analysis->getValue(Analysis::Key::CLASS) && + (fstOutput[j - 1] == L'-' || wcsncmp(fstOutput + (j - 5), L"-[Bh]", 5) == 0)) { + analysis->addConstAttribute(Analysis::Key::CLASS, L"etuliite"); + classSet = true; } bcPassed = true; } @@ -1053,68 +1057,58 @@ wchar_t * wordLowerUcs4 = new wchar_t[wlen]; memcpy(wordLowerUcs4, word, wlen * sizeof(wchar_t)); voikko_set_case(CT_ALL_LOWER, wordLowerUcs4, wlen); - char * wordLower = StringUtils::utf8FromUcs4(wordLowerUcs4, wlen); - delete[] wordLowerUcs4; - if (!wordLower) { - return analysisList; - } - if (transducer->prepare(configuration, wordLower, strlen(wordLower))) { + if (transducer->prepare(configuration, wordLowerUcs4, wlen)) { int analysisCount = 0; while (++analysisCount < MAX_ANALYSIS_COUNT && transducer->next(configuration, outputBuffer, BUFFER_SIZE)) { - wchar_t * fstOutput = StringUtils::ucs4FromUtf8(outputBuffer); - size_t fstLen = wcslen(fstOutput); - if (!isValidAnalysis(fstOutput, fstLen)) { - delete[] fstOutput; + size_t fstLen = wcslen(outputBuffer); + if (!isValidAnalysis(outputBuffer, fstLen)) { continue; } Analysis * analysis = new Analysis(); - wchar_t * structure = parseStructure(fstOutput, wlen); - parseBasicAttributes(analysis, fstOutput, fstLen); - fixStructure(structure, fstOutput, fstLen); - analysis->addAttribute("STRUCTURE", structure); - const wchar_t * wclass = analysis->getValue("CLASS"); - const wchar_t * sijamuoto = analysis->getValue("SIJAMUOTO"); - const wchar_t * mood = analysis->getValue("MOOD"); - const wchar_t * participle = analysis->getValue("PARTICIPLE"); - if (analysis->getValue("NEGATIVE") && ((wclass && wcscmp(wclass, L"teonsana") != 0) || + wchar_t * structure = parseStructure(outputBuffer, wlen); + parseBasicAttributes(analysis, outputBuffer, fstLen); + fixStructure(structure, outputBuffer, fstLen); + analysis->addAttribute(Analysis::Key::STRUCTURE, structure); + const wchar_t * wclass = analysis->getValue(Analysis::Key::CLASS); + const wchar_t * sijamuoto = analysis->getValue(Analysis::Key::SIJAMUOTO); + const wchar_t * mood = analysis->getValue(Analysis::Key::MOOD); + const wchar_t * participle = analysis->getValue(Analysis::Key::PARTICIPLE); + if (analysis->getValue(Analysis::Key::NEGATIVE) && ((wclass && wcscmp(wclass, L"teonsana") != 0) || (mood && (wcscmp(mood, L"MINEN-infinitive") == 0 || wcscmp(mood, L"E-infinitive") == 0 || wcscmp(mood, L"MA-infinitive") == 0)) )) { - analysis->removeAttribute("NEGATIVE"); + analysis->removeAttribute(Analysis::Key::NEGATIVE); } if (participle && wcscmp(participle, L"past_passive") == 0 && (!wclass || wcscmp(participle, L"laatusana") != 0)) { wclass = L"laatusana"; - analysis->removeAttribute("CLASS"); - analysis->addAttribute("CLASS", StringUtils::copy(wclass)); + analysis->removeAttribute(Analysis::Key::CLASS); + analysis->addConstAttribute(Analysis::Key::CLASS, wclass); } - if (analysis->getValue("NUMBER") && sijamuoto && wcscmp(sijamuoto, L"kerrontosti") == 0) { - analysis->removeAttribute("NUMBER"); + if (analysis->getValue(Analysis::Key::NUMBER) && sijamuoto && wcscmp(sijamuoto, L"kerrontosti") == 0) { + analysis->removeAttribute(Analysis::Key::NUMBER); } - if (!analysis->getValue("COMPARISON")) { + if (!analysis->getValue(Analysis::Key::COMPARISON)) { if (wclass && (wcscmp(wclass, L"laatusana") == 0 || wcscmp(wclass, L"nimisana_laatusana") == 0)) { - analysis->addAttribute("COMPARISON", StringUtils::copy(L"positive")); + analysis->addConstAttribute(Analysis::Key::COMPARISON, L"positive"); } } else if (wclass && (wcscmp(wclass, L"nimisana") == 0)) { - analysis->removeAttribute("COMPARISON"); + analysis->removeAttribute(Analysis::Key::COMPARISON); } analysisList->push_back(analysis); - duplicateOrgName(analysis, fstOutput, analysisList); + duplicateOrgName(analysis, outputBuffer, analysisList); if (fullMorphology) { - analysis->addAttribute("FSTOUTPUT", fstOutput); - wchar_t * baseform = parseBaseform(fstOutput, fstLen, structure); + analysis->addAttribute(Analysis::Key::FSTOUTPUT, StringUtils::copy(outputBuffer)); + wchar_t * baseform = parseBaseform(outputBuffer, fstLen, structure); if (baseform) { - analysis->addAttribute("BASEFORM", baseform); + analysis->addAttribute(Analysis::Key::BASEFORM, baseform); } - parseDebugAttributes(analysis, fstOutput, fstLen); - } - else { - delete[] fstOutput; + parseDebugAttributes(analysis, outputBuffer, fstLen); } } } - delete[] wordLower; + delete[] wordLowerUcs4; return analysisList; } diff -Nru libvoikko-4.0.1/src/morphology/FinnishVfstAnalyzer.hpp libvoikko-4.0.2/src/morphology/FinnishVfstAnalyzer.hpp --- libvoikko-4.0.1/src/morphology/FinnishVfstAnalyzer.hpp 2015-10-20 15:02:53.000000000 +0000 +++ libvoikko-4.0.2/src/morphology/FinnishVfstAnalyzer.hpp 2016-02-19 13:54:23.000000000 +0000 @@ -50,18 +50,18 @@ private: fst::UnweightedTransducer * transducer; fst::Configuration * configuration; - char * outputBuffer; - std::map classMap; - std::map sijamuotoMap; - std::map moodMap; - std::map numberMap; - std::map comparisonMap; - std::map personMap; - std::map tenseMap; - std::map focusMap; - std::map possessiveMap; - std::map negativeMap; - std::map participleMap; + wchar_t * outputBuffer; + std::map classMap; + std::map sijamuotoMap; + std::map moodMap; + std::map numberMap; + std::map comparisonMap; + std::map personMap; + std::map tenseMap; + std::map focusMap; + std::map possessiveMap; + std::map negativeMap; + std::map participleMap; void parseBasicAttributes(Analysis * analysis, const wchar_t * fstOutput, size_t fstLen); void parseDebugAttributes(Analysis * analysis, const wchar_t * fstOutput, size_t fstLen); diff -Nru libvoikko-4.0.1/src/morphology/HfstAnalyzer.cpp libvoikko-4.0.2/src/morphology/HfstAnalyzer.cpp --- libvoikko-4.0.1/src/morphology/HfstAnalyzer.cpp 2015-10-22 16:26:10.000000000 +0000 +++ libvoikko-4.0.2/src/morphology/HfstAnalyzer.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -82,9 +82,9 @@ Analysis * a = new Analysis(); if (fullMorphology) { string lemma = analysis.substr(0,analysis.find("+")); - a->addAttribute("lemma", StringUtils::ucs4FromUtf8(lemma.c_str())); + a->addAttribute(Analysis::Key::BASEFORM, StringUtils::ucs4FromUtf8(lemma.c_str())); } - a->addAttribute("tags", StringUtils::ucs4FromUtf8(tags.c_str())); + a->addAttribute(Analysis::Key::FSTOUTPUT, StringUtils::ucs4FromUtf8(tags.c_str())); analysisList->push_back(a); q.pop(); } diff -Nru libvoikko-4.0.1/src/morphology/interface.cpp libvoikko-4.0.2/src/morphology/interface.cpp --- libvoikko-4.0.1/src/morphology/interface.cpp 2015-10-22 15:53:25.000000000 +0000 +++ libvoikko-4.0.2/src/morphology/interface.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -48,6 +48,7 @@ list::const_iterator it = analyses->begin(); size_t i = 0; while (it != analyses->end()) { + (*it)->seal(); result[i++] = *it++; } result[i] = 0; @@ -73,7 +74,7 @@ VOIKKOEXPORT const wchar_t * voikko_mor_analysis_value_ucs4( const voikko_mor_analysis * analysis, const char * key) { - return analysis->getValue(key); + return analysis->getValueS(key); } VOIKKOEXPORT voikko_mor_analysis ** voikkoAnalyzeWordCstr( diff -Nru libvoikko-4.0.1/src/morphology/MalagaAnalyzer.cpp libvoikko-4.0.2/src/morphology/MalagaAnalyzer.cpp --- libvoikko-4.0.1/src/morphology/MalagaAnalyzer.cpp 2015-10-22 16:27:19.000000000 +0000 +++ libvoikko-4.0.2/src/morphology/MalagaAnalyzer.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -66,21 +66,21 @@ while (res && currentAnalysisCount < MAX_ANALYSIS_COUNT) { Analysis * analysis = new Analysis(); parseStructure(analysis, res); - parseBasicAttribute(analysis, res, symbols[MS_SIJAMUOTO], "SIJAMUOTO"); - parseBasicAttribute(analysis, res, symbols[MS_CLASS], "CLASS"); - parseBasicAttribute(analysis, res, symbols[MS_NUMBER], "NUMBER"); - parseBasicAttribute(analysis, res, symbols[MS_PERSON], "PERSON"); - parseBasicAttribute(analysis, res, symbols[MS_MOOD], "MOOD"); - parseBasicAttribute(analysis, res, symbols[MS_VAPAA_JALKIOSA], "MALAGA_VAPAA_JALKIOSA"); - parseBasicAttribute(analysis, res, symbols[MS_NEGATIVE], "NEGATIVE"); - parseBasicAttribute(analysis, res, symbols[MS_POSSIBLE_GEOGRAPHICAL_NAME], "POSSIBLE_GEOGRAPHICAL_NAME"); - parseBasicAttribute(analysis, res, symbols[MS_REQUIRE_FOLLOWING_VERB], "REQUIRE_FOLLOWING_VERB"); - parseBasicAttribute(analysis, res, symbols[MS_TENSE], "TENSE"); - parseBasicAttribute(analysis, res, symbols[MS_PARTICIPLE], "PARTICIPLE"); - parseBasicAttribute(analysis, res, symbols[MS_POSSESSIVE], "POSSESSIVE"); - parseBasicAttribute(analysis, res, symbols[MS_KYSYMYSLIITE], "KYSYMYSLIITE"); - parseBasicAttribute(analysis, res, symbols[MS_FOCUS], "FOCUS"); - parseBasicAttribute(analysis, res, symbols[MS_COMPARISON], "COMPARISON"); + parseBasicAttribute(analysis, res, symbols[MS_SIJAMUOTO], Analysis::Key::SIJAMUOTO); + parseBasicAttribute(analysis, res, symbols[MS_CLASS], Analysis::Key::CLASS); + parseBasicAttribute(analysis, res, symbols[MS_NUMBER], Analysis::Key::NUMBER); + parseBasicAttribute(analysis, res, symbols[MS_PERSON], Analysis::Key::PERSON); + parseBasicAttribute(analysis, res, symbols[MS_MOOD], Analysis::Key::MOOD); + parseBasicAttribute(analysis, res, symbols[MS_VAPAA_JALKIOSA], Analysis::Key::MALAGA_VAPAA_JALKIOSA); + parseBasicAttribute(analysis, res, symbols[MS_NEGATIVE], Analysis::Key::NEGATIVE); + parseBasicAttribute(analysis, res, symbols[MS_POSSIBLE_GEOGRAPHICAL_NAME], Analysis::Key::POSSIBLE_GEOGRAPHICAL_NAME); + parseBasicAttribute(analysis, res, symbols[MS_REQUIRE_FOLLOWING_VERB], Analysis::Key::REQUIRE_FOLLOWING_VERB); + parseBasicAttribute(analysis, res, symbols[MS_TENSE], Analysis::Key::TENSE); + parseBasicAttribute(analysis, res, symbols[MS_PARTICIPLE], Analysis::Key::PARTICIPLE); + parseBasicAttribute(analysis, res, symbols[MS_POSSESSIVE], Analysis::Key::POSSESSIVE); + parseBasicAttribute(analysis, res, symbols[MS_KYSYMYSLIITE], Analysis::Key::KYSYMYSLIITE); + parseBasicAttribute(analysis, res, symbols[MS_FOCUS], Analysis::Key::FOCUS); + parseBasicAttribute(analysis, res, symbols[MS_COMPARISON], Analysis::Key::COMPARISON); if (fullMorphology) { parsePerusmuoto(analysis, res); } @@ -268,12 +268,12 @@ value_t structureVal = get_attribute(result, symbols[MS_RAKENNE]); char * value = get_value_string(structureVal); wchar_t * structure = StringUtils::ucs4FromUtf8(value); - analysis->addAttribute("STRUCTURE", structure); + analysis->addAttribute(Analysis::Key::STRUCTURE, structure); free(value); } void MalagaAnalyzer::parseBasicAttribute(Analysis * &analysis, value_t &result, - symbol_t symbol, const char * attrName) const { + symbol_t symbol, Analysis::Key key) const { if (!symbol) { return; } @@ -293,7 +293,7 @@ } const wchar_t * valueName = (*mapIterator).second; if (valueName) { - analysis->addAttribute(attrName, StringUtils::copy(valueName)); + analysis->addConstAttribute(key, valueName); } } @@ -305,19 +305,19 @@ char * value = get_value_string(perusmuotoVal); wchar_t * perusmuoto = StringUtils::ucs4FromUtf8(value); free(value); - const wchar_t * structure = analysis->getValue("STRUCTURE"); + const wchar_t * structure = analysis->getValue(Analysis::Key::STRUCTURE); wchar_t * baseForm = parseBaseform(perusmuoto, structure); wchar_t * wordIds = parseAttributeFromPerusmuoto(perusmuoto, L's'); wchar_t * wordBases = parseAttributeFromPerusmuoto(perusmuoto, L'p'); delete[] perusmuoto; if (baseForm) { - analysis->addAttribute("BASEFORM", baseForm); + analysis->addAttribute(Analysis::Key::BASEFORM, baseForm); } if (wordIds) { - analysis->addAttribute("WORDIDS", wordIds); + analysis->addAttribute(Analysis::Key::WORDIDS, wordIds); } if (wordBases) { - analysis->addAttribute("WORDBASES", wordBases); + analysis->addAttribute(Analysis::Key::WORDBASES, wordBases); } } diff -Nru libvoikko-4.0.1/src/morphology/MalagaAnalyzer.hpp libvoikko-4.0.2/src/morphology/MalagaAnalyzer.hpp --- libvoikko-4.0.1/src/morphology/MalagaAnalyzer.hpp 2015-10-20 15:02:53.000000000 +0000 +++ libvoikko-4.0.2/src/morphology/MalagaAnalyzer.hpp 2016-02-19 13:54:23.000000000 +0000 @@ -67,7 +67,7 @@ void parseStructure(Analysis * &analysis, malaga::value_t &result) const; void parsePerusmuoto(Analysis * &analysis, malaga::value_t &result) const; void parseBasicAttribute(Analysis * &analysis, malaga::value_t &result, - malaga::symbol_t symbol, const char * attrName) const; + malaga::symbol_t symbol, Analysis::Key key) const; wchar_t * parseBaseform(wchar_t * &perusmuoto, const wchar_t * structure) const; wchar_t * parseAttributeFromPerusmuoto(wchar_t * &perusmuoto, wchar_t id) const; void initSymbols(); diff -Nru libvoikko-4.0.1/src/morphology/VfstAnalyzer.cpp libvoikko-4.0.2/src/morphology/VfstAnalyzer.cpp --- libvoikko-4.0.1/src/morphology/VfstAnalyzer.cpp 2015-10-22 15:53:25.000000000 +0000 +++ libvoikko-4.0.2/src/morphology/VfstAnalyzer.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -56,7 +56,7 @@ // XXX: could handle different types of transducers transducer = new WeightedTransducer(morFile.c_str()); configuration = new WeightedConfiguration(transducer->getFlagDiacriticFeatureCount(), BUFFER_SIZE); - outputBuffer = new char[BUFFER_SIZE]; + outputBuffer = new wchar_t[BUFFER_SIZE]; } list * VfstAnalyzer::analyze(const char * word, bool fullMorphology) { @@ -78,28 +78,25 @@ wchar_t * wordLowerUcs4 = new wchar_t[wlen]; memcpy(wordLowerUcs4, word, wlen * sizeof(wchar_t)); voikko_set_case(CT_ALL_LOWER, wordLowerUcs4, wlen); - char * wordLower = StringUtils::utf8FromUcs4(wordLowerUcs4, wlen); - delete[] wordLowerUcs4; list * analysisList = new list(); - if (transducer->prepare(configuration, wordLower, strlen(wordLower))) { + if (transducer->prepare(configuration, wordLowerUcs4, wlen)) { int analysisCount = 0; int16_t weight; while (++analysisCount < MAX_ANALYSIS_COUNT && transducer->next(configuration, outputBuffer, BUFFER_SIZE, &weight)) { Analysis * analysis = new Analysis(); if (fullMorphology) { - wchar_t * fstOutput = StringUtils::ucs4FromUtf8(outputBuffer); - analysis->addAttribute("FSTOUTPUT", fstOutput); + analysis->addAttribute(Analysis::Key::FSTOUTPUT, StringUtils::copy(outputBuffer)); } stringstream ss; ss << setprecision(9) << logWeightToProb(weight); string weightStr = ss.str(); - analysis->addAttribute("WEIGHT", StringUtils::ucs4FromUtf8(weightStr.c_str())); + analysis->addAttribute(Analysis::Key::WEIGHT, StringUtils::ucs4FromUtf8(weightStr.c_str())); analysisList->push_back(analysis); } } - delete[] wordLower; + delete[] wordLowerUcs4; return analysisList; } diff -Nru libvoikko-4.0.1/src/morphology/VfstAnalyzer.hpp libvoikko-4.0.2/src/morphology/VfstAnalyzer.hpp --- libvoikko-4.0.1/src/morphology/VfstAnalyzer.hpp 2015-10-20 15:02:53.000000000 +0000 +++ libvoikko-4.0.2/src/morphology/VfstAnalyzer.hpp 2016-02-19 13:54:23.000000000 +0000 @@ -50,7 +50,7 @@ private: fst::WeightedTransducer * transducer; fst::WeightedConfiguration * configuration; - char * outputBuffer; + wchar_t * outputBuffer; }; } } diff -Nru libvoikko-4.0.1/src/spellchecker/AnalyzerToSpellerAdapter.cpp libvoikko-4.0.2/src/spellchecker/AnalyzerToSpellerAdapter.cpp --- libvoikko-4.0.1/src/spellchecker/AnalyzerToSpellerAdapter.cpp 2015-10-20 15:02:53.000000000 +0000 +++ libvoikko-4.0.2/src/spellchecker/AnalyzerToSpellerAdapter.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -49,7 +49,7 @@ spellresult best_result = SPELL_FAILED; list::const_iterator it = analyses->begin(); while (it != analyses->end()) { - const wchar_t * structure = (*it)->getValue("STRUCTURE"); + const wchar_t * structure = (*it)->getValue(Analysis::Key::STRUCTURE); spellresult result = SpellUtils::matchWordAndAnalysis(word, wlen, structure); if (best_result == SPELL_FAILED || best_result > result) { best_result = result; diff -Nru libvoikko-4.0.1/src/spellchecker/FinnishSpellerTweaksWrapper.cpp libvoikko-4.0.2/src/spellchecker/FinnishSpellerTweaksWrapper.cpp --- libvoikko-4.0.1/src/spellchecker/FinnishSpellerTweaksWrapper.cpp 2015-10-20 15:02:53.000000000 +0000 +++ libvoikko-4.0.2/src/spellchecker/FinnishSpellerTweaksWrapper.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -106,7 +106,7 @@ list::const_iterator it = trailingAnalyses->begin(); bool isTrailingAcceptable = false; while (it != trailingAnalyses->end()) { - const wchar_t * trailingAttr = (*it)->getValue("MALAGA_VAPAA_JALKIOSA"); + const wchar_t * trailingAttr = (*it)->getValue(Analysis::Key::MALAGA_VAPAA_JALKIOSA); if (trailingAttr != 0 && wcscmp(trailingAttr, L"true") == 0) { isTrailingAcceptable = true; break; @@ -136,7 +136,7 @@ list::const_iterator it = analyses->begin(); while (it != analyses->end()) { - const wchar_t * structure = (*it)->getValue("STRUCTURE"); + const wchar_t * structure = (*it)->getValue(Analysis::Key::STRUCTURE); size_t j = 0; size_t i; for (i = 0; i < leading_len; i++) { diff -Nru libvoikko-4.0.1/src/spellchecker/SpellWithPriority.cpp libvoikko-4.0.2/src/spellchecker/SpellWithPriority.cpp --- libvoikko-4.0.1/src/spellchecker/SpellWithPriority.cpp 2015-10-20 15:02:53.000000000 +0000 +++ libvoikko-4.0.2/src/spellchecker/SpellWithPriority.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -36,7 +36,7 @@ namespace libvoikko { namespace spellchecker { static int getPriorityFromNounInflection(const Analysis * analysis) { - const wchar_t * sijamuoto = analysis->getValue("SIJAMUOTO"); + const wchar_t * sijamuoto = analysis->getValue(Analysis::Key::SIJAMUOTO); if (!sijamuoto) { // unknown sijamuoto return 4; @@ -90,7 +90,7 @@ } static int getPriorityFromWordClassAndInflection(const Analysis * analysis) { - const wchar_t * wordClass = analysis->getValue("CLASS"); + const wchar_t * wordClass = analysis->getValue(Analysis::Key::CLASS); if (!wordClass) { // unknown word class return 4; @@ -141,7 +141,7 @@ static spellresult handleAnalysis(const wchar_t * word, size_t len, int &prio, const Analysis * analysis) { prio = getPriorityFromWordClassAndInflection(analysis); - const wchar_t * structure = analysis->getValue("STRUCTURE"); + const wchar_t * structure = analysis->getValue(Analysis::Key::STRUCTURE); prio *= getPriorityFromStructure(structure); spellresult result = SpellUtils::matchWordAndAnalysis(word, len, structure); prio *= getPriorityFromSpellResult(result); diff -Nru libvoikko-4.0.1/src/spellchecker/suggestion/SuggestionGeneratorCaseChange.cpp libvoikko-4.0.2/src/spellchecker/suggestion/SuggestionGeneratorCaseChange.cpp --- libvoikko-4.0.1/src/spellchecker/suggestion/SuggestionGeneratorCaseChange.cpp 2015-10-20 15:02:53.000000000 +0000 +++ libvoikko-4.0.2/src/spellchecker/suggestion/SuggestionGeneratorCaseChange.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -79,7 +79,7 @@ return; } const wchar_t * structure = - (*analyses->begin())->getValue("STRUCTURE"); + (*analyses->begin())->getValue(Analysis::Key::STRUCTURE); newsugg = new wchar_t[wlen + 1]; wcsncpy(newsugg, word, wlen); newsugg[wlen] = L'\0'; diff -Nru libvoikko-4.0.1/src/spellchecker/VfstSpeller.cpp libvoikko-4.0.2/src/spellchecker/VfstSpeller.cpp --- libvoikko-4.0.1/src/spellchecker/VfstSpeller.cpp 2015-07-18 12:55:23.000000000 +0000 +++ libvoikko-4.0.2/src/spellchecker/VfstSpeller.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -44,7 +44,7 @@ string splFile = directoryName + "/spl.vfst"; transducer = new WeightedTransducer(splFile.c_str()); configuration = new WeightedConfiguration(transducer->getFlagDiacriticFeatureCount(), BUFFER_SIZE); - outputBuffer = new char[BUFFER_SIZE]; + outputBuffer = new wchar_t[BUFFER_SIZE]; } spellresult VfstSpeller::doSpell(const wchar_t * word, size_t wlen) { @@ -52,13 +52,11 @@ return SPELL_FAILED; } spellresult result = SPELL_FAILED; - char * wordUtf = StringUtils::utf8FromUcs4(word, wlen); - if (transducer->prepare(configuration, wordUtf, strlen(wordUtf))) { + if (transducer->prepare(configuration, word, wlen)) { if (transducer->next(configuration, outputBuffer, BUFFER_SIZE)) { result = SPELL_OK; } } - delete[] wordUtf; return result; } diff -Nru libvoikko-4.0.1/src/spellchecker/VfstSpeller.hpp libvoikko-4.0.2/src/spellchecker/VfstSpeller.hpp --- libvoikko-4.0.1/src/spellchecker/VfstSpeller.hpp 2015-07-18 12:55:23.000000000 +0000 +++ libvoikko-4.0.2/src/spellchecker/VfstSpeller.hpp 2016-02-19 13:54:23.000000000 +0000 @@ -51,7 +51,7 @@ /** Return SPELL_FAILED or SPELL_OK depending on whether given word is correct as is. */ spellresult doSpell(const wchar_t * word, size_t wlen); fst::WeightedConfiguration * configuration; - char * outputBuffer; + wchar_t * outputBuffer; }; } } diff -Nru libvoikko-4.0.1/src/spellchecker/VfstSuggestion.cpp libvoikko-4.0.2/src/spellchecker/VfstSuggestion.cpp --- libvoikko-4.0.1/src/spellchecker/VfstSuggestion.cpp 2015-07-18 12:55:23.000000000 +0000 +++ libvoikko-4.0.2/src/spellchecker/VfstSuggestion.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -55,23 +55,22 @@ errorModel = new fst::WeightedTransducer(errFile.c_str()); acceptorConf = new fst::WeightedConfiguration(acceptor->getFlagDiacriticFeatureCount(), BUFFER_SIZE); errorModelConf = new fst::WeightedConfiguration(errorModel->getFlagDiacriticFeatureCount(), BUFFER_SIZE); - acceptorBuffer = new char[BUFFER_SIZE]; - errorModelBuffer = new char[BUFFER_SIZE]; + acceptorBuffer = new wchar_t[BUFFER_SIZE]; + errorModelBuffer = new wchar_t[BUFFER_SIZE]; } void VfstSuggestion::generate(SuggestionStatus * s) const { s->setMaxCost(100); // not actually used size_t wlen = s->getWordLength(); - char * wordUtf = StringUtils::utf8FromUcs4(s->getWord(), wlen); int16_t acceptorWeight; int16_t errorModelWeight; - map suggestionWeights; - if (errorModel->prepare(errorModelConf, wordUtf, wlen)) { + map suggestionWeights; + if (errorModel->prepare(errorModelConf, s->getWord(), wlen)) { while (!s->shouldAbort() && errorModel->next(errorModelConf, errorModelBuffer, BUFFER_SIZE, &errorModelWeight)) { - if (acceptor->prepare(acceptorConf, errorModelBuffer, strlen(errorModelBuffer))) { + if (acceptor->prepare(acceptorConf, errorModelBuffer, wcslen(errorModelBuffer))) { int firstNotReachedPosition; if (acceptor->next(acceptorConf, acceptorBuffer, BUFFER_SIZE, &acceptorWeight, &firstNotReachedPosition)) { - string suggStr(errorModelBuffer); + wstring suggStr(errorModelBuffer); int weight = acceptorWeight + errorModelWeight; if (suggestionWeights.find(suggStr) != suggestionWeights.end()) { suggestionWeights[suggStr] = min(suggestionWeights[suggStr], weight); @@ -86,12 +85,11 @@ } } } - delete[] wordUtf; priority_queue queue; - for (map::const_iterator it = suggestionWeights.begin(); it != suggestionWeights.end(); ++it) { + for (map::const_iterator it = suggestionWeights.begin(); it != suggestionWeights.end(); ++it) { WeightedSuggestion sugg; - sugg.suggestion = StringUtils::ucs4FromUtf8(it->first.c_str()); + sugg.suggestion = StringUtils::copy(it->first.c_str()); sugg.weight = it->second; queue.push(sugg); } diff -Nru libvoikko-4.0.1/src/spellchecker/VfstSuggestion.hpp libvoikko-4.0.2/src/spellchecker/VfstSuggestion.hpp --- libvoikko-4.0.1/src/spellchecker/VfstSuggestion.hpp 2015-07-18 12:55:23.000000000 +0000 +++ libvoikko-4.0.2/src/spellchecker/VfstSuggestion.hpp 2016-02-19 13:54:23.000000000 +0000 @@ -52,8 +52,8 @@ fst::WeightedTransducer * errorModel; fst::WeightedConfiguration * acceptorConf; fst::WeightedConfiguration * errorModelConf; - char * acceptorBuffer; - char * errorModelBuffer; + wchar_t * acceptorBuffer; + wchar_t * errorModelBuffer; }; } } } diff -Nru libvoikko-4.0.1/src/tools/voikkovfstc.cpp libvoikko-4.0.2/src/tools/voikkovfstc.cpp --- libvoikko-4.0.1/src/tools/voikkovfstc.cpp 2015-12-01 17:31:41.000000000 +0000 +++ libvoikko-4.0.2/src/tools/voikkovfstc.cpp 2016-02-19 13:54:23.000000000 +0000 @@ -219,9 +219,28 @@ if (a.text[0] == '[' || a.text[0] == '@') { return a.text.substr(1) < b.text.substr(1); } - else { - return a.text < b.text; + return a.text < b.text; + } +}; + +struct compareTransitionsForFileOrder { + bool operator()(WeightedTransition const & a, WeightedTransition const & b) const { + if (a.symIn == b.symIn) { + return false; + } + if (a.symIn == 0) { + return true; + } + if (b.symIn == 0) { + return false; } + if (a.symIn == 0xFFFFFFFF) { + return true; + } + if (b.symIn == 0xFFFFFFFF) { + return false; + } + return a.symIn < b.symIn; } }; @@ -457,9 +476,13 @@ // Write state transitions for (vector::iterator it = attStateVector.begin(); it < attStateVector.end(); it++) { uint32_t tCount = it->transitions.size(); + for (uint32_t ti = 0; ti < tCount; ti++) { + WeightedTransition & t = it->transitions[ti]; + setTarget(t, stateOrdinalToOffset, it->targetStateOrds[ti]); + } + sort(it->transitions.begin(), it->transitions.end(), compareTransitionsForFileOrder()); { WeightedTransition & t = it->transitions[0]; - setTarget(t, stateOrdinalToOffset, it->targetStateOrds[0]); t.moreTransitions = (tCount > 255 ? 255 : tCount - 1); writeTrans(transducerFile, byteSwap, t, weights); } @@ -472,7 +495,6 @@ } for (uint32_t ti = 1; ti < tCount; ti++) { WeightedTransition & t = it->transitions[ti]; - setTarget(t, stateOrdinalToOffset, it->targetStateOrds[ti]); t.moreTransitions = 0; writeTrans(transducerFile, byteSwap, t, weights); } diff -Nru libvoikko-4.0.1/test/libvoikkoTest.py libvoikko-4.0.2/test/libvoikkoTest.py --- libvoikko-4.0.1/test/libvoikkoTest.py 2015-09-15 13:34:48.000000000 +0000 +++ libvoikko-4.0.2/test/libvoikkoTest.py 2016-02-19 13:54:23.000000000 +0000 @@ -356,6 +356,14 @@ self.failUnless(len(longWord) > MAX_WORD_CHARS) self.assertEqual(0, len(self.voikko.analyze(longWord))) + def testTokenizationWorksForHugeParagraphs(self): + hugeParagraph = "Kissa on 29 vuotta vanha... Onhan se silloin vanha. " * 10000 + self.assertEqual(10000 * 20, len(self.voikko.tokens(hugeParagraph))) + + def testTokenizationWorksWithSomeMultibyteCharacters(self): + text = u"Kissä on 29 vuotta vanha... Onhan se silloin vanha. \nKissä on 29 vuotta vanha... Onhan se silloin vanha. \nKissä on 29 vuotta vanha... Onhan se silloin vanha. \nKissä on 29 vuotta vanha... Onhan se silloin vanha. \nKissä on 29 vuotta vanha... Onhan se silloin vanha. \nKissä on 29 vuotta vanha... Onhan se silloin vanha. \nKissä on 29 vuotta vanha... Onhan se silloin vanha. \nKissä on 29 vuotta vanha... Onhan se silloin vanha. \nKissä on 29 vuotta vanha... Onhan se silloin vanha. \n" + self.assertEqual(180, len(self.voikko.tokens(text))) + def testEmbeddedNullsAreNotAccepted(self): self.failIf(self.voikko.spell(u"kissa\0asdasd")) self.assertEqual(0, len(self.voikko.suggest(u"kisssa\0koira")))