diff -Nru link-grammar-5.3.16/aclocal.m4 link-grammar-5.5.0/aclocal.m4 --- link-grammar-5.3.16/aclocal.m4 2017-04-15 21:49:47.000000000 +0000 +++ link-grammar-5.5.0/aclocal.m4 2018-04-29 21:40:01.000000000 +0000 @@ -164,6 +164,139 @@ ]) # =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_jni_include_dir.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_JNI_INCLUDE_DIR +# +# DESCRIPTION +# +# AX_JNI_INCLUDE_DIR finds include directories needed for compiling +# programs using the JNI interface. +# +# JNI include directories are usually in the Java distribution. This is +# deduced from the value of $JAVA_HOME, $JAVAC, or the path to "javac", in +# that order. When this macro completes, a list of directories is left in +# the variable JNI_INCLUDE_DIRS. +# +# Example usage follows: +# +# AX_JNI_INCLUDE_DIR +# +# for JNI_INCLUDE_DIR in $JNI_INCLUDE_DIRS +# do +# CPPFLAGS="$CPPFLAGS -I$JNI_INCLUDE_DIR" +# done +# +# If you want to force a specific compiler: +# +# - at the configure.in level, set JAVAC=yourcompiler before calling +# AX_JNI_INCLUDE_DIR +# +# - at the configure level, setenv JAVAC +# +# Note: This macro can work with the autoconf M4 macros for Java programs. +# This particular macro is not part of the original set of macros. +# +# LICENSE +# +# Copyright (c) 2008 Don Anderson +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. 
+ +#serial 11 + +AU_ALIAS([AC_JNI_INCLUDE_DIR], [AX_JNI_INCLUDE_DIR]) +AC_DEFUN([AX_JNI_INCLUDE_DIR],[ + +JNI_INCLUDE_DIRS="" + +if test "x$JAVA_HOME" != x; then + _JTOPDIR="$JAVA_HOME" +else + if test "x$JAVAC" = x; then + JAVAC=javac + fi + AC_PATH_PROG([_ACJNI_JAVAC], [$JAVAC], [no]) + if test "x$_ACJNI_JAVAC" = xno; then + AC_MSG_ERROR([cannot find JDK; try setting \$JAVAC or \$JAVA_HOME]) + fi + _ACJNI_FOLLOW_SYMLINKS("$_ACJNI_JAVAC") + _JTOPDIR=`echo "$_ACJNI_FOLLOWED" | sed -e 's://*:/:g' -e 's:/[[^/]]*$::'` +fi + +case "$host_os" in + darwin*) # Apple JDK is at /System location and has headers symlinked elsewhere + case "$_JTOPDIR" in + /System/Library/Frameworks/JavaVM.framework/*) + _JTOPDIR=`echo "$_JTOPDIR" | sed -e 's:/[[^/]]*$::'` + _JINC="$_JTOPDIR/Headers";; + *) _JINC="$_JTOPDIR/include";; + esac;; + *) _JINC="$_JTOPDIR/include";; +esac +_AS_ECHO_LOG([_JTOPDIR=$_JTOPDIR]) +_AS_ECHO_LOG([_JINC=$_JINC]) + +# On Mac OS X 10.6.4, jni.h is a symlink: +# /System/Library/Frameworks/JavaVM.framework/Versions/Current/Headers/jni.h +# -> ../../CurrentJDK/Headers/jni.h. 
+AC_CHECK_FILE([$_JINC/jni.h], + [JNI_INCLUDE_DIRS="$JNI_INCLUDE_DIRS $_JINC"], + [_JTOPDIR=`echo "$_JTOPDIR" | sed -e 's:/[[^/]]*$::'` + AC_CHECK_FILE([$_JTOPDIR/include/jni.h], + [JNI_INCLUDE_DIRS="$JNI_INCLUDE_DIRS $_JTOPDIR/include"], + AC_MSG_ERROR([cannot find JDK header files])) + ]) + +# get the likely subdirectories for system specific java includes +case "$host_os" in +bsdi*) _JNI_INC_SUBDIRS="bsdos";; +freebsd*) _JNI_INC_SUBDIRS="freebsd";; +darwin*) _JNI_INC_SUBDIRS="darwin";; +linux*) _JNI_INC_SUBDIRS="linux genunix";; +osf*) _JNI_INC_SUBDIRS="alpha";; +solaris*) _JNI_INC_SUBDIRS="solaris";; +mingw*) _JNI_INC_SUBDIRS="win32";; +cygwin*) _JNI_INC_SUBDIRS="win32";; +*) _JNI_INC_SUBDIRS="genunix";; +esac + +# add any subdirectories that are present +for JINCSUBDIR in $_JNI_INC_SUBDIRS +do + if test -d "$_JTOPDIR/include/$JINCSUBDIR"; then + JNI_INCLUDE_DIRS="$JNI_INCLUDE_DIRS $_JTOPDIR/include/$JINCSUBDIR" + fi +done +]) + +# _ACJNI_FOLLOW_SYMLINKS +# Follows symbolic links on , +# finally setting variable _ACJNI_FOLLOWED +# ---------------------------------------- +AC_DEFUN([_ACJNI_FOLLOW_SYMLINKS],[ +# find the include directory relative to the javac executable +_cur="$1" +while ls -ld "$_cur" 2>/dev/null | grep " -> " >/dev/null; do + AC_MSG_CHECKING([symlink for $_cur]) + _slink=`ls -ld "$_cur" | sed 's/.* -> //'` + case "$_slink" in + /*) _cur="$_slink";; + # 'X' avoids triggering unwanted echo options. + *) _cur=`echo "X$_cur" | sed -e 's/^X//' -e 's:[[^/]]*$::'`"$_slink";; + esac + AC_MSG_RESULT([$_cur]) +done +_ACJNI_FOLLOWED="$_cur" +])# _ACJNI + +# =========================================================================== # http://www.gnu.org/software/autoconf-archive/ax_path_lib_pcre.html # =========================================================================== # @@ -1211,6 +1344,43 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd` ]) +# AM_COND_IF -*- Autoconf -*- + +# Copyright (C) 2008-2014 Free Software Foundation, Inc. 
+# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# _AM_COND_IF +# _AM_COND_ELSE +# _AM_COND_ENDIF +# -------------- +# These macros are only used for tracing. +m4_define([_AM_COND_IF]) +m4_define([_AM_COND_ELSE]) +m4_define([_AM_COND_ENDIF]) + +# AM_COND_IF(COND, [IF-TRUE], [IF-FALSE]) +# --------------------------------------- +# If the shell condition COND is true, execute IF-TRUE, otherwise execute +# IF-FALSE. Allow automake to learn about conditional instantiating macros +# (the AC_CONFIG_FOOS). +AC_DEFUN([AM_COND_IF], +[m4_ifndef([_AM_COND_VALUE_$1], + [m4_fatal([$0: no such condition "$1"])])dnl +_AM_COND_IF([$1])dnl +if test -z "$$1_TRUE"; then : + m4_n([$2])[]dnl +m4_ifval([$3], +[_AM_COND_ELSE([$1])dnl +else + $3 +])dnl +_AM_COND_ENDIF([$1])dnl +fi[]dnl +]) + # AM_CONDITIONAL -*- Autoconf -*- # Copyright (C) 1997-2014 Free Software Foundation, Inc. @@ -1748,6 +1918,24 @@ rmdir .tst 2>/dev/null AC_SUBST([am__leading_dot])]) +# Copyright (C) 1998-2014 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_PROG_LEX +# ----------- +# Autoconf leaves LEX=: if lex or flex can't be found. Change that to a +# "missing" invocation, for better error output. +AC_DEFUN([AM_PROG_LEX], +[AC_PREREQ([2.50])dnl +AC_REQUIRE([AM_MISSING_HAS_RUN])dnl +AC_REQUIRE([AC_PROG_LEX])dnl +if test "$LEX" = :; then + LEX=${am_missing_run}flex +fi]) + # Add --enable-maintainer-mode option to configure. 
-*- Autoconf -*- # From Jim Meyering diff -Nru link-grammar-5.3.16/autogen.sh link-grammar-5.5.0/autogen.sh --- link-grammar-5.3.16/autogen.sh 2016-10-04 20:31:59.000000000 +0000 +++ link-grammar-5.5.0/autogen.sh 2017-12-02 07:32:23.000000000 +0000 @@ -23,17 +23,22 @@ exit 1 fi -libtoolize --force --copy || { - echo "error: libtoolize failed" - exit 1 -} - # Produce aclocal.m4, so autoconf gets the automake macros it needs # case `uname` in CYGWIN*) include_dir='-I m4' # Needed for Cygwin only. + ;; + Darwin) + [ "$LIBTOOLIZE" = "" ] && LIBTOOLIZE=glibtoolize + ;; esac + + ${LIBTOOLIZE:=libtoolize} --force --copy || { + echo "error: libtoolize failed" + exit 1 +} + echo "Creating aclocal.m4: aclocal $include_dir $ACLOCAL_FLAGS" aclocal $include_dir $ACLOCAL_FLAGS 2>> autogen.err diff -Nru link-grammar-5.3.16/bindings/java/build.xml.in link-grammar-5.5.0/bindings/java/build.xml.in --- link-grammar-5.3.16/bindings/java/build.xml.in 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/bindings/java/build.xml.in 2018-03-11 20:04:15.000000000 +0000 @@ -7,7 +7,7 @@ - + @@ -19,7 +19,7 @@ - + diff -Nru link-grammar-5.3.16/bindings/java/link-grammar-server.sh link-grammar-5.5.0/bindings/java/link-grammar-server.sh --- link-grammar-5.3.16/bindings/java/link-grammar-server.sh 2017-01-27 22:01:05.000000000 +0000 +++ link-grammar-5.5.0/bindings/java/link-grammar-server.sh 2017-06-17 01:29:32.000000000 +0000 @@ -3,9 +3,9 @@ # Shell script to start the link-grammar network server. # # Usage: java org.linkgrammar.LGService [-verbose] [-threads n] port [language] [dictPath] -# Start a link-grammar parse server on tcp/ip port. The server returns -# JSON-formated parse results. Socket input should be a single sentence -# to parse, preceeded by the identifier "text:". +# Start a link-grammar parse server on TCP/IP port. The server returns +# JSON-formatted parse results. Socket input should be a single sentence +# to parse, proceeded by the identifier "text:". 
# # 'port' The TCP port the service should listen to. # -verbose Generate verbose output. @@ -13,9 +13,9 @@ # 'language' Language abbreviation (en, ru, de, lt, fr, he, tr, any). # 'dictPath' Full path to the Link-Grammar dictionaries. # -# The below starts the server on port 9000. It the port is ommitted, +# The below starts the server on port 9000. It the port is omitted, # help text is printed. This server can be contacted directly via -# tcpip; for example: +# TCP/IP; for example: # # telnet localhost 9000 # @@ -34,7 +34,7 @@ # # echo "text:this is a test" | nc localhost 9000 # -# returns the parse; and +# returns the parse; and # # echo "storeDiagramString:true, text:this is a test" | nc localhost 9000 # diff -Nru link-grammar-5.3.16/bindings/java/Makefile.am link-grammar-5.5.0/bindings/java/Makefile.am --- link-grammar-5.3.16/bindings/java/Makefile.am 2017-02-12 11:36:40.000000000 +0000 +++ link-grammar-5.5.0/bindings/java/Makefile.am 2018-03-04 14:56:46.000000000 +0000 @@ -17,6 +17,7 @@ linkgrammar-@VERSION@.jar: $(JAVA_SRCS) ant +if HAVE_ANT # distclean-generic will delete build.xml, so clean-local must run # first.... 
but then, build.xml doesn't get removed :-( distclean-generic: clean-local @@ -27,6 +28,7 @@ clean-local: ant clean -rm -f *.jar +endif javadir=$(datadir)/java java_DATA = linkgrammar-@VERSION@.jar @@ -38,10 +40,10 @@ endif install-data-hook: - ${LN_SF} ${javadir}/${java_DATA} ${DESTDIR}/${javadir}/linkgrammar.jar + ${LN_SF} ${javadir}/${java_DATA} ${DESTDIR}${javadir}/linkgrammar.jar uninstall-hook: - -rm ${DESTDIR}/${javadir}/linkgrammar.jar + -rm ${DESTDIR}${javadir}/linkgrammar.jar EXTRA_DIST = \ diff -Nru link-grammar-5.3.16/bindings/java/Makefile.in link-grammar-5.5.0/bindings/java/Makefile.in --- link-grammar-5.3.16/bindings/java/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/bindings/java/Makefile.in 2018-04-29 21:40:02.000000000 +0000 @@ -161,6 +161,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -189,6 +190,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -197,13 +199,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -270,12 +275,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ 
-484,9 +489,15 @@ clean-generic: +@HAVE_ANT_FALSE@distclean-generic: +@HAVE_ANT_FALSE@ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) +@HAVE_ANT_FALSE@ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." +@HAVE_ANT_FALSE@clean-local: +@HAVE_ANT_FALSE@distclean-local: clean: clean-am clean-am: clean-generic clean-libtool clean-local mostlyclean-am @@ -580,20 +591,20 @@ # distclean-generic will delete build.xml, so clean-local must run # first.... but then, build.xml doesn't get removed :-( -distclean-generic: clean-local -distclean-local: clean-local - -rm -f build.xml +@HAVE_ANT_TRUE@distclean-generic: clean-local +@HAVE_ANT_TRUE@distclean-local: clean-local +@HAVE_ANT_TRUE@ -rm -f build.xml # Let ant delete the files that ant creates. -clean-local: - ant clean - -rm -f *.jar +@HAVE_ANT_TRUE@clean-local: +@HAVE_ANT_TRUE@ ant clean +@HAVE_ANT_TRUE@ -rm -f *.jar install-data-hook: - ${LN_SF} ${javadir}/${java_DATA} ${DESTDIR}/${javadir}/linkgrammar.jar + ${LN_SF} ${javadir}/${java_DATA} ${DESTDIR}${javadir}/linkgrammar.jar uninstall-hook: - -rm ${DESTDIR}/${javadir}/linkgrammar.jar + -rm ${DESTDIR}${javadir}/linkgrammar.jar # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. 
diff -Nru link-grammar-5.3.16/bindings/java/org/linkgrammar/LGService.java link-grammar-5.5.0/bindings/java/org/linkgrammar/LGService.java --- link-grammar-5.3.16/bindings/java/org/linkgrammar/LGService.java 2017-01-31 04:32:48.000000000 +0000 +++ link-grammar-5.5.0/bindings/java/org/linkgrammar/LGService.java 2017-06-17 01:29:32.000000000 +0000 @@ -417,7 +417,7 @@ System.out.println("Usage: java org.linkgrammar.LGService [-verbose] [-threads n] port [language] [dictPath]"); System.out.println("Start a link-grammar parse server on tcp/ip port. The server returns"); System.out.println("JSON-formated parse results. Socket input should be a single sentence"); - System.out.println("to parse, preceeded by the identifier \"text:\".\n"); + System.out.println("to parse, proceeded by the identifier \"text:\".\n"); System.out.println(" 'port' The TCP port the service should listen to."); System.out.println(" -verbose Generate verbose output."); System.out.println(" -threads Number of concurrent threads/clients allowed (default 1)."); diff -Nru link-grammar-5.3.16/bindings/java/README link-grammar-5.5.0/bindings/java/README --- link-grammar-5.3.16/bindings/java/README 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/bindings/java/README 2017-08-13 22:11:23.000000000 +0000 @@ -4,9 +4,15 @@ These files implement Java language bindings for Link Grammar. They do not extend the parsing algorithms already present in Link-Grammar in -any way. They do, however, allow certain types of remote/server -operation, and, in particular, are needed for use by the RelEx -dependency parser. +any way. The allow users to call the standard link-grammar API from +Java. 
+ +These bindings do provide an additional feature, though: they can be used +in a client-server mode, so that, instead of calling the link-grammar +library directly, the parse request can be sent to a link-grammar +server, which then returns the result in JSON format, which the bindings +here are able to parse, and present the same API, as if the library +was called locally. The shell script link-grammar-server.sh demonstrates how to run the parser server. Once the server is started, parses can be obtained diff -Nru link-grammar-5.3.16/bindings/java-jni/jni-client.c link-grammar-5.5.0/bindings/java-jni/jni-client.c --- link-grammar-5.3.16/bindings/java-jni/jni-client.c 2017-04-15 21:43:39.000000000 +0000 +++ link-grammar-5.5.0/bindings/java-jni/jni-client.c 2018-04-22 14:57:33.000000000 +0000 @@ -15,7 +15,7 @@ #include #endif /* HAVE_STDATOMIC_H */ -#include +#include "link-grammar/api-structures.h" #include "link-grammar/corpus/corpus.h" #include "link-grammar/error.h" #include "jni-client.h" @@ -28,7 +28,7 @@ /* Default to the English language. */ static const char* in_language = "en"; -/* Dicationary can be and should be shared by all. */ +/* Dictionary can be and should be shared by all. 
*/ static Dictionary dict = NULL; #ifdef HAVE_STDATOMIC_H static atomic_flag dict_is_init = ATOMIC_FLAG_INIT; @@ -49,7 +49,7 @@ parse_options_set_repeatable_rand(opts, false); parse_options_set_disjunct_cost(opts, 3.7f); parse_options_set_min_null_count(opts, 1); - parse_options_set_max_null_count(opts, MAX_SENTENCE); + parse_options_set_max_null_count(opts, 250); parse_options_set_max_parse_time(opts, 60); parse_options_set_islands_ok(opts, true); parse_options_set_short_length(opts, 6); @@ -60,9 +60,9 @@ parse_options_set_display_morphology(opts, true); } -static inline void test(void) +static inline void do_test(void) { -#ifdef DEBUG +#if 0 printf("%d\n", word_contains("said", PAST_TENSE_FORM_MARKER, dict)); printf("%d\n", word_contains("gave.v", PAST_TENSE_FORM_MARKER, dict)); printf("%d\n", word_contains("have", PAST_TENSE_FORM_MARKER, dict)); @@ -83,6 +83,9 @@ exceptionClazz = (*env)->FindClass(env, "java/lang/RuntimeException"); if ((*env)->ThrowNew(env, exceptionClazz, msg) != 0) (*env)->FatalError(env, "Fatal: link-grammar JNI: Cannot throw"); + + // I'm confused. Apparently Java exceptions return ?? 
+ free(msg); } // Note that we do NOT offer any kind of protection from having @@ -124,7 +127,7 @@ dict = dictionary_create_lang(in_language); if (!dict) throwException(env, "Error: unable to open dictionary"); - else test(); + else do_test(); dict_version = linkgrammar_get_dict_version(dict); prt_error("Info: JNI: dictionary language '%s' version %s\n", diff -Nru link-grammar-5.3.16/bindings/java-jni/Makefile.am link-grammar-5.5.0/bindings/java-jni/Makefile.am --- link-grammar-5.3.16/bindings/java-jni/Makefile.am 2017-01-30 02:44:10.000000000 +0000 +++ link-grammar-5.5.0/bindings/java-jni/Makefile.am 2018-03-04 14:56:46.000000000 +0000 @@ -7,14 +7,14 @@ # $(top_builddir) to pick up autogened link-grammar/link-features.h AM_CPPFLAGS = -I$(top_srcdir) -I$(top_builddir) $(JAVA_CPPFLAGS) \ - $(WARN_CFLAGS) + -D_JNI_IMPLEMENTATION_ $(WARN_CFLAGS) lib_LTLIBRARIES = liblink-grammar-java.la liblink_grammar_java_la_LDFLAGS = -version-info @VERSION_INFO@ -export-dynamic -no-undefined -export-symbols $(srcdir)/link-grammar-java.def if OS_WIN32 -liblink_grammar_java_la_LDFLAGS += -D_JNI_IMPLEMENTATION_ -Wl,--kill-at +liblink_grammar_java_la_LDFLAGS += -Wl,--kill-at endif liblink_grammar_java_la_LIBADD = $(top_builddir)/link-grammar/liblink-grammar.la ${REGEX_LIBS} diff -Nru link-grammar-5.3.16/bindings/java-jni/Makefile.in link-grammar-5.5.0/bindings/java-jni/Makefile.in --- link-grammar-5.3.16/bindings/java-jni/Makefile.in 2017-04-15 21:49:48.000000000 +0000 +++ link-grammar-5.5.0/bindings/java-jni/Makefile.in 2018-04-29 21:40:02.000000000 +0000 @@ -92,7 +92,7 @@ POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ -@OS_WIN32_TRUE@am__append_1 = -D_JNI_IMPLEMENTATION_ -Wl,--kill-at +@OS_WIN32_TRUE@am__append_1 = -Wl,--kill-at @HAVE_HUNSPELL_TRUE@am__append_2 = ${HUNSPELL_LIBS} @HAVE_ASPELL_TRUE@am__append_3 = ${ASPELL_LIBS} @LIBMINISAT_BUNDLED_TRUE@@WITH_SAT_SOLVER_TRUE@am__append_4 = $(top_builddir)/link-grammar/minisat/libminisat.la @@ -229,6 +229,7 @@ 
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -257,6 +258,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -265,13 +267,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -338,12 +343,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -407,7 +412,7 @@ # $(top_builddir) to pick up autogened link-grammar/link-features.h AM_CPPFLAGS = -I$(top_srcdir) -I$(top_builddir) $(JAVA_CPPFLAGS) \ - $(WARN_CFLAGS) + -D_JNI_IMPLEMENTATION_ $(WARN_CFLAGS) lib_LTLIBRARIES = liblink-grammar-java.la liblink_grammar_java_la_LDFLAGS = -version-info @VERSION_INFO@ \ diff -Nru link-grammar-5.3.16/bindings/lisp/link-interface.cl link-grammar-5.5.0/bindings/lisp/link-interface.cl --- link-grammar-5.3.16/bindings/lisp/link-interface.cl 2015-03-19 00:14:14.000000000 +0000 +++ link-grammar-5.5.0/bindings/lisp/link-interface.cl 2017-07-15 21:16:13.000000000 +0000 @@ -569,7 +569,7 @@ :returning :foreign-address ;; StringVec ) -;;; linkage_get_words returns char**, which I dont' know how to handle. 
+;;; linkage_get_words returns char**, which I don't know how to handle. (ff:def-foreign-call linkage_get_words ((linkage :foreign-address)) :returning ((* (* :char)) (simple-array string (*)))) diff -Nru link-grammar-5.3.16/bindings/Makefile.in link-grammar-5.5.0/bindings/Makefile.in --- link-grammar-5.3.16/bindings/Makefile.in 2017-04-15 21:49:48.000000000 +0000 +++ link-grammar-5.5.0/bindings/Makefile.in 2018-04-29 21:40:02.000000000 +0000 @@ -200,6 +200,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -228,6 +229,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -236,13 +238,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -309,12 +314,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/bindings/ocaml/Makefile.in link-grammar-5.5.0/bindings/ocaml/Makefile.in --- link-grammar-5.3.16/bindings/ocaml/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/bindings/ocaml/Makefile.in 2018-04-29 21:40:02.000000000 +0000 @@ -131,6 +131,7 @@ AM_DEFAULT_VERBOSITY = 
@AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -159,6 +160,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -167,13 +169,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -240,12 +245,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/bindings/perl/clinkgrammar.pm link-grammar-5.5.0/bindings/perl/clinkgrammar.pm --- link-grammar-5.3.16/bindings/perl/clinkgrammar.pm 2017-01-19 20:16:16.000000000 +0000 +++ link-grammar-5.5.0/bindings/perl/clinkgrammar.pm 2018-04-29 21:29:50.000000000 +0000 @@ -50,6 +50,7 @@ package clinkgrammar; *linkgrammar_get_version = *clinkgrammarc::linkgrammar_get_version; +*linkgrammar_get_configuration = *clinkgrammarc::linkgrammar_get_configuration; *linkgrammar_get_dict_version = *clinkgrammarc::linkgrammar_get_dict_version; *linkgrammar_get_dict_locale = *clinkgrammarc::linkgrammar_get_dict_locale; *dictionary_create_lang = *clinkgrammarc::dictionary_create_lang; @@ -125,6 +126,10 @@ *linkage_get_link_domain_names = *clinkgrammarc::linkage_get_link_domain_names; *linkage_get_words = 
*clinkgrammarc::linkage_get_words; *linkage_get_word = *clinkgrammarc::linkage_get_word; +*linkage_get_word_byte_start = *clinkgrammarc::linkage_get_word_byte_start; +*linkage_get_word_byte_end = *clinkgrammarc::linkage_get_word_byte_end; +*linkage_get_word_char_start = *clinkgrammarc::linkage_get_word_char_start; +*linkage_get_word_char_end = *clinkgrammarc::linkage_get_word_char_end; *linkage_unused_word_cost = *clinkgrammarc::linkage_unused_word_cost; *linkage_disjunct_cost = *clinkgrammarc::linkage_disjunct_cost; *linkage_link_cost = *clinkgrammarc::linkage_link_cost; @@ -133,6 +138,7 @@ *_lg_error_formatmsg = *clinkgrammarc::_lg_error_formatmsg; *lg_error_clearall = *clinkgrammarc::lg_error_clearall; *_prt_error = *clinkgrammarc::_prt_error; +*lg_error_flush = *clinkgrammarc::lg_error_flush; ############# Class : clinkgrammar::lg_errinfo ############## @@ -185,3 +191,5195 @@ *SINGLE_LINE = *clinkgrammarc::SINGLE_LINE; *MAX_STYLES = *clinkgrammarc::MAX_STYLES; 1; +WIG_ERROR) ? r : SWIG_TypeError) + +/* The CastRankLimit says how many bits are used for the cast rank */ +#define SWIG_CASTRANKLIMIT (1 << 8) +/* The NewMask denotes the object was created (using new/malloc) */ +#define SWIG_NEWOBJMASK (SWIG_CASTRANKLIMIT << 1) +/* The TmpMask is for in/out typemaps that use temporal objects */ +#define SWIG_TMPOBJMASK (SWIG_NEWOBJMASK << 1) +/* Simple returning values */ +#define SWIG_BADOBJ (SWIG_ERROR) +#define SWIG_OLDOBJ (SWIG_OK) +#define SWIG_NEWOBJ (SWIG_OK | SWIG_NEWOBJMASK) +#define SWIG_TMPOBJ (SWIG_OK | SWIG_TMPOBJMASK) +/* Check, add and del mask methods */ +#define SWIG_AddNewMask(r) (SWIG_IsOK(r) ? (r | SWIG_NEWOBJMASK) : r) +#define SWIG_DelNewMask(r) (SWIG_IsOK(r) ? (r & ~SWIG_NEWOBJMASK) : r) +#define SWIG_IsNewObj(r) (SWIG_IsOK(r) && (r & SWIG_NEWOBJMASK)) +#define SWIG_AddTmpMask(r) (SWIG_IsOK(r) ? (r | SWIG_TMPOBJMASK) : r) +#define SWIG_DelTmpMask(r) (SWIG_IsOK(r) ? 
(r & ~SWIG_TMPOBJMASK) : r) +#define SWIG_IsTmpObj(r) (SWIG_IsOK(r) && (r & SWIG_TMPOBJMASK)) + +/* Cast-Rank Mode */ +#if defined(SWIG_CASTRANK_MODE) +# ifndef SWIG_TypeRank +# define SWIG_TypeRank unsigned long +# endif +# ifndef SWIG_MAXCASTRANK /* Default cast allowed */ +# define SWIG_MAXCASTRANK (2) +# endif +# define SWIG_CASTRANKMASK ((SWIG_CASTRANKLIMIT) -1) +# define SWIG_CastRank(r) (r & SWIG_CASTRANKMASK) +SWIGINTERNINLINE int SWIG_AddCast(int r) { + return SWIG_IsOK(r) ? ((SWIG_CastRank(r) < SWIG_MAXCASTRANK) ? (r + 1) : SWIG_ERROR) : r; +} +SWIGINTERNINLINE int SWIG_CheckState(int r) { + return SWIG_IsOK(r) ? SWIG_CastRank(r) + 1 : 0; +} +#else /* no cast-rank mode */ +# define SWIG_AddCast(r) (r) +# define SWIG_CheckState(r) (SWIG_IsOK(r) ? 1 : 0) +#endif + + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void *(*swig_converter_func)(void *, int *); +typedef struct swig_type_info *(*swig_dycast_func)(void **); + +/* Structure to store information on one type */ +typedef struct swig_type_info { + const char *name; /* mangled name of this type */ + const char *str; /* human readable name of this type */ + swig_dycast_func dcast; /* dynamic cast function down a hierarchy */ + struct swig_cast_info *cast; /* linked list of types that can cast into this type */ + void *clientdata; /* language specific type data */ + int owndata; /* flag if the structure owns the clientdata */ +} swig_type_info; + +/* Structure to store a type and conversion function used for casting */ +typedef struct swig_cast_info { + swig_type_info *type; /* pointer to type that is equivalent to this type */ + swig_converter_func converter; /* function to cast the void pointers */ + struct swig_cast_info *next; /* pointer to next cast in linked list */ + struct swig_cast_info *prev; /* pointer to the previous cast */ +} swig_cast_info; + +/* Structure used to store module information + * Each module generates one structure like this, and the runtime collects + * all 
of these structures and stores them in a circularly linked list.*/ +typedef struct swig_module_info { + swig_type_info **types; /* Array of pointers to swig_type_info structures that are in this module */ + size_t size; /* Number of types in this module */ + struct swig_module_info *next; /* Pointer to next element in circularly linked list */ + swig_type_info **type_initial; /* Array of initially generated type structures */ + swig_cast_info **cast_initial; /* Array of initially generated casting structures */ + void *clientdata; /* Language specific module data */ +} swig_module_info; + +/* + Compare two type names skipping the space characters, therefore + "char*" == "char *" and "Class" == "Class", etc. + + Return 0 when the two name types are equivalent, as in + strncmp, but skipping ' '. +*/ +SWIGRUNTIME int +SWIG_TypeNameComp(const char *f1, const char *l1, + const char *f2, const char *l2) { + for (;(f1 != l1) && (f2 != l2); ++f1, ++f2) { + while ((*f1 == ' ') && (f1 != l1)) ++f1; + while ((*f2 == ' ') && (f2 != l2)) ++f2; + if (*f1 != *f2) return (*f1 > *f2) ? 1 : -1; + } + return (int)((l1 - f1) - (l2 - f2)); +} + +/* + Check type equivalence in a name list like ||... + Return 0 if equal, -1 if nb < tb, 1 if nb > tb +*/ +SWIGRUNTIME int +SWIG_TypeCmp(const char *nb, const char *tb) { + int equiv = 1; + const char* te = tb + strlen(tb); + const char* ne = nb; + while (equiv != 0 && *ne) { + for (nb = ne; *ne; ++ne) { + if (*ne == '|') break; + } + equiv = SWIG_TypeNameComp(nb, ne, tb, te); + if (*ne) ++ne; + } + return equiv; +} + +/* + Check type equivalence in a name list like ||... + Return 0 if not equal, 1 if equal +*/ +SWIGRUNTIME int +SWIG_TypeEquiv(const char *nb, const char *tb) { + return SWIG_TypeCmp(nb, tb) == 0 ? 
1 : 0; +} + +/* + Check the typename +*/ +SWIGRUNTIME swig_cast_info * +SWIG_TypeCheck(const char *c, swig_type_info *ty) { + if (ty) { + swig_cast_info *iter = ty->cast; + while (iter) { + if (strcmp(iter->type->name, c) == 0) { + if (iter == ty->cast) + return iter; + /* Move iter to the top of the linked list */ + iter->prev->next = iter->next; + if (iter->next) + iter->next->prev = iter->prev; + iter->next = ty->cast; + iter->prev = 0; + if (ty->cast) ty->cast->prev = iter; + ty->cast = iter; + return iter; + } + iter = iter->next; + } + } + return 0; +} + +/* + Identical to SWIG_TypeCheck, except strcmp is replaced with a pointer comparison +*/ +SWIGRUNTIME swig_cast_info * +SWIG_TypeCheckStruct(swig_type_info *from, swig_type_info *ty) { + if (ty) { + swig_cast_info *iter = ty->cast; + while (iter) { + if (iter->type == from) { + if (iter == ty->cast) + return iter; + /* Move iter to the top of the linked list */ + iter->prev->next = iter->next; + if (iter->next) + iter->next->prev = iter->prev; + iter->next = ty->cast; + iter->prev = 0; + if (ty->cast) ty->cast->prev = iter; + ty->cast = iter; + return iter; + } + iter = iter->next; + } + } + return 0; +} + +/* + Cast a pointer up an inheritance hierarchy +*/ +SWIGRUNTIMEINLINE void * +SWIG_TypeCast(swig_cast_info *ty, void *ptr, int *newmemory) { + return ((!ty) || (!ty->converter)) ? ptr : (*ty->converter)(ptr, newmemory); +} + +/* + Dynamic pointer casting. 
Down an inheritance hierarchy +*/ +SWIGRUNTIME swig_type_info * +SWIG_TypeDynamicCast(swig_type_info *ty, void **ptr) { + swig_type_info *lastty = ty; + if (!ty || !ty->dcast) return ty; + while (ty && (ty->dcast)) { + ty = (*ty->dcast)(ptr); + if (ty) lastty = ty; + } + return lastty; +} + +/* + Return the name associated with this type +*/ +SWIGRUNTIMEINLINE const char * +SWIG_TypeName(const swig_type_info *ty) { + return ty->name; +} + +/* + Return the pretty name associated with this type, + that is an unmangled type name in a form presentable to the user. +*/ +SWIGRUNTIME const char * +SWIG_TypePrettyName(const swig_type_info *type) { + /* The "str" field contains the equivalent pretty names of the + type, separated by vertical-bar characters. We choose + to print the last name, as it is often (?) the most + specific. */ + if (!type) return NULL; + if (type->str != NULL) { + const char *last_name = type->str; + const char *s; + for (s = type->str; *s; s++) + if (*s == '|') last_name = s+1; + return last_name; + } + else + return type->name; +} + +/* + Set the clientdata field for a type +*/ +SWIGRUNTIME void +SWIG_TypeClientData(swig_type_info *ti, void *clientdata) { + swig_cast_info *cast = ti->cast; + /* if (ti->clientdata == clientdata) return; */ + ti->clientdata = clientdata; + + while (cast) { + if (!cast->converter) { + swig_type_info *tc = cast->type; + if (!tc->clientdata) { + SWIG_TypeClientData(tc, clientdata); + } + } + cast = cast->next; + } +} +SWIGRUNTIME void +SWIG_TypeNewClientData(swig_type_info *ti, void *clientdata) { + SWIG_TypeClientData(ti, clientdata); + ti->owndata = 1; +} + +/* + Search for a swig_type_info structure only by mangled name + Search is a O(log #types) + + We start searching at module start, and finish searching when start == end. + Note: if start == end at the beginning of the function, we go all the way around + the circular list. 
+*/ +SWIGRUNTIME swig_type_info * +SWIG_MangledTypeQueryModule(swig_module_info *start, + swig_module_info *end, + const char *name) { + swig_module_info *iter = start; + do { + if (iter->size) { + size_t l = 0; + size_t r = iter->size - 1; + do { + /* since l+r >= 0, we can (>> 1) instead (/ 2) */ + size_t i = (l + r) >> 1; + const char *iname = iter->types[i]->name; + if (iname) { + int compare = strcmp(name, iname); + if (compare == 0) { + return iter->types[i]; + } else if (compare < 0) { + if (i) { + r = i - 1; + } else { + break; + } + } else if (compare > 0) { + l = i + 1; + } + } else { + break; /* should never happen */ + } + } while (l <= r); + } + iter = iter->next; + } while (iter != end); + return 0; +} + +/* + Search for a swig_type_info structure for either a mangled name or a human readable name. + It first searches the mangled names of the types, which is a O(log #types) + If a type is not found it then searches the human readable names, which is O(#types). + + We start searching at module start, and finish searching when start == end. + Note: if start == end at the beginning of the function, we go all the way around + the circular list. 
+*/ +SWIGRUNTIME swig_type_info * +SWIG_TypeQueryModule(swig_module_info *start, + swig_module_info *end, + const char *name) { + /* STEP 1: Search the name field using binary search */ + swig_type_info *ret = SWIG_MangledTypeQueryModule(start, end, name); + if (ret) { + return ret; + } else { + /* STEP 2: If the type hasn't been found, do a complete search + of the str field (the human readable name) */ + swig_module_info *iter = start; + do { + size_t i = 0; + for (; i < iter->size; ++i) { + if (iter->types[i]->str && (SWIG_TypeEquiv(iter->types[i]->str, name))) + return iter->types[i]; + } + iter = iter->next; + } while (iter != end); + } + + /* neither found a match */ + return 0; +} + +/* + Pack binary data into a string +*/ +SWIGRUNTIME char * +SWIG_PackData(char *c, void *ptr, size_t sz) { + static const char hex[17] = "0123456789abcdef"; + const unsigned char *u = (unsigned char *) ptr; + const unsigned char *eu = u + sz; + for (; u != eu; ++u) { + unsigned char uu = *u; + *(c++) = hex[(uu & 0xf0) >> 4]; + *(c++) = hex[uu & 0xf]; + } + return c; +} + +/* + Unpack binary data from a string +*/ +SWIGRUNTIME const char * +SWIG_UnpackData(const char *c, void *ptr, size_t sz) { + unsigned char *u = (unsigned char *) ptr; + const unsigned char *eu = u + sz; + for (; u != eu; ++u) { + char d = *(c++); + unsigned char uu; + if ((d >= '0') && (d <= '9')) + uu = (unsigned char)((d - '0') << 4); + else if ((d >= 'a') && (d <= 'f')) + uu = (unsigned char)((d - ('a'-10)) << 4); + else + return (char *) 0; + d = *(c++); + if ((d >= '0') && (d <= '9')) + uu |= (unsigned char)(d - '0'); + else if ((d >= 'a') && (d <= 'f')) + uu |= (unsigned char)(d - ('a'-10)); + else + return (char *) 0; + *u = uu; + } + return c; +} + +/* + Pack 'void *' into a string buffer. 
+*/ +SWIGRUNTIME char * +SWIG_PackVoidPtr(char *buff, void *ptr, const char *name, size_t bsz) { + char *r = buff; + if ((2*sizeof(void *) + 2) > bsz) return 0; + *(r++) = '_'; + r = SWIG_PackData(r,&ptr,sizeof(void *)); + if (strlen(name) + 1 > (bsz - (r - buff))) return 0; + strcpy(r,name); + return buff; +} + +SWIGRUNTIME const char * +SWIG_UnpackVoidPtr(const char *c, void **ptr, const char *name) { + if (*c != '_') { + if (strcmp(c,"NULL") == 0) { + *ptr = (void *) 0; + return name; + } else { + return 0; + } + } + return SWIG_UnpackData(++c,ptr,sizeof(void *)); +} + +SWIGRUNTIME char * +SWIG_PackDataName(char *buff, void *ptr, size_t sz, const char *name, size_t bsz) { + char *r = buff; + size_t lname = (name ? strlen(name) : 0); + if ((2*sz + 2 + lname) > bsz) return 0; + *(r++) = '_'; + r = SWIG_PackData(r,ptr,sz); + if (lname) { + strncpy(r,name,lname+1); + } else { + *r = 0; + } + return buff; +} + +SWIGRUNTIME const char * +SWIG_UnpackDataName(const char *c, void *ptr, size_t sz, const char *name) { + if (*c != '_') { + if (strcmp(c,"NULL") == 0) { + memset(ptr,0,sz); + return name; + } else { + return 0; + } + } + return SWIG_UnpackData(++c,ptr,sz); +} + +#ifdef __cplusplus +} +#endif + +/* Errors in SWIG */ +#define SWIG_UnknownError -1 +#define SWIG_IOError -2 +#define SWIG_RuntimeError -3 +#define SWIG_IndexError -4 +#define SWIG_TypeError -5 +#define SWIG_DivisionByZero -6 +#define SWIG_OverflowError -7 +#define SWIG_SyntaxError -8 +#define SWIG_ValueError -9 +#define SWIG_SystemError -10 +#define SWIG_AttributeError -11 +#define SWIG_MemoryError -12 +#define SWIG_NullReferenceError -13 + + + +#ifdef __cplusplus +/* Needed on some windows machines---since MS plays funny games with the header files under C++ */ +#include +#include +extern "C" { +#endif +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" + +/* Add in functionality missing in older versions of Perl. Much of this is based on Devel-PPPort on cpan. 
*/ + +/* Add PERL_REVISION, PERL_VERSION, PERL_SUBVERSION if missing */ +#ifndef PERL_REVISION +# if !defined(__PATCHLEVEL_H_INCLUDED__) && !(defined(PATCHLEVEL) && defined(SUBVERSION)) +# define PERL_PATCHLEVEL_H_IMPLICIT +# include +# endif +# if !(defined(PERL_VERSION) || (defined(SUBVERSION) && defined(PATCHLEVEL))) +# include +# endif +# ifndef PERL_REVISION +# define PERL_REVISION (5) +# define PERL_VERSION PATCHLEVEL +# define PERL_SUBVERSION SUBVERSION +# endif +#endif + +#if defined(WIN32) && defined(PERL_OBJECT) && !defined(PerlIO_exportFILE) +#define PerlIO_exportFILE(fh,fl) (FILE*)(fh) +#endif + +#ifndef SvIOK_UV +# define SvIOK_UV(sv) (SvIOK(sv) && (SvUVX(sv) == SvIVX(sv))) +#endif + +#ifndef SvUOK +# define SvUOK(sv) SvIOK_UV(sv) +#endif + +#if ((PERL_VERSION < 4) || ((PERL_VERSION == 4) && (PERL_SUBVERSION <= 5))) +# define PL_sv_undef sv_undef +# define PL_na na +# define PL_errgv errgv +# define PL_sv_no sv_no +# define PL_sv_yes sv_yes +# define PL_markstack_ptr markstack_ptr +#endif + +#ifndef IVSIZE +# ifdef LONGSIZE +# define IVSIZE LONGSIZE +# else +# define IVSIZE 4 /* A bold guess, but the best we can make. 
*/ +# endif +#endif + +#ifndef INT2PTR +# if (IVSIZE == PTRSIZE) && (UVSIZE == PTRSIZE) +# define PTRV UV +# define INT2PTR(any,d) (any)(d) +# else +# if PTRSIZE == LONGSIZE +# define PTRV unsigned long +# else +# define PTRV unsigned +# endif +# define INT2PTR(any,d) (any)(PTRV)(d) +# endif + +# define NUM2PTR(any,d) (any)(PTRV)(d) +# define PTR2IV(p) INT2PTR(IV,p) +# define PTR2UV(p) INT2PTR(UV,p) +# define PTR2NV(p) NUM2PTR(NV,p) + +# if PTRSIZE == LONGSIZE +# define PTR2ul(p) (unsigned long)(p) +# else +# define PTR2ul(p) INT2PTR(unsigned long,p) +# endif +#endif /* !INT2PTR */ + +#ifndef SvPV_nolen +# define SvPV_nolen(x) SvPV(x,PL_na) +#endif + +#ifndef get_sv +# define get_sv perl_get_sv +#endif + +#ifndef ERRSV +# define ERRSV get_sv("@",FALSE) +#endif + +#ifndef pTHX_ +#define pTHX_ +#endif + +#include +#ifdef __cplusplus +} +#endif + +/* ----------------------------------------------------------------------------- + * error manipulation + * ----------------------------------------------------------------------------- */ + +SWIGINTERN const char* +SWIG_Perl_ErrorType(int code) { + switch(code) { + case SWIG_MemoryError: + return "MemoryError"; + case SWIG_IOError: + return "IOError"; + case SWIG_RuntimeError: + return "RuntimeError"; + case SWIG_IndexError: + return "IndexError"; + case SWIG_TypeError: + return "TypeError"; + case SWIG_DivisionByZero: + return "ZeroDivisionError"; + case SWIG_OverflowError: + return "OverflowError"; + case SWIG_SyntaxError: + return "SyntaxError"; + case SWIG_ValueError: + return "ValueError"; + case SWIG_SystemError: + return "SystemError"; + case SWIG_AttributeError: + return "AttributeError"; + default: + return "RuntimeError"; + } +} + + +/* ----------------------------------------------------------------------------- + * perlrun.swg + * + * This file contains the runtime support for Perl modules + * and includes code for managing global variables and pointer + * type checking. 
+ * ----------------------------------------------------------------------------- */ + +#ifdef PERL_OBJECT +#define SWIG_PERL_OBJECT_DECL CPerlObj *SWIGUNUSEDPARM(pPerl), +#define SWIG_PERL_OBJECT_CALL pPerl, +#else +#define SWIG_PERL_OBJECT_DECL +#define SWIG_PERL_OBJECT_CALL +#endif + +/* Common SWIG API */ + +/* for raw pointers */ +#define SWIG_ConvertPtr(obj, pp, type, flags) SWIG_Perl_ConvertPtr(SWIG_PERL_OBJECT_CALL obj, pp, type, flags) +#define SWIG_ConvertPtrAndOwn(obj, pp, type, flags,own) SWIG_Perl_ConvertPtrAndOwn(SWIG_PERL_OBJECT_CALL obj, pp, type, flags, own) +#define SWIG_NewPointerObj(p, type, flags) SWIG_Perl_NewPointerObj(SWIG_PERL_OBJECT_CALL p, type, flags) +#define swig_owntype int + +/* for raw packed data */ +#define SWIG_ConvertPacked(obj, p, s, type) SWIG_Perl_ConvertPacked(SWIG_PERL_OBJECT_CALL obj, p, s, type) +#define SWIG_NewPackedObj(p, s, type) SWIG_Perl_NewPackedObj(SWIG_PERL_OBJECT_CALL p, s, type) + +/* for class or struct pointers */ +#define SWIG_ConvertInstance(obj, pptr, type, flags) SWIG_ConvertPtr(obj, pptr, type, flags) +#define SWIG_NewInstanceObj(ptr, type, flags) SWIG_NewPointerObj(ptr, type, flags) + +/* for C or C++ function pointers */ +#define SWIG_ConvertFunctionPtr(obj, pptr, type) SWIG_ConvertPtr(obj, pptr, type, 0) +#define SWIG_NewFunctionPtrObj(ptr, type) SWIG_NewPointerObj(ptr, type, 0) + +/* for C++ member pointers, ie, member methods */ +#define SWIG_ConvertMember(obj, ptr, sz, ty) SWIG_ConvertPacked(obj, ptr, sz, ty) +#define SWIG_NewMemberObj(ptr, sz, type) SWIG_NewPackedObj(ptr, sz, type) + + +/* Runtime API */ + +#define SWIG_GetModule(clientdata) SWIG_Perl_GetModule(clientdata) +#define SWIG_SetModule(clientdata, pointer) SWIG_Perl_SetModule(pointer) + + +/* Error manipulation */ + +#define SWIG_ErrorType(code) SWIG_Perl_ErrorType(code) +#define SWIG_Error(code, msg) sv_setpvf(get_sv("@", GV_ADD), "%s %s", SWIG_ErrorType(code), msg) +#define SWIG_fail goto fail + +/* Perl-specific SWIG API */ + 
+#define SWIG_MakePtr(sv, ptr, type, flags) SWIG_Perl_MakePtr(SWIG_PERL_OBJECT_CALL sv, ptr, type, flags) +#define SWIG_MakePackedObj(sv, p, s, type) SWIG_Perl_MakePackedObj(SWIG_PERL_OBJECT_CALL sv, p, s, type) +#define SWIG_SetError(str) SWIG_Error(SWIG_RuntimeError, str) + + +#define SWIG_PERL_DECL_ARGS_1(arg1) (SWIG_PERL_OBJECT_DECL arg1) +#define SWIG_PERL_CALL_ARGS_1(arg1) (SWIG_PERL_OBJECT_CALL arg1) +#define SWIG_PERL_DECL_ARGS_2(arg1, arg2) (SWIG_PERL_OBJECT_DECL arg1, arg2) +#define SWIG_PERL_CALL_ARGS_2(arg1, arg2) (SWIG_PERL_OBJECT_CALL arg1, arg2) + +/* ----------------------------------------------------------------------------- + * pointers/data manipulation + * ----------------------------------------------------------------------------- */ + +/* For backward compatibility only */ +#define SWIG_POINTER_EXCEPTION 0 + +#ifdef __cplusplus +extern "C" { +#endif + +#define SWIG_OWNER SWIG_POINTER_OWN +#define SWIG_SHADOW SWIG_OWNER << 1 + +#define SWIG_MAYBE_PERL_OBJECT SWIG_PERL_OBJECT_DECL + +/* SWIG Perl macros */ + +/* Macro to declare an XS function */ +#ifndef XSPROTO +# define XSPROTO(name) void name(pTHX_ CV* cv) +#endif + +/* Macro to call an XS function */ +#ifdef PERL_OBJECT +# define SWIG_CALLXS(_name) _name(cv,pPerl) +#else +# ifndef MULTIPLICITY +# define SWIG_CALLXS(_name) _name(cv) +# else +# define SWIG_CALLXS(_name) _name(PERL_GET_THX, cv) +# endif +#endif + +#ifdef PERL_OBJECT +#define MAGIC_PPERL CPerlObj *pPerl = (CPerlObj *) this; + +#ifdef __cplusplus +extern "C" { +#endif +typedef int (CPerlObj::*SwigMagicFunc)(SV *, MAGIC *); +#ifdef __cplusplus +} +#endif + +#define SWIG_MAGIC(a,b) (SV *a, MAGIC *b) +#define SWIGCLASS_STATIC + +#else /* PERL_OBJECT */ + +#define MAGIC_PPERL +#define SWIGCLASS_STATIC static SWIGUNUSED + +#ifndef MULTIPLICITY +#define SWIG_MAGIC(a,b) (SV *a, MAGIC *b) + +#ifdef __cplusplus +extern "C" { +#endif +typedef int (*SwigMagicFunc)(SV *, MAGIC *); +#ifdef __cplusplus +} +#endif + +#else /* MULTIPLICITY */ 
+ +#define SWIG_MAGIC(a,b) (struct interpreter *interp, SV *a, MAGIC *b) + +#ifdef __cplusplus +extern "C" { +#endif +typedef int (*SwigMagicFunc)(struct interpreter *, SV *, MAGIC *); +#ifdef __cplusplus +} +#endif + +#endif /* MULTIPLICITY */ +#endif /* PERL_OBJECT */ + +# ifdef PERL_OBJECT +# define SWIG_croak_null() SWIG_Perl_croak_null(pPerl) +static void SWIGUNUSED SWIG_Perl_croak_null(CPerlObj *pPerl) +# else +static void SWIGUNUSED SWIG_croak_null() +# endif +{ + SV *err = get_sv("@", GV_ADD); +# if (PERL_VERSION < 6) + croak("%_", err); +# else + if (sv_isobject(err)) + croak(0); + else + croak("%s", SvPV_nolen(err)); +# endif +} + + +/* + Define how strict is the cast between strings and integers/doubles + when overloading between these types occurs. + + The default is making it as strict as possible by using SWIG_AddCast + when needed. + + You can use -DSWIG_PERL_NO_STRICT_STR2NUM at compilation time to + disable the SWIG_AddCast, making the casting between string and + numbers less strict. + + In the end, we try to solve the overloading between strings and + numerical types in the more natural way, but if you can avoid it, + well, avoid it using %rename, for example. 
+*/ +#ifndef SWIG_PERL_NO_STRICT_STR2NUM +# ifndef SWIG_PERL_STRICT_STR2NUM +# define SWIG_PERL_STRICT_STR2NUM +# endif +#endif +#ifdef SWIG_PERL_STRICT_STR2NUM +/* string takes precedence */ +#define SWIG_Str2NumCast(x) SWIG_AddCast(x) +#else +/* number takes precedence */ +#define SWIG_Str2NumCast(x) x +#endif + + + +#include + +SWIGRUNTIME const char * +SWIG_Perl_TypeProxyName(const swig_type_info *type) { + if (!type) return NULL; + if (type->clientdata != NULL) { + return (const char*) type->clientdata; + } + else { + return type->name; + } +} + +/* Identical to SWIG_TypeCheck, except for strcmp comparison */ +SWIGRUNTIME swig_cast_info * +SWIG_TypeProxyCheck(const char *c, swig_type_info *ty) { + if (ty) { + swig_cast_info *iter = ty->cast; + while (iter) { + if (strcmp(SWIG_Perl_TypeProxyName(iter->type), c) == 0) { + if (iter == ty->cast) + return iter; + /* Move iter to the top of the linked list */ + iter->prev->next = iter->next; + if (iter->next) + iter->next->prev = iter->prev; + iter->next = ty->cast; + iter->prev = 0; + if (ty->cast) ty->cast->prev = iter; + ty->cast = iter; + return iter; + } + iter = iter->next; + } + } + return 0; +} + +/* Function for getting a pointer value */ + +SWIGRUNTIME int +SWIG_Perl_ConvertPtrAndOwn(SWIG_MAYBE_PERL_OBJECT SV *sv, void **ptr, swig_type_info *_t, int flags, int *own) { + swig_cast_info *tc; + void *voidptr = (void *)0; + SV *tsv = 0; + + if (own) + *own = 0; + + /* If magical, apply more magic */ + if (SvGMAGICAL(sv)) + mg_get(sv); + + /* Check to see if this is an object */ + if (sv_isobject(sv)) { + IV tmp = 0; + tsv = (SV*) SvRV(sv); + if ((SvTYPE(tsv) == SVt_PVHV)) { + MAGIC *mg; + if (SvMAGICAL(tsv)) { + mg = mg_find(tsv,'P'); + if (mg) { + sv = mg->mg_obj; + if (sv_isobject(sv)) { + tsv = (SV*)SvRV(sv); + tmp = SvIV(tsv); + } + } + } else { + return SWIG_ERROR; + } + } else { + tmp = SvIV(tsv); + } + voidptr = INT2PTR(void *,tmp); + } else if (! 
SvOK(sv)) { /* Check for undef */ + *(ptr) = (void *) 0; + return SWIG_OK; + } else if (SvTYPE(sv) == SVt_RV) { /* Check for NULL pointer */ + if (!SvROK(sv)) { + /* In Perl 5.12 and later, SVt_RV == SVt_IV, so sv could be a valid integer value. */ + if (SvIOK(sv)) { + return SWIG_ERROR; + } else { + /* NULL pointer (reference to undef). */ + *(ptr) = (void *) 0; + return SWIG_OK; + } + } else { + return SWIG_ERROR; + } + } else { /* Don't know what it is */ + return SWIG_ERROR; + } + if (_t) { + /* Now see if the types match */ + char *_c = HvNAME(SvSTASH(SvRV(sv))); + tc = SWIG_TypeProxyCheck(_c,_t); +#ifdef SWIG_DIRECTORS + if (!tc && !sv_derived_from(sv,SWIG_Perl_TypeProxyName(_t))) { +#else + if (!tc) { +#endif + return SWIG_ERROR; + } + { + int newmemory = 0; + *ptr = SWIG_TypeCast(tc,voidptr,&newmemory); + if (newmemory == SWIG_CAST_NEW_MEMORY) { + assert(own); /* badly formed typemap which will lead to a memory leak - it must set and use own to delete *ptr */ + if (own) + *own = *own | SWIG_CAST_NEW_MEMORY; + } + } + } else { + *ptr = voidptr; + } + + /* + * DISOWN implementation: we need a perl guru to check this one. + */ + if (tsv && (flags & SWIG_POINTER_DISOWN)) { + /* + * almost copy paste code from below SWIG_POINTER_OWN setting + */ + SV *obj = sv; + HV *stash = SvSTASH(SvRV(obj)); + GV *gv = *(GV**)hv_fetch(stash, "OWNER", 5, TRUE); + if (isGV(gv)) { + HV *hv = GvHVn(gv); + /* + * To set ownership (see below), a newSViv(1) entry is added. + * Hence, to remove ownership, we delete the entry. 
+ */ + if (hv_exists_ent(hv, obj, 0)) { + hv_delete_ent(hv, obj, 0, 0); + } + } + } + return SWIG_OK; +} + +SWIGRUNTIME int +SWIG_Perl_ConvertPtr(SWIG_MAYBE_PERL_OBJECT SV *sv, void **ptr, swig_type_info *_t, int flags) { + return SWIG_Perl_ConvertPtrAndOwn(sv, ptr, _t, flags, 0); +} + +SWIGRUNTIME void +SWIG_Perl_MakePtr(SWIG_MAYBE_PERL_OBJECT SV *sv, void *ptr, swig_type_info *t, int flags) { + if (ptr && (flags & (SWIG_SHADOW | SWIG_POINTER_OWN))) { + SV *self; + SV *obj=newSV(0); + HV *hash=newHV(); + HV *stash; + sv_setref_pv(obj, SWIG_Perl_TypeProxyName(t), ptr); + stash=SvSTASH(SvRV(obj)); + if (flags & SWIG_POINTER_OWN) { + HV *hv; + GV *gv = *(GV**)hv_fetch(stash, "OWNER", 5, TRUE); + if (!isGV(gv)) + gv_init(gv, stash, "OWNER", 5, FALSE); + hv=GvHVn(gv); + hv_store_ent(hv, obj, newSViv(1), 0); + } + sv_magic((SV *)hash, (SV *)obj, 'P', Nullch, 0); + SvREFCNT_dec(obj); + self=newRV_noinc((SV *)hash); + sv_setsv(sv, self); + SvREFCNT_dec((SV *)self); + sv_bless(sv, stash); + } + else { + sv_setref_pv(sv, SWIG_Perl_TypeProxyName(t), ptr); + } +} + +SWIGRUNTIMEINLINE SV * +SWIG_Perl_NewPointerObj(SWIG_MAYBE_PERL_OBJECT void *ptr, swig_type_info *t, int flags) { + SV *result = sv_newmortal(); + SWIG_MakePtr(result, ptr, t, flags); + return result; +} + +SWIGRUNTIME void +SWIG_Perl_MakePackedObj(SWIG_MAYBE_PERL_OBJECT SV *sv, void *ptr, int sz, swig_type_info *type) { + char result[1024]; + char *r = result; + if ((2*sz + 1 + strlen(SWIG_Perl_TypeProxyName(type))) > 1000) return; + *(r++) = '_'; + r = SWIG_PackData(r,ptr,sz); + strcpy(r,SWIG_Perl_TypeProxyName(type)); + sv_setpv(sv, result); +} + +SWIGRUNTIME SV * +SWIG_Perl_NewPackedObj(SWIG_MAYBE_PERL_OBJECT void *ptr, int sz, swig_type_info *type) { + SV *result = sv_newmortal(); + SWIG_Perl_MakePackedObj(result, ptr, sz, type); + return result; +} + +/* Convert a packed value value */ +SWIGRUNTIME int +SWIG_Perl_ConvertPacked(SWIG_MAYBE_PERL_OBJECT SV *obj, void *ptr, int sz, swig_type_info *ty) { + 
swig_cast_info *tc; + const char *c = 0; + + if ((!obj) || (!SvOK(obj))) return SWIG_ERROR; + c = SvPV_nolen(obj); + /* Pointer values must start with leading underscore */ + if (*c != '_') return SWIG_ERROR; + c++; + c = SWIG_UnpackData(c,ptr,sz); + if (ty) { + tc = SWIG_TypeCheck(c,ty); + if (!tc) return SWIG_ERROR; + } + return SWIG_OK; +} + + +/* Macros for low-level exception handling */ +#define SWIG_croak(x) { SWIG_Error(SWIG_RuntimeError, x); SWIG_fail; } + + +typedef XSPROTO(SwigPerlWrapper); +typedef SwigPerlWrapper *SwigPerlWrapperPtr; + +/* Structure for command table */ +typedef struct { + const char *name; + SwigPerlWrapperPtr wrapper; +} swig_command_info; + +/* Information for constant table */ + +#define SWIG_INT 1 +#define SWIG_FLOAT 2 +#define SWIG_STRING 3 +#define SWIG_POINTER 4 +#define SWIG_BINARY 5 + +/* Constant information structure */ +typedef struct swig_constant_info { + int type; + const char *name; + long lvalue; + double dvalue; + void *pvalue; + swig_type_info **ptype; +} swig_constant_info; + + +/* Structure for variable table */ +typedef struct { + const char *name; + SwigMagicFunc set; + SwigMagicFunc get; + swig_type_info **type; +} swig_variable_info; + +/* Magic variable code */ +#ifndef PERL_OBJECT +# ifdef __cplusplus +# define swig_create_magic(s,a,b,c) _swig_create_magic(s,const_cast(a),b,c) +# else +# define swig_create_magic(s,a,b,c) _swig_create_magic(s,(char*)(a),b,c) +# endif +# ifndef MULTIPLICITY +SWIGRUNTIME void _swig_create_magic(SV *sv, char *name, int (*set)(SV *, MAGIC *), int (*get)(SV *,MAGIC *)) +# else +SWIGRUNTIME void _swig_create_magic(SV *sv, char *name, int (*set)(struct interpreter*, SV *, MAGIC *), int (*get)(struct interpreter*, SV *,MAGIC *)) +# endif +#else +# define swig_create_magic(s,a,b,c) _swig_create_magic(pPerl,s,a,b,c) +SWIGRUNTIME void _swig_create_magic(CPerlObj *pPerl, SV *sv, const char *name, int (CPerlObj::*set)(SV *, MAGIC *), int (CPerlObj::*get)(SV *, MAGIC *)) +#endif +{ + MAGIC 
*mg; + sv_magic(sv,sv,'U',name,strlen(name)); + mg = mg_find(sv,'U'); + mg->mg_virtual = (MGVTBL *) malloc(sizeof(MGVTBL)); + mg->mg_virtual->svt_get = (SwigMagicFunc) get; + mg->mg_virtual->svt_set = (SwigMagicFunc) set; + mg->mg_virtual->svt_len = 0; + mg->mg_virtual->svt_clear = 0; + mg->mg_virtual->svt_free = 0; +} + + +SWIGRUNTIME swig_module_info * +SWIG_Perl_GetModule(void *SWIGUNUSEDPARM(clientdata)) { + static void *type_pointer = (void *)0; + SV *pointer; + + /* first check if pointer already created */ + if (!type_pointer) { + pointer = get_sv("swig_runtime_data::type_pointer" SWIG_RUNTIME_VERSION SWIG_TYPE_TABLE_NAME, FALSE | GV_ADDMULTI); + if (pointer && SvOK(pointer)) { + type_pointer = INT2PTR(swig_type_info **, SvIV(pointer)); + } + } + + return (swig_module_info *) type_pointer; +} + +SWIGRUNTIME void +SWIG_Perl_SetModule(swig_module_info *module) { + SV *pointer; + + /* create a new pointer */ + pointer = get_sv("swig_runtime_data::type_pointer" SWIG_RUNTIME_VERSION SWIG_TYPE_TABLE_NAME, TRUE | GV_ADDMULTI); + sv_setiv(pointer, PTR2IV(module)); +} + +#ifdef __cplusplus +} +#endif + +/* Workaround perl5 global namespace pollution. Note that undefining library + * functions like fopen will not solve the problem on all platforms as fopen + * might be a macro on Windows but not necessarily on other operating systems. 
*/ +#ifdef do_open + #undef do_open +#endif +#ifdef do_close + #undef do_close +#endif +#ifdef do_exec + #undef do_exec +#endif +#ifdef scalar + #undef scalar +#endif +#ifdef list + #undef list +#endif +#ifdef apply + #undef apply +#endif +#ifdef convert + #undef convert +#endif +#ifdef Error + #undef Error +#endif +#ifdef form + #undef form +#endif +#ifdef vform + #undef vform +#endif +#ifdef LABEL + #undef LABEL +#endif +#ifdef METHOD + #undef METHOD +#endif +#ifdef Move + #undef Move +#endif +#ifdef yylex + #undef yylex +#endif +#ifdef yyparse + #undef yyparse +#endif +#ifdef yyerror + #undef yyerror +#endif +#ifdef invert + #undef invert +#endif +#ifdef ref + #undef ref +#endif +#ifdef read + #undef read +#endif +#ifdef write + #undef write +#endif +#ifdef eof + #undef eof +#endif +#ifdef close + #undef close +#endif +#ifdef rewind + #undef rewind +#endif +#ifdef free + #undef free +#endif +#ifdef malloc + #undef malloc +#endif +#ifdef calloc + #undef calloc +#endif +#ifdef Stat + #undef Stat +#endif +#ifdef check + #undef check +#endif +#ifdef seekdir + #undef seekdir +#endif +#ifdef open + #undef open +#endif +#ifdef readdir + #undef readdir +#endif +#ifdef bind + #undef bind +#endif +#ifdef access + #undef access +#endif +#ifdef stat + #undef stat +#endif +#ifdef seed + #undef seed +#endif + +#ifdef bool + /* Leave if macro is from C99 stdbool.h */ + #ifndef __bool_true_false_are_defined + #undef bool + #endif +#endif + + + + +#define SWIG_exception_fail(code, msg) do { SWIG_Error(code, msg); SWIG_fail; } while(0) + +#define SWIG_contract_assert(expr, msg) if (!(expr)) { SWIG_Error(SWIG_RuntimeError, msg); SWIG_fail; } else + + + +/* -------- TYPES TABLE (BEGIN) -------- */ + +#define SWIGTYPE_p_ConstituentDisplayStyle swig_types[0] +#define SWIGTYPE_p_Cost_Model_type swig_types[1] +#define SWIGTYPE_p_Dictionary_s swig_types[2] +#define SWIGTYPE_p_Linkage_s swig_types[3] +#define SWIGTYPE_p_Parse_Options_s swig_types[4] +#define SWIGTYPE_p_Sentence_s 
swig_types[5] +#define SWIGTYPE_p_char swig_types[6] +#define SWIGTYPE_p_lg_errinfo swig_types[7] +#define SWIGTYPE_p_lg_error_severity swig_types[8] +#define SWIGTYPE_p_p_char swig_types[9] +#define SWIGTYPE_p_size_t swig_types[10] +static swig_type_info *swig_types[12]; +static swig_module_info swig_module = {swig_types, 11, 0, 0, 0, 0}; +#define SWIG_TypeQuery(name) SWIG_TypeQueryModule(&swig_module, &swig_module, name) +#define SWIG_MangledTypeQuery(name) SWIG_MangledTypeQueryModule(&swig_module, &swig_module, name) + +/* -------- TYPES TABLE (END) -------- */ + +#define SWIG_init boot_clinkgrammar + +#define SWIG_name "clinkgrammarc::boot_clinkgrammar" +#define SWIG_prefix "clinkgrammarc::" + +#define SWIGVERSION 0x030010 +#define SWIG_VERSION SWIGVERSION + + +#define SWIG_as_voidptr(a) const_cast< void * >(static_cast< const void * >(a)) +#define SWIG_as_voidptrptr(a) ((void)SWIG_as_voidptr(*a),reinterpret_cast< void** >(a)) + + +#include + + +#ifdef __cplusplus +extern "C" +#endif +#ifndef PERL_OBJECT +#ifndef MULTIPLICITY +SWIGEXPORT void SWIG_init (CV* cv); +#else +SWIGEXPORT void SWIG_init (pTHXo_ CV* cv); +#endif +#else +SWIGEXPORT void SWIG_init (CV *cv, CPerlObj *); +#endif + + + +#include + + + +SWIGINTERNINLINE SV * +SWIG_From_long SWIG_PERL_DECL_ARGS_1(long value) +{ + SV *sv; + if (IVSIZE >= sizeof(value) || (value >= IV_MIN && value <= IV_MAX)) + sv = newSViv(value); + else + sv = newSVpvf("%ld", value); + return sv_2mortal(sv); +} + + +SWIGINTERNINLINE SV * +SWIG_From_int SWIG_PERL_DECL_ARGS_1(int value) +{ + return SWIG_From_long SWIG_PERL_CALL_ARGS_1(value); +} + + +SWIGINTERNINLINE SV * +SWIG_FromCharPtrAndSize(const char* carray, size_t size) +{ + SV *obj = sv_newmortal(); + if (carray) { + sv_setpvn(obj, carray, size); + } else { + sv_setsv(obj, &PL_sv_undef); + } + return obj; +} + + +SWIGINTERNINLINE SV * +SWIG_FromCharPtr(const char *cptr) +{ + return SWIG_FromCharPtrAndSize(cptr, (cptr ? 
strlen(cptr) : 0)); +} + + +SWIGINTERN swig_type_info* +SWIG_pchar_descriptor(void) +{ + static int init = 0; + static swig_type_info* info = 0; + if (!init) { + info = SWIG_TypeQuery("_p_char"); + init = 1; + } + return info; +} + + +SWIGINTERN int +SWIG_AsCharPtrAndSize(SV *obj, char** cptr, size_t* psize, int *alloc) +{ + if (SvMAGICAL(obj)) { + SV *tmp = sv_newmortal(); + SvSetSV(tmp, obj); + obj = tmp; + } + if (SvPOK(obj)) { + STRLEN len = 0; + char *cstr = SvPV(obj, len); + size_t size = len + 1; + if (cptr) { + if (alloc) { + if (*alloc == SWIG_NEWOBJ) { + *cptr = reinterpret_cast< char* >(memcpy((new char[size]), cstr, sizeof(char)*(size))); + } else { + *cptr = cstr; + *alloc = SWIG_OLDOBJ; + } + } + } + if (psize) *psize = size; + return SWIG_OK; + } else { + swig_type_info* pchar_descriptor = SWIG_pchar_descriptor(); + if (pchar_descriptor) { + char* vptr = 0; + if (SWIG_ConvertPtr(obj, (void**)&vptr, pchar_descriptor, 0) == SWIG_OK) { + if (cptr) *cptr = vptr; + if (psize) *psize = vptr ? 
(strlen(vptr) + 1) : 0; + if (alloc) *alloc = SWIG_OLDOBJ; + return SWIG_OK; + } + } + } + return SWIG_TypeError; +} + + + + + +#include +#if !defined(SWIG_NO_LLONG_MAX) +# if !defined(LLONG_MAX) && defined(__GNUC__) && defined (__LONG_LONG_MAX__) +# define LLONG_MAX __LONG_LONG_MAX__ +# define LLONG_MIN (-LLONG_MAX - 1LL) +# define ULLONG_MAX (LLONG_MAX * 2ULL + 1ULL) +# endif +#endif + + +#include +#ifdef _MSC_VER +# ifndef strtoull +# define strtoull _strtoui64 +# endif +# ifndef strtoll +# define strtoll _strtoi64 +# endif +#endif + + +SWIGINTERN int +SWIG_AsVal_double SWIG_PERL_DECL_ARGS_2(SV *obj, double *val) +{ + if (SvNIOK(obj)) { + if (val) *val = SvNV(obj); + return SWIG_OK; + } else if (SvIOK(obj)) { + if (val) *val = (double) SvIV(obj); + return SWIG_AddCast(SWIG_OK); + } else { + const char *nptr = SvPV_nolen(obj); + if (nptr) { + char *endptr; + double v; + errno = 0; + v = strtod(nptr, &endptr); + if (errno == ERANGE) { + errno = 0; + return SWIG_OverflowError; + } else { + if (*endptr == '\0') { + if (val) *val = v; + return SWIG_Str2NumCast(SWIG_OK); + } + } + } + } + return SWIG_TypeError; +} + + +#include + + +#include + + +SWIGINTERNINLINE int +SWIG_CanCastAsInteger(double *d, double min, double max) { + double x = *d; + if ((min <= x && x <= max)) { + double fx = floor(x); + double cx = ceil(x); + double rd = ((x - fx) < 0.5) ? 
fx : cx; /* simple rint */ + if ((errno == EDOM) || (errno == ERANGE)) { + errno = 0; + } else { + double summ, reps, diff; + if (rd < x) { + diff = x - rd; + } else if (rd > x) { + diff = rd - x; + } else { + return 1; + } + summ = rd + x; + reps = diff/summ; + if (reps < 8*DBL_EPSILON) { + *d = rd; + return 1; + } + } + } + return 0; +} + + +SWIGINTERN int +SWIG_AsVal_long SWIG_PERL_DECL_ARGS_2(SV *obj, long* val) +{ + if (SvUOK(obj)) { + UV v = SvUV(obj); + if (UVSIZE < sizeof(*val) || v <= LONG_MAX) { + if (val) *val = v; + return SWIG_OK; + } + return SWIG_OverflowError; + } else if (SvIOK(obj)) { + IV v = SvIV(obj); + if (IVSIZE <= sizeof(*val) || (v >= LONG_MIN && v <= LONG_MAX)) { + if(val) *val = v; + return SWIG_OK; + } + return SWIG_OverflowError; + } else { + int dispatch = 0; + const char *nptr = SvPV_nolen(obj); + if (nptr) { + char *endptr; + long v; + errno = 0; + v = strtol(nptr, &endptr,0); + if (errno == ERANGE) { + errno = 0; + return SWIG_OverflowError; + } else { + if (*endptr == '\0') { + if (val) *val = v; + return SWIG_Str2NumCast(SWIG_OK); + } + } + } + if (!dispatch) { + double d; + int res = SWIG_AddCast(SWIG_AsVal_double SWIG_PERL_CALL_ARGS_2(obj,&d)); + if (SWIG_IsOK(res) && SWIG_CanCastAsInteger(&d, LONG_MIN, LONG_MAX)) { + if (val) *val = (long)(d); + return res; + } + } + } + return SWIG_TypeError; +} + + +SWIGINTERN int +SWIG_AsVal_int SWIG_PERL_DECL_ARGS_2(SV * obj, int *val) +{ + long v; + int res = SWIG_AsVal_long SWIG_PERL_CALL_ARGS_2(obj, &v); + if (SWIG_IsOK(res)) { + if ((v < INT_MIN || v > INT_MAX)) { + return SWIG_OverflowError; + } else { + if (val) *val = static_cast< int >(v); + } + } + return res; +} + + +SWIGINTERNINLINE SV * +SWIG_From_double SWIG_PERL_DECL_ARGS_1(double value) +{ + return sv_2mortal(newSVnv(value)); +} + + +SWIGINTERN int +SWIG_AsVal_unsigned_SS_long SWIG_PERL_DECL_ARGS_2(SV *obj, unsigned long *val) +{ + if (SvUOK(obj)) { + UV v = SvUV(obj); + if (UVSIZE <= sizeof(*val) || v <= ULONG_MAX) { + if 
(val) *val = v; + return SWIG_OK; + } + return SWIG_OverflowError; + } else if (SvIOK(obj)) { + IV v = SvIV(obj); + if (v >= 0 && (IVSIZE <= sizeof(*val) || v <= ULONG_MAX)) { + if (val) *val = v; + return SWIG_OK; + } + return SWIG_OverflowError; + } else { + int dispatch = 0; + const char *nptr = SvPV_nolen(obj); + if (nptr) { + char *endptr; + unsigned long v; + errno = 0; + v = strtoul(nptr, &endptr,0); + if (errno == ERANGE) { + errno = 0; + return SWIG_OverflowError; + } else { + if (*endptr == '\0') { + if (val) *val = v; + return SWIG_Str2NumCast(SWIG_OK); + } + } + } + if (!dispatch) { + double d; + int res = SWIG_AddCast(SWIG_AsVal_double SWIG_PERL_CALL_ARGS_2(obj,&d)); + if (SWIG_IsOK(res) && SWIG_CanCastAsInteger(&d, 0, ULONG_MAX)) { + if (val) *val = (unsigned long)(d); + return res; + } + } + } + return SWIG_TypeError; +} + + +#if defined(LLONG_MAX) && !defined(SWIG_LONG_LONG_AVAILABLE) +# define SWIG_LONG_LONG_AVAILABLE +#endif + + +#ifdef SWIG_LONG_LONG_AVAILABLE +SWIGINTERN int +SWIG_AsVal_unsigned_SS_long_SS_long SWIG_PERL_DECL_ARGS_2(SV *obj, unsigned long long *val) +{ + if (SvUOK(obj)) { + /* pretty sure this should be conditional on + * (UVSIZE <= sizeof(*val) || v <= ULLONG_MAX) */ + if (val) *val = SvUV(obj); + return SWIG_OK; + } else if (SvIOK(obj)) { + IV v = SvIV(obj); + if (v >= 0 && (IVSIZE <= sizeof(*val) || v <= ULLONG_MAX)) { + if (val) *val = v; + return SWIG_OK; + } else { + return SWIG_OverflowError; + } + } else { + int dispatch = 0; + const char *nptr = SvPV_nolen(obj); + if (nptr) { + char *endptr; + unsigned long long v; + errno = 0; + v = strtoull(nptr, &endptr,0); + if (errno == ERANGE) { + errno = 0; + return SWIG_OverflowError; + } else { + if (*endptr == '\0') { + if (val) *val = v; + return SWIG_Str2NumCast(SWIG_OK); + } + } + } + if (!dispatch) { + const double mant_max = 1LL << DBL_MANT_DIG; + double d; + int res = SWIG_AddCast(SWIG_AsVal_double SWIG_PERL_CALL_ARGS_2(obj,&d)); + if (SWIG_IsOK(res) && 
SWIG_CanCastAsInteger(&d, 0, mant_max)) { + if (val) *val = (unsigned long long)(d); + return res; + } + } + } + return SWIG_TypeError; +} +#endif + + +SWIGINTERNINLINE int +SWIG_AsVal_size_t SWIG_PERL_DECL_ARGS_2(SV * obj, size_t *val) +{ + int res = SWIG_TypeError; +#ifdef SWIG_LONG_LONG_AVAILABLE + if (sizeof(size_t) <= sizeof(unsigned long)) { +#endif + unsigned long v; + res = SWIG_AsVal_unsigned_SS_long SWIG_PERL_CALL_ARGS_2(obj, val ? &v : 0); + if (SWIG_IsOK(res) && val) *val = static_cast< size_t >(v); +#ifdef SWIG_LONG_LONG_AVAILABLE + } else if (sizeof(size_t) <= sizeof(unsigned long long)) { + unsigned long long v; + res = SWIG_AsVal_unsigned_SS_long_SS_long SWIG_PERL_CALL_ARGS_2(obj, val ? &v : 0); + if (SWIG_IsOK(res) && val) *val = static_cast< size_t >(v); + } +#endif + return res; +} + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef PERL_OBJECT +#define MAGIC_CLASS _wrap_clinkgrammar_var:: +class _wrap_clinkgrammar_var : public CPerlObj { +public: +#else +#define MAGIC_CLASS +#endif +SWIGCLASS_STATIC int swig_magic_readonly(pTHX_ SV *SWIGUNUSEDPARM(sv), MAGIC *SWIGUNUSEDPARM(mg)) { + MAGIC_PPERL + croak("Value is read-only."); + return 0; +} + + +#ifdef PERL_OBJECT +}; +#endif + +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus +extern "C" { +#endif +XS(_wrap_lg_errinfo_severity_get) { + { + lg_errinfo *arg1 = (lg_errinfo *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + lg_error_severity result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: lg_errinfo_severity_get(self);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_lg_errinfo, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "lg_errinfo_severity_get" "', argument " "1"" of type '" "lg_errinfo *""'"); + } + arg1 = reinterpret_cast< lg_errinfo * >(argp1); + result = (lg_error_severity) ((arg1)->severity); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + 
XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_lg_errinfo_severity_label_get) { + { + lg_errinfo *arg1 = (lg_errinfo *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: lg_errinfo_severity_label_get(self);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_lg_errinfo, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "lg_errinfo_severity_label_get" "', argument " "1"" of type '" "lg_errinfo *""'"); + } + arg1 = reinterpret_cast< lg_errinfo * >(argp1); + result = (char *) ((arg1)->severity_label); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_lg_errinfo_text_get) { + { + lg_errinfo *arg1 = (lg_errinfo *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: lg_errinfo_text_get(self);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_lg_errinfo, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "lg_errinfo_text_get" "', argument " "1"" of type '" "lg_errinfo *""'"); + } + arg1 = reinterpret_cast< lg_errinfo * >(argp1); + result = (char *) ((arg1)->text); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_new_lg_errinfo) { + { + int argvi = 0; + lg_errinfo *result = 0 ; + dXSARGS; + + if ((items < 0) || (items > 0)) { + SWIG_croak("Usage: new_lg_errinfo();"); + } + result = (lg_errinfo *)new lg_errinfo(); + ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_lg_errinfo, SWIG_OWNER | SWIG_SHADOW); argvi++ ; + XSRETURN(argvi); + fail: + SWIG_croak_null(); + } +} + + +XS(_wrap_linkgrammar_get_version) { + { + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 0) 
|| (items > 0)) { + SWIG_croak("Usage: linkgrammar_get_version();"); + } + result = (char *)linkgrammar_get_version(); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + XSRETURN(argvi); + fail: + SWIG_croak_null(); + } +} + + +XS(_wrap_linkgrammar_get_configuration) { + { + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 0) || (items > 0)) { + SWIG_croak("Usage: linkgrammar_get_configuration();"); + } + result = (char *)linkgrammar_get_configuration(); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + XSRETURN(argvi); + fail: + SWIG_croak_null(); + } +} + + +XS(_wrap_linkgrammar_get_dict_version) { + { + Dictionary arg1 = (Dictionary) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkgrammar_get_dict_version(dict);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Dictionary_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkgrammar_get_dict_version" "', argument " "1"" of type '" "Dictionary""'"); + } + arg1 = reinterpret_cast< Dictionary >(argp1); + result = (char *)linkgrammar_get_dict_version(arg1); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkgrammar_get_dict_locale) { + { + Dictionary arg1 = (Dictionary) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkgrammar_get_dict_locale(dict);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Dictionary_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkgrammar_get_dict_locale" "', argument " "1"" of type '" "Dictionary""'"); + } + arg1 = reinterpret_cast< Dictionary >(argp1); + result = (char *)linkgrammar_get_dict_locale(arg1); + ST(argvi) = 
SWIG_FromCharPtr((const char *)result); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_dictionary_create_lang) { + { + char *arg1 = (char *) 0 ; + int res1 ; + char *buf1 = 0 ; + int alloc1 = 0 ; + int argvi = 0; + Dictionary result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: dictionary_create_lang(lang);"); + } + res1 = SWIG_AsCharPtrAndSize(ST(0), &buf1, NULL, &alloc1); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_create_lang" "', argument " "1"" of type '" "char const *""'"); + } + arg1 = reinterpret_cast< char * >(buf1); + result = (Dictionary)dictionary_create_lang((char const *)arg1); + ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Dictionary_s, 0 | 0); argvi++ ; + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + XSRETURN(argvi); + fail: + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + SWIG_croak_null(); + } +} + + +XS(_wrap_dictionary_create_default_lang) { + { + int argvi = 0; + Dictionary result; + dXSARGS; + + if ((items < 0) || (items > 0)) { + SWIG_croak("Usage: dictionary_create_default_lang();"); + } + result = (Dictionary)dictionary_create_default_lang(); + ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Dictionary_s, 0 | 0); argvi++ ; + XSRETURN(argvi); + fail: + SWIG_croak_null(); + } +} + + +XS(_wrap_dictionary_get_lang) { + { + Dictionary arg1 = (Dictionary) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: dictionary_get_lang(dict);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Dictionary_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_get_lang" "', argument " "1"" of type '" "Dictionary""'"); + } + arg1 = reinterpret_cast< Dictionary >(argp1); + result = (char *)dictionary_get_lang(arg1); + ST(argvi) = SWIG_FromCharPtr((const char 
*)result); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_dictionary_delete) { + { + Dictionary arg1 = (Dictionary) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: dictionary_delete(dict);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Dictionary_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_delete" "', argument " "1"" of type '" "Dictionary""'"); + } + arg1 = reinterpret_cast< Dictionary >(argp1); + dictionary_delete(arg1); + ST(argvi) = sv_newmortal(); + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_dictionary_set_data_dir) { + { + char *arg1 = (char *) 0 ; + int res1 ; + char *buf1 = 0 ; + int alloc1 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: dictionary_set_data_dir(path);"); + } + res1 = SWIG_AsCharPtrAndSize(ST(0), &buf1, NULL, &alloc1); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_set_data_dir" "', argument " "1"" of type '" "char const *""'"); + } + arg1 = reinterpret_cast< char * >(buf1); + dictionary_set_data_dir((char const *)arg1); + ST(argvi) = sv_newmortal(); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + XSRETURN(argvi); + fail: + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + SWIG_croak_null(); + } +} + + +XS(_wrap_dictionary_get_data_dir) { + { + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 0) || (items > 0)) { + SWIG_croak("Usage: dictionary_get_data_dir();"); + } + result = (char *)dictionary_get_data_dir(); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + delete[] result; + XSRETURN(argvi); + fail: + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_create) { + { + int argvi = 0; + Parse_Options result; + dXSARGS; + + if ((items < 0) || (items > 0)) { + SWIG_croak("Usage: parse_options_create();"); + } + 
result = (Parse_Options)parse_options_create(); + ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Parse_Options_s, 0 | 0); argvi++ ; + XSRETURN(argvi); + fail: + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_delete) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_delete(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_delete" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_delete(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_verbosity) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: parse_options_set_verbosity(opts,verbosity);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_verbosity" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_verbosity" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + parse_options_set_verbosity(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_verbosity) { + { 
+ Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_get_verbosity(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_verbosity" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_get_verbosity(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_linkage_limit) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: parse_options_set_linkage_limit(opts,linkage_limit);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_linkage_limit" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_linkage_limit" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + parse_options_set_linkage_limit(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_linkage_limit) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: 
parse_options_get_linkage_limit(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_linkage_limit" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_get_linkage_limit(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_disjunct_cost) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + double arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + double val2 ; + int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: parse_options_set_disjunct_cost(opts,disjunct_cost);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_disjunct_cost" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_double SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_disjunct_cost" "', argument " "2"" of type '" "double""'"); + } + arg2 = static_cast< double >(val2); + parse_options_set_disjunct_cost(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_disjunct_cost) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + double result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_get_disjunct_cost(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_disjunct_cost" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (double)parse_options_get_disjunct_cost(arg1); + ST(argvi) = SWIG_From_double SWIG_PERL_CALL_ARGS_1(static_cast< double >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_min_null_count) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: parse_options_set_min_null_count(opts,null_count);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_min_null_count" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_min_null_count" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + parse_options_set_min_null_count(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_min_null_count) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_get_min_null_count(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_min_null_count" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< 
Parse_Options >(argp1); + result = (int)parse_options_get_min_null_count(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_max_null_count) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: parse_options_set_max_null_count(opts,null_count);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_max_null_count" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_max_null_count" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + parse_options_set_max_null_count(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_max_null_count) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_get_max_null_count(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_max_null_count" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_get_max_null_count(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + 
XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_islands_ok) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: parse_options_set_islands_ok(opts,islands_ok);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_islands_ok" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_islands_ok" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + parse_options_set_islands_ok(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_islands_ok) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_get_islands_ok(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_islands_ok" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_get_islands_ok(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_short_length) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + 
int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: parse_options_set_short_length(opts,short_length);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_short_length" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_short_length" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + parse_options_set_short_length(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_short_length) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_get_short_length(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_short_length" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_get_short_length(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_max_memory) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: parse_options_set_max_memory(opts,mem);"); + } + res1 = SWIG_ConvertPtr(ST(0), 
&argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_max_memory" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_max_memory" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + parse_options_set_max_memory(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_max_memory) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_get_max_memory(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_max_memory" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_get_max_memory(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_max_parse_time) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: parse_options_set_max_parse_time(opts,secs);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_max_parse_time" "', argument " "1"" of type '" "Parse_Options""'"); + 
} + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_max_parse_time" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + parse_options_set_max_parse_time(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_max_parse_time) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_get_max_parse_time(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_max_parse_time" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_get_max_parse_time(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_cost_model_type) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + Cost_Model_type arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: parse_options_set_cost_model_type(opts,cm);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_cost_model_type" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + 
SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_cost_model_type" "', argument " "2"" of type '" "Cost_Model_type""'"); + } + arg2 = static_cast< Cost_Model_type >(val2); + parse_options_set_cost_model_type(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_cost_model_type) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + Cost_Model_type result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_get_cost_model_type(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_cost_model_type" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (Cost_Model_type)parse_options_get_cost_model_type(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_timer_expired) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_timer_expired(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_timer_expired" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_timer_expired(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_memory_exhausted) { + { + Parse_Options arg1 = 
(Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_memory_exhausted(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_memory_exhausted" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_memory_exhausted(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_resources_exhausted) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_resources_exhausted(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_resources_exhausted" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_resources_exhausted(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_display_morphology) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: parse_options_set_display_morphology(opts,val);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" 
"parse_options_set_display_morphology" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_display_morphology" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + parse_options_set_display_morphology(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_display_morphology) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_get_display_morphology(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_display_morphology" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_get_display_morphology(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_spell_guess) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: parse_options_set_spell_guess(opts,val);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_spell_guess" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_int 
SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_spell_guess" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + parse_options_set_spell_guess(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_spell_guess) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_get_spell_guess(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_spell_guess" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_get_spell_guess(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_all_short_connectors) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: parse_options_set_all_short_connectors(opts,val);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_all_short_connectors" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_all_short_connectors" "', argument " "2"" of 
type '" "int""'"); + } + arg2 = static_cast< int >(val2); + parse_options_set_all_short_connectors(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_all_short_connectors) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_get_all_short_connectors(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_all_short_connectors" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_get_all_short_connectors(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_reset_resources) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_reset_resources(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_reset_resources" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + parse_options_reset_resources(arg1); + ST(argvi) = sv_newmortal(); + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_set_use_sat_parser) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: 
parse_options_set_use_sat_parser(opts,val);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_use_sat_parser" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_use_sat_parser" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + parse_options_set_use_sat_parser(arg1,arg2); + ST(argvi) = sv_newmortal(); + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_parse_options_get_use_sat_parser) { + { + Parse_Options arg1 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: parse_options_get_use_sat_parser(opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_use_sat_parser" "', argument " "1"" of type '" "Parse_Options""'"); + } + arg1 = reinterpret_cast< Parse_Options >(argp1); + result = (int)parse_options_get_use_sat_parser(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_sentence_create) { + { + char *arg1 = (char *) 0 ; + Dictionary arg2 = (Dictionary) 0 ; + int res1 ; + char *buf1 = 0 ; + int alloc1 = 0 ; + void *argp2 = 0 ; + int res2 = 0 ; + int argvi = 0; + Sentence result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: sentence_create(input_string,dict);"); + } + res1 = SWIG_AsCharPtrAndSize(ST(0), &buf1, NULL, &alloc1); + if (!SWIG_IsOK(res1)) { + 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_create" "', argument " "1"" of type '" "char const *""'"); + } + arg1 = reinterpret_cast< char * >(buf1); + res2 = SWIG_ConvertPtr(ST(1), &argp2,SWIGTYPE_p_Dictionary_s, 0 | 0 ); + if (!SWIG_IsOK(res2)) { + SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "sentence_create" "', argument " "2"" of type '" "Dictionary""'"); + } + arg2 = reinterpret_cast< Dictionary >(argp2); + result = (Sentence)sentence_create((char const *)arg1,arg2); + ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Sentence_s, 0 | 0); argvi++ ; + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + + XSRETURN(argvi); + fail: + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + + SWIG_croak_null(); + } +} + + +XS(_wrap_sentence_delete) { + { + Sentence arg1 = (Sentence) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: sentence_delete(sent);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Sentence_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_delete" "', argument " "1"" of type '" "Sentence""'"); + } + arg1 = reinterpret_cast< Sentence >(argp1); + sentence_delete(arg1); + ST(argvi) = sv_newmortal(); + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_sentence_split) { + { + Sentence arg1 = (Sentence) 0 ; + Parse_Options arg2 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + void *argp2 = 0 ; + int res2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: sentence_split(sent,opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Sentence_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_split" "', argument " "1"" of type '" "Sentence""'"); + } + arg1 = reinterpret_cast< Sentence >(argp1); + res2 = SWIG_ConvertPtr(ST(1), 
&argp2,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res2)) { + SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "sentence_split" "', argument " "2"" of type '" "Parse_Options""'"); + } + arg2 = reinterpret_cast< Parse_Options >(argp2); + result = (int)sentence_split(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_sentence_parse) { + { + Sentence arg1 = (Sentence) 0 ; + Parse_Options arg2 = (Parse_Options) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + void *argp2 = 0 ; + int res2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: sentence_parse(sent,opts);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Sentence_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_parse" "', argument " "1"" of type '" "Sentence""'"); + } + arg1 = reinterpret_cast< Sentence >(argp1); + res2 = SWIG_ConvertPtr(ST(1), &argp2,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res2)) { + SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "sentence_parse" "', argument " "2"" of type '" "Parse_Options""'"); + } + arg2 = reinterpret_cast< Parse_Options >(argp2); + result = (int)sentence_parse(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_sentence_length) { + { + Sentence arg1 = (Sentence) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: sentence_length(sent);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Sentence_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_length" "', argument " "1"" of type '" "Sentence""'"); + } + arg1 = 
reinterpret_cast< Sentence >(argp1); + result = (int)sentence_length(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_sentence_null_count) { + { + Sentence arg1 = (Sentence) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: sentence_null_count(sent);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Sentence_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_null_count" "', argument " "1"" of type '" "Sentence""'"); + } + arg1 = reinterpret_cast< Sentence >(argp1); + result = (int)sentence_null_count(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_sentence_num_linkages_found) { + { + Sentence arg1 = (Sentence) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: sentence_num_linkages_found(sent);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Sentence_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_linkages_found" "', argument " "1"" of type '" "Sentence""'"); + } + arg1 = reinterpret_cast< Sentence >(argp1); + result = (int)sentence_num_linkages_found(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_sentence_num_valid_linkages) { + { + Sentence arg1 = (Sentence) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: sentence_num_valid_linkages(sent);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Sentence_s, 
0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_valid_linkages" "', argument " "1"" of type '" "Sentence""'"); + } + arg1 = reinterpret_cast< Sentence >(argp1); + result = (int)sentence_num_valid_linkages(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_sentence_num_linkages_post_processed) { + { + Sentence arg1 = (Sentence) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: sentence_num_linkages_post_processed(sent);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Sentence_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_linkages_post_processed" "', argument " "1"" of type '" "Sentence""'"); + } + arg1 = reinterpret_cast< Sentence >(argp1); + result = (int)sentence_num_linkages_post_processed(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_sentence_num_violations) { + { + Sentence arg1 = (Sentence) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: sentence_num_violations(sent,i);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Sentence_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_violations" "', argument " "1"" of type '" "Sentence""'"); + } + arg1 = reinterpret_cast< Sentence >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "sentence_num_violations" "', argument " "2"" of type '" "int""'"); + } + 
arg2 = static_cast< int >(val2); + result = (int)sentence_num_violations(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_sentence_disjunct_cost) { + { + Sentence arg1 = (Sentence) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + double result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: sentence_disjunct_cost(sent,i);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Sentence_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_disjunct_cost" "', argument " "1"" of type '" "Sentence""'"); + } + arg1 = reinterpret_cast< Sentence >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "sentence_disjunct_cost" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (double)sentence_disjunct_cost(arg1,arg2); + ST(argvi) = SWIG_From_double SWIG_PERL_CALL_ARGS_1(static_cast< double >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_sentence_link_cost) { + { + Sentence arg1 = (Sentence) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: sentence_link_cost(sent,i);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Sentence_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_link_cost" "', argument " "1"" of type '" "Sentence""'"); + } + arg1 = reinterpret_cast< Sentence >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" 
"sentence_link_cost" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)sentence_link_cost(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_create) { + { + int arg1 ; + Sentence arg2 = (Sentence) 0 ; + Parse_Options arg3 = (Parse_Options) 0 ; + int val1 ; + int ecode1 = 0 ; + void *argp2 = 0 ; + int res2 = 0 ; + void *argp3 = 0 ; + int res3 = 0 ; + int argvi = 0; + Linkage result; + dXSARGS; + + if ((items < 3) || (items > 3)) { + SWIG_croak("Usage: linkage_create(index,sent,opts);"); + } + ecode1 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(0), &val1); + if (!SWIG_IsOK(ecode1)) { + SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "linkage_create" "', argument " "1"" of type '" "int""'"); + } + arg1 = static_cast< int >(val1); + res2 = SWIG_ConvertPtr(ST(1), &argp2,SWIGTYPE_p_Sentence_s, 0 | 0 ); + if (!SWIG_IsOK(res2)) { + SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "linkage_create" "', argument " "2"" of type '" "Sentence""'"); + } + arg2 = reinterpret_cast< Sentence >(argp2); + res3 = SWIG_ConvertPtr(ST(2), &argp3,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); + if (!SWIG_IsOK(res3)) { + SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "linkage_create" "', argument " "3"" of type '" "Parse_Options""'"); + } + arg3 = reinterpret_cast< Parse_Options >(argp3); + result = (Linkage)linkage_create(arg1,arg2,arg3); + ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Linkage_s, 0 | 0); argvi++ ; + + + + XSRETURN(argvi); + fail: + + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_delete) { + { + Linkage arg1 = (Linkage) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkage_delete(linkage);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if 
(!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_delete" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + linkage_delete(arg1); + ST(argvi) = sv_newmortal(); + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_print_diagram) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + size_t arg3 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + size_t val3 ; + int ecode3 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 3) || (items > 3)) { + SWIG_croak("Usage: linkage_print_diagram(linkage,display_walls,screen_width);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_diagram" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_print_diagram" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + ecode3 = SWIG_AsVal_size_t SWIG_PERL_CALL_ARGS_2(ST(2), &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "linkage_print_diagram" "', argument " "3"" of type '" "size_t""'"); + } + arg3 = static_cast< size_t >(val3); + result = (char *)linkage_print_diagram(arg1,arg2,arg3); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + + + { + linkage_free_diagram(result); + } + XSRETURN(argvi); + fail: + + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_print_postscript) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + int arg3 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int val3 ; + int ecode3 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 3) || (items > 3)) { + 
SWIG_croak("Usage: linkage_print_postscript(linkage,display_walls,print_ps_header);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_postscript" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_print_postscript" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + ecode3 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(2), &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "linkage_print_postscript" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + result = (char *)linkage_print_postscript(arg1,arg2,arg3); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + + + { + linkage_free_postscript(result); + } + XSRETURN(argvi); + fail: + + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_print_links_and_domains) { + { + Linkage arg1 = (Linkage) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkage_print_links_and_domains(linkage);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_links_and_domains" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + result = (char *)linkage_print_links_and_domains(arg1); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + { + linkage_free_links_and_domains(result); + } + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_print_senses) { + { + Linkage arg1 = (Linkage) 0 ; + void *argp1 = 0 
; + int res1 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkage_print_senses(linkage);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_senses" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + result = (char *)linkage_print_senses(arg1); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + { + linkage_free_senses(result); + } + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_print_constituent_tree) { + { + Linkage arg1 = (Linkage) 0 ; + ConstituentDisplayStyle arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_print_constituent_tree(linkage,mode);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_constituent_tree" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_print_constituent_tree" "', argument " "2"" of type '" "ConstituentDisplayStyle""'"); + } + arg2 = static_cast< ConstituentDisplayStyle >(val2); + result = (char *)linkage_print_constituent_tree(arg1,arg2); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + + { + linkage_free_constituent_tree_str(result); + } + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_print_disjuncts) { + { + Linkage arg1 = (Linkage) (Linkage)0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if 
((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkage_print_disjuncts(linkage);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_disjuncts" "', argument " "1"" of type '" "Linkage const""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + result = (char *)linkage_print_disjuncts(arg1); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_print_pp_msgs) { + { + Linkage arg1 = (Linkage) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkage_print_pp_msgs(linkage);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_pp_msgs" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + result = (char *)linkage_print_pp_msgs(arg1); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_num_words) { + { + Linkage arg1 = (Linkage) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkage_get_num_words(linkage);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_num_words" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + result = (int)linkage_get_num_words(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_num_links) 
{ + { + Linkage arg1 = (Linkage) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkage_get_num_links(linkage);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_num_links" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + result = (int)linkage_get_num_links(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_link_lword) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_link_lword(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_lword" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_lword" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_link_lword(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_link_rword) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: 
linkage_get_link_rword(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_rword" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_rword" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_link_rword(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_link_length) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_link_length(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_length" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_length" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_link_length(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_link_label) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + char 
*result = 0 ; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_link_label(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_label" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_label" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (char *)linkage_get_link_label(arg1,arg2); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_link_llabel) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_link_llabel(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_llabel" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_llabel" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (char *)linkage_get_link_llabel(arg1,arg2); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_link_rlabel) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int 
val2 ; + int ecode2 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_link_rlabel(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_rlabel" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_rlabel" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (char *)linkage_get_link_rlabel(arg1,arg2); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_link_num_domains) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_link_num_domains(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_num_domains" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_num_domains" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_link_num_domains(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + 
+XS(_wrap_linkage_get_link_domain_names) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + char **result = 0 ; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_link_domain_names(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_domain_names" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_domain_names" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (char **)linkage_get_link_domain_names(arg1,arg2); + ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_p_char, 0 | 0); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_words) { + { + Linkage arg1 = (Linkage) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + char **result = 0 ; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkage_get_words(linkage);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_words" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + result = (char **)linkage_get_words(arg1); + ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_p_char, 0 | 0); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_word) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + char *result = 0 ; + 
dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_word(linkage,w);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (char *)linkage_get_word(arg1,arg2); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_word_byte_start) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_word_byte_start(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_byte_start" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_byte_start" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_byte_start(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_word_byte_end) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + 
int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_word_byte_end(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_byte_end" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_byte_end" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_byte_end(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_word_char_start) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_word_char_start(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_char_start" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_char_start" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_char_start(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + 
+XS(_wrap_linkage_get_word_char_end) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_word_char_end(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_char_end" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_char_end" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_char_end(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_unused_word_cost) { + { + Linkage arg1 = (Linkage) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkage_unused_word_cost(linkage);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_unused_word_cost" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + result = (int)linkage_unused_word_cost(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_disjunct_cost) { + { + Linkage arg1 = (Linkage) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + double result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: 
linkage_disjunct_cost(linkage);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_disjunct_cost" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + result = (double)linkage_disjunct_cost(arg1); + ST(argvi) = SWIG_From_double SWIG_PERL_CALL_ARGS_1(static_cast< double >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_link_cost) { + { + Linkage arg1 = (Linkage) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkage_link_cost(linkage);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_link_cost" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + result = (int)linkage_link_cost(arg1); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_corpus_cost) { + { + Linkage arg1 = (Linkage) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + double result; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkage_corpus_cost(linkage);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_corpus_cost" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + result = (double)linkage_corpus_cost(arg1); + ST(argvi) = SWIG_From_double SWIG_PERL_CALL_ARGS_1(static_cast< double >(result)); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_violation_name) { + { + Linkage arg1 = 
(Linkage) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: linkage_get_violation_name(linkage);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_violation_name" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + result = (char *)linkage_get_violation_name(arg1); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap__lg_error_formatmsg) { + { + lg_errinfo *arg1 = (lg_errinfo *) 0 ; + void *argp1 = 0 ; + int res1 = 0 ; + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 1) || (items > 1)) { + SWIG_croak("Usage: _lg_error_formatmsg(lge);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_lg_errinfo, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "_lg_error_formatmsg" "', argument " "1"" of type '" "lg_errinfo *""'"); + } + arg1 = reinterpret_cast< lg_errinfo * >(argp1); + result = (char *)lg_error_formatmsg(arg1); + ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; + + { + free(result); + } + XSRETURN(argvi); + fail: + + SWIG_croak_null(); + } +} + + +XS(_wrap_lg_error_clearall) { + { + int argvi = 0; + int result; + dXSARGS; + + if ((items < 0) || (items > 0)) { + SWIG_croak("Usage: lg_error_clearall();"); + } + result = (int)lg_error_clearall(); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + XSRETURN(argvi); + fail: + SWIG_croak_null(); + } +} + + +XS(_wrap__prt_error) { + { + char *arg1 = (char *) 0 ; + char *arg2 = (char *) 0 ; + int res1 ; + char *buf1 = 0 ; + int alloc1 = 0 ; + int res2 ; + char *buf2 = 0 ; + int alloc2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items 
> 2)) { + SWIG_croak("Usage: _prt_error(char const *,char const *);"); + } + res1 = SWIG_AsCharPtrAndSize(ST(0), &buf1, NULL, &alloc1); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "_prt_error" "', argument " "1"" of type '" "char const *""'"); + } + arg1 = reinterpret_cast< char * >(buf1); + res2 = SWIG_AsCharPtrAndSize(ST(1), &buf2, NULL, &alloc2); + if (!SWIG_IsOK(res2)) { + SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "_prt_error" "', argument " "2"" of type '" "char const *""'"); + } + arg2 = reinterpret_cast< char * >(buf2); + result = (int)prt_error((char const *)arg1,(char const *)arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + if (alloc2 == SWIG_NEWOBJ) delete[] buf2; + XSRETURN(argvi); + fail: + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + if (alloc2 == SWIG_NEWOBJ) delete[] buf2; + SWIG_croak_null(); + } +} + + +XS(_wrap_lg_error_flush) { + { + int argvi = 0; + int result; + dXSARGS; + + if ((items < 0) || (items > 0)) { + SWIG_croak("Usage: lg_error_flush();"); + } + result = (int)lg_error_flush(); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + XSRETURN(argvi); + fail: + SWIG_croak_null(); + } +} + + + +/* -------- TYPE CONVERSION AND EQUIVALENCE RULES (BEGIN) -------- */ + +static swig_type_info _swigt__p_ConstituentDisplayStyle = {"_p_ConstituentDisplayStyle", "enum ConstituentDisplayStyle *|ConstituentDisplayStyle *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Cost_Model_type = {"_p_Cost_Model_type", "enum Cost_Model_type *|Cost_Model_type *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Dictionary_s = {"_p_Dictionary_s", "Dictionary_s *|Dictionary", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Linkage_s = {"_p_Linkage_s", "Linkage_s *|Linkage", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Parse_Options_s = {"_p_Parse_Options_s", 
"Parse_Options_s *|Parse_Options", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Sentence_s = {"_p_Sentence_s", "Sentence|Sentence_s *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_char = {"_p_char", "char *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_lg_errinfo = {"_p_lg_errinfo", "lg_errinfo *", 0, 0, (void*)"clinkgrammar::lg_errinfo", 0}; +static swig_type_info _swigt__p_lg_error_severity = {"_p_lg_error_severity", "enum lg_error_severity *|lg_error_severity *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_p_char = {"_p_p_char", "char **", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_size_t = {"_p_size_t", "LinkageIdx *|size_t *|LinkIdx *|WordIdx *", 0, 0, (void*)0, 0}; + +static swig_type_info *swig_type_initial[] = { + &_swigt__p_ConstituentDisplayStyle, + &_swigt__p_Cost_Model_type, + &_swigt__p_Dictionary_s, + &_swigt__p_Linkage_s, + &_swigt__p_Parse_Options_s, + &_swigt__p_Sentence_s, + &_swigt__p_char, + &_swigt__p_lg_errinfo, + &_swigt__p_lg_error_severity, + &_swigt__p_p_char, + &_swigt__p_size_t, +}; + +static swig_cast_info _swigc__p_ConstituentDisplayStyle[] = { {&_swigt__p_ConstituentDisplayStyle, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_Cost_Model_type[] = { {&_swigt__p_Cost_Model_type, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_Dictionary_s[] = { {&_swigt__p_Dictionary_s, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_Linkage_s[] = { {&_swigt__p_Linkage_s, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_Parse_Options_s[] = { {&_swigt__p_Parse_Options_s, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_Sentence_s[] = { {&_swigt__p_Sentence_s, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_char[] = { {&_swigt__p_char, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_lg_errinfo[] = { {&_swigt__p_lg_errinfo, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_lg_error_severity[] = { {&_swigt__p_lg_error_severity, 0, 0, 0},{0, 0, 0, 
0}}; +static swig_cast_info _swigc__p_p_char[] = { {&_swigt__p_p_char, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_size_t[] = { {&_swigt__p_size_t, 0, 0, 0},{0, 0, 0, 0}}; + +static swig_cast_info *swig_cast_initial[] = { + _swigc__p_ConstituentDisplayStyle, + _swigc__p_Cost_Model_type, + _swigc__p_Dictionary_s, + _swigc__p_Linkage_s, + _swigc__p_Parse_Options_s, + _swigc__p_Sentence_s, + _swigc__p_char, + _swigc__p_lg_errinfo, + _swigc__p_lg_error_severity, + _swigc__p_p_char, + _swigc__p_size_t, +}; + + +/* -------- TYPE CONVERSION AND EQUIVALENCE RULES (END) -------- */ + +static swig_constant_info swig_constants[] = { +{0,0,0,0,0,0} +}; +#ifdef __cplusplus +} +#endif +static swig_variable_info swig_variables[] = { +{0,0,0,0} +}; +static swig_command_info swig_commands[] = { +{"clinkgrammarc::lg_errinfo_severity_get", _wrap_lg_errinfo_severity_get}, +{"clinkgrammarc::lg_errinfo_severity_label_get", _wrap_lg_errinfo_severity_label_get}, +{"clinkgrammarc::lg_errinfo_text_get", _wrap_lg_errinfo_text_get}, +{"clinkgrammarc::new_lg_errinfo", _wrap_new_lg_errinfo}, +{"clinkgrammarc::linkgrammar_get_version", _wrap_linkgrammar_get_version}, +{"clinkgrammarc::linkgrammar_get_configuration", _wrap_linkgrammar_get_configuration}, +{"clinkgrammarc::linkgrammar_get_dict_version", _wrap_linkgrammar_get_dict_version}, +{"clinkgrammarc::linkgrammar_get_dict_locale", _wrap_linkgrammar_get_dict_locale}, +{"clinkgrammarc::dictionary_create_lang", _wrap_dictionary_create_lang}, +{"clinkgrammarc::dictionary_create_default_lang", _wrap_dictionary_create_default_lang}, +{"clinkgrammarc::dictionary_get_lang", _wrap_dictionary_get_lang}, +{"clinkgrammarc::dictionary_delete", _wrap_dictionary_delete}, +{"clinkgrammarc::dictionary_set_data_dir", _wrap_dictionary_set_data_dir}, +{"clinkgrammarc::dictionary_get_data_dir", _wrap_dictionary_get_data_dir}, +{"clinkgrammarc::parse_options_create", _wrap_parse_options_create}, +{"clinkgrammarc::parse_options_delete", 
_wrap_parse_options_delete}, +{"clinkgrammarc::parse_options_set_verbosity", _wrap_parse_options_set_verbosity}, +{"clinkgrammarc::parse_options_get_verbosity", _wrap_parse_options_get_verbosity}, +{"clinkgrammarc::parse_options_set_linkage_limit", _wrap_parse_options_set_linkage_limit}, +{"clinkgrammarc::parse_options_get_linkage_limit", _wrap_parse_options_get_linkage_limit}, +{"clinkgrammarc::parse_options_set_disjunct_cost", _wrap_parse_options_set_disjunct_cost}, +{"clinkgrammarc::parse_options_get_disjunct_cost", _wrap_parse_options_get_disjunct_cost}, +{"clinkgrammarc::parse_options_set_min_null_count", _wrap_parse_options_set_min_null_count}, +{"clinkgrammarc::parse_options_get_min_null_count", _wrap_parse_options_get_min_null_count}, +{"clinkgrammarc::parse_options_set_max_null_count", _wrap_parse_options_set_max_null_count}, +{"clinkgrammarc::parse_options_get_max_null_count", _wrap_parse_options_get_max_null_count}, +{"clinkgrammarc::parse_options_set_islands_ok", _wrap_parse_options_set_islands_ok}, +{"clinkgrammarc::parse_options_get_islands_ok", _wrap_parse_options_get_islands_ok}, +{"clinkgrammarc::parse_options_set_short_length", _wrap_parse_options_set_short_length}, +{"clinkgrammarc::parse_options_get_short_length", _wrap_parse_options_get_short_length}, +{"clinkgrammarc::parse_options_set_max_memory", _wrap_parse_options_set_max_memory}, +{"clinkgrammarc::parse_options_get_max_memory", _wrap_parse_options_get_max_memory}, +{"clinkgrammarc::parse_options_set_max_parse_time", _wrap_parse_options_set_max_parse_time}, +{"clinkgrammarc::parse_options_get_max_parse_time", _wrap_parse_options_get_max_parse_time}, +{"clinkgrammarc::parse_options_set_cost_model_type", _wrap_parse_options_set_cost_model_type}, +{"clinkgrammarc::parse_options_get_cost_model_type", _wrap_parse_options_get_cost_model_type}, +{"clinkgrammarc::parse_options_timer_expired", _wrap_parse_options_timer_expired}, +{"clinkgrammarc::parse_options_memory_exhausted", 
_wrap_parse_options_memory_exhausted}, +{"clinkgrammarc::parse_options_resources_exhausted", _wrap_parse_options_resources_exhausted}, +{"clinkgrammarc::parse_options_set_display_morphology", _wrap_parse_options_set_display_morphology}, +{"clinkgrammarc::parse_options_get_display_morphology", _wrap_parse_options_get_display_morphology}, +{"clinkgrammarc::parse_options_set_spell_guess", _wrap_parse_options_set_spell_guess}, +{"clinkgrammarc::parse_options_get_spell_guess", _wrap_parse_options_get_spell_guess}, +{"clinkgrammarc::parse_options_set_all_short_connectors", _wrap_parse_options_set_all_short_connectors}, +{"clinkgrammarc::parse_options_get_all_short_connectors", _wrap_parse_options_get_all_short_connectors}, +{"clinkgrammarc::parse_options_reset_resources", _wrap_parse_options_reset_resources}, +{"clinkgrammarc::parse_options_set_use_sat_parser", _wrap_parse_options_set_use_sat_parser}, +{"clinkgrammarc::parse_options_get_use_sat_parser", _wrap_parse_options_get_use_sat_parser}, +{"clinkgrammarc::sentence_create", _wrap_sentence_create}, +{"clinkgrammarc::sentence_delete", _wrap_sentence_delete}, +{"clinkgrammarc::sentence_split", _wrap_sentence_split}, +{"clinkgrammarc::sentence_parse", _wrap_sentence_parse}, +{"clinkgrammarc::sentence_length", _wrap_sentence_length}, +{"clinkgrammarc::sentence_null_count", _wrap_sentence_null_count}, +{"clinkgrammarc::sentence_num_linkages_found", _wrap_sentence_num_linkages_found}, +{"clinkgrammarc::sentence_num_valid_linkages", _wrap_sentence_num_valid_linkages}, +{"clinkgrammarc::sentence_num_linkages_post_processed", _wrap_sentence_num_linkages_post_processed}, +{"clinkgrammarc::sentence_num_violations", _wrap_sentence_num_violations}, +{"clinkgrammarc::sentence_disjunct_cost", _wrap_sentence_disjunct_cost}, +{"clinkgrammarc::sentence_link_cost", _wrap_sentence_link_cost}, +{"clinkgrammarc::linkage_create", _wrap_linkage_create}, +{"clinkgrammarc::linkage_delete", _wrap_linkage_delete}, 
+{"clinkgrammarc::linkage_print_diagram", _wrap_linkage_print_diagram}, +{"clinkgrammarc::linkage_print_postscript", _wrap_linkage_print_postscript}, +{"clinkgrammarc::linkage_print_links_and_domains", _wrap_linkage_print_links_and_domains}, +{"clinkgrammarc::linkage_print_senses", _wrap_linkage_print_senses}, +{"clinkgrammarc::linkage_print_constituent_tree", _wrap_linkage_print_constituent_tree}, +{"clinkgrammarc::linkage_print_disjuncts", _wrap_linkage_print_disjuncts}, +{"clinkgrammarc::linkage_print_pp_msgs", _wrap_linkage_print_pp_msgs}, +{"clinkgrammarc::linkage_get_num_words", _wrap_linkage_get_num_words}, +{"clinkgrammarc::linkage_get_num_links", _wrap_linkage_get_num_links}, +{"clinkgrammarc::linkage_get_link_lword", _wrap_linkage_get_link_lword}, +{"clinkgrammarc::linkage_get_link_rword", _wrap_linkage_get_link_rword}, +{"clinkgrammarc::linkage_get_link_length", _wrap_linkage_get_link_length}, +{"clinkgrammarc::linkage_get_link_label", _wrap_linkage_get_link_label}, +{"clinkgrammarc::linkage_get_link_llabel", _wrap_linkage_get_link_llabel}, +{"clinkgrammarc::linkage_get_link_rlabel", _wrap_linkage_get_link_rlabel}, +{"clinkgrammarc::linkage_get_link_num_domains", _wrap_linkage_get_link_num_domains}, +{"clinkgrammarc::linkage_get_link_domain_names", _wrap_linkage_get_link_domain_names}, +{"clinkgrammarc::linkage_get_words", _wrap_linkage_get_words}, +{"clinkgrammarc::linkage_get_word", _wrap_linkage_get_word}, +{"clinkgrammarc::linkage_get_word_byte_start", _wrap_linkage_get_word_byte_start}, +{"clinkgrammarc::linkage_get_word_byte_end", _wrap_linkage_get_word_byte_end}, +{"clinkgrammarc::linkage_get_word_char_start", _wrap_linkage_get_word_char_start}, +{"clinkgrammarc::linkage_get_word_char_end", _wrap_linkage_get_word_char_end}, +{"clinkgrammarc::linkage_unused_word_cost", _wrap_linkage_unused_word_cost}, +{"clinkgrammarc::linkage_disjunct_cost", _wrap_linkage_disjunct_cost}, +{"clinkgrammarc::linkage_link_cost", _wrap_linkage_link_cost}, 
+{"clinkgrammarc::linkage_corpus_cost", _wrap_linkage_corpus_cost}, +{"clinkgrammarc::linkage_get_violation_name", _wrap_linkage_get_violation_name}, +{"clinkgrammarc::_lg_error_formatmsg", _wrap__lg_error_formatmsg}, +{"clinkgrammarc::lg_error_clearall", _wrap_lg_error_clearall}, +{"clinkgrammarc::_prt_error", _wrap__prt_error}, +{"clinkgrammarc::lg_error_flush", _wrap_lg_error_flush}, +{0,0} +}; +/* ----------------------------------------------------------------------------- + * Type initialization: + * This problem is tough by the requirement that no dynamic + * memory is used. Also, since swig_type_info structures store pointers to + * swig_cast_info structures and swig_cast_info structures store pointers back + * to swig_type_info structures, we need some lookup code at initialization. + * The idea is that swig generates all the structures that are needed. + * The runtime then collects these partially filled structures. + * The SWIG_InitializeModule function takes these initial arrays out of + * swig_module, and does all the lookup, filling in the swig_module.types + * array with the correct data and linking the correct swig_cast_info + * structures together. + * + * The generated swig_type_info structures are assigned statically to an initial + * array. We just loop through that array, and handle each type individually. + * First we lookup if this type has been already loaded, and if so, use the + * loaded structure instead of the generated one. Then we have to fill in the + * cast linked list. The cast data is initially stored in something like a + * two-dimensional array. Each row corresponds to a type (there are the same + * number of rows as there are in the swig_type_initial array). Each entry in + * a column is one of the swig_cast_info structures for that type. + * The cast_initial array is actually an array of arrays, because each row has + * a variable number of columns. 
So to actually build the cast linked list, + * we find the array of casts associated with the type, and loop through it + * adding the casts to the list. The one last trick we need to do is making + * sure the type pointer in the swig_cast_info struct is correct. + * + * First off, we lookup the cast->type name to see if it is already loaded. + * There are three cases to handle: + * 1) If the cast->type has already been loaded AND the type we are adding + * casting info to has not been loaded (it is in this module), THEN we + * replace the cast->type pointer with the type pointer that has already + * been loaded. + * 2) If BOTH types (the one we are adding casting info to, and the + * cast->type) are loaded, THEN the cast info has already been loaded by + * the previous module so we just ignore it. + * 3) Finally, if cast->type has not already been loaded, then we add that + * swig_cast_info to the linked list (because the cast->type) pointer will + * be correct. + * ----------------------------------------------------------------------------- */ + +#ifdef __cplusplus +extern "C" { +#if 0 +} /* c-mode */ +#endif +#endif + +#if 0 +#define SWIGRUNTIME_DEBUG +#endif + + +SWIGRUNTIME void +SWIG_InitializeModule(void *clientdata) { + size_t i; + swig_module_info *module_head, *iter; + int init; + + /* check to see if the circular list has been setup, if not, set it up */ + if (swig_module.next==0) { + /* Initialize the swig_module */ + swig_module.type_initial = swig_type_initial; + swig_module.cast_initial = swig_cast_initial; + swig_module.next = &swig_module; + init = 1; + } else { + init = 0; + } + + /* Try and load any already created modules */ + module_head = SWIG_GetModule(clientdata); + if (!module_head) { + /* This is the first module loaded for this interpreter */ + /* so set the swig module into the interpreter */ + SWIG_SetModule(clientdata, &swig_module); + } else { + /* the interpreter has loaded a SWIG module, but has it loaded this one? 
*/ + iter=module_head; + do { + if (iter==&swig_module) { + /* Our module is already in the list, so there's nothing more to do. */ + return; + } + iter=iter->next; + } while (iter!= module_head); + + /* otherwise we must add our module into the list */ + swig_module.next = module_head->next; + module_head->next = &swig_module; + } + + /* When multiple interpreters are used, a module could have already been initialized in + a different interpreter, but not yet have a pointer in this interpreter. + In this case, we do not want to continue adding types... everything should be + set up already */ + if (init == 0) return; + + /* Now work on filling in swig_module.types */ +#ifdef SWIGRUNTIME_DEBUG + printf("SWIG_InitializeModule: size %d\n", swig_module.size); +#endif + for (i = 0; i < swig_module.size; ++i) { + swig_type_info *type = 0; + swig_type_info *ret; + swig_cast_info *cast; + +#ifdef SWIGRUNTIME_DEBUG + printf("SWIG_InitializeModule: type %d %s\n", i, swig_module.type_initial[i]->name); +#endif + + /* if there is another module already loaded */ + if (swig_module.next != &swig_module) { + type = SWIG_MangledTypeQueryModule(swig_module.next, &swig_module, swig_module.type_initial[i]->name); + } + if (type) { + /* Overwrite clientdata field */ +#ifdef SWIGRUNTIME_DEBUG + printf("SWIG_InitializeModule: found type %s\n", type->name); +#endif + if (swig_module.type_initial[i]->clientdata) { + type->clientdata = swig_module.type_initial[i]->clientdata; +#ifdef SWIGRUNTIME_DEBUG + printf("SWIG_InitializeModule: found and overwrite type %s \n", type->name); +#endif + } + } else { + type = swig_module.type_initial[i]; + } + + /* Insert casting types */ + cast = swig_module.cast_initial[i]; + while (cast->type) { + /* Don't need to add information already in the list */ + ret = 0; +#ifdef SWIGRUNTIME_DEBUG + printf("SWIG_InitializeModule: look cast %s\n", cast->type->name); +#endif + if (swig_module.next != &swig_module) { + ret = 
SWIG_MangledTypeQueryModule(swig_module.next, &swig_module, cast->type->name); +#ifdef SWIGRUNTIME_DEBUG + if (ret) printf("SWIG_InitializeModule: found cast %s\n", ret->name); +#endif + } + if (ret) { + if (type == swig_module.type_initial[i]) { +#ifdef SWIGRUNTIME_DEBUG + printf("SWIG_InitializeModule: skip old type %s\n", ret->name); +#endif + cast->type = ret; + ret = 0; + } else { + /* Check for casting already in the list */ + swig_cast_info *ocast = SWIG_TypeCheck(ret->name, type); +#ifdef SWIGRUNTIME_DEBUG + if (ocast) printf("SWIG_InitializeModule: skip old cast %s\n", ret->name); +#endif + if (!ocast) ret = 0; + } + } + + if (!ret) { +#ifdef SWIGRUNTIME_DEBUG + printf("SWIG_InitializeModule: adding cast %s\n", cast->type->name); +#endif + if (type->cast) { + type->cast->prev = cast; + cast->next = type->cast; + } + type->cast = cast; + } + cast++; + } + /* Set entry in modules->types array equal to the type */ + swig_module.types[i] = type; + } + swig_module.types[i] = 0; + +#ifdef SWIGRUNTIME_DEBUG + printf("**** SWIG_InitializeModule: Cast List ******\n"); + for (i = 0; i < swig_module.size; ++i) { + int j = 0; + swig_cast_info *cast = swig_module.cast_initial[i]; + printf("SWIG_InitializeModule: type %d %s\n", i, swig_module.type_initial[i]->name); + while (cast->type) { + printf("SWIG_InitializeModule: cast type %s\n", cast->type->name); + cast++; + ++j; + } + printf("---- Total casts: %d\n",j); + } + printf("**** SWIG_InitializeModule: Cast List ******\n"); +#endif +} + +/* This function will propagate the clientdata field of type to +* any new swig_type_info structures that have been added into the list +* of equivalent types. It is like calling +* SWIG_TypeClientData(type, clientdata) a second time. 
+*/ +SWIGRUNTIME void +SWIG_PropagateClientData(void) { + size_t i; + swig_cast_info *equiv; + static int init_run = 0; + + if (init_run) return; + init_run = 1; + + for (i = 0; i < swig_module.size; i++) { + if (swig_module.types[i]->clientdata) { + equiv = swig_module.types[i]->cast; + while (equiv) { + if (!equiv->converter) { + if (equiv->type && !equiv->type->clientdata) + SWIG_TypeClientData(equiv->type, swig_module.types[i]->clientdata); + } + equiv = equiv->next; + } + } + } +} + +#ifdef __cplusplus +#if 0 +{ + /* c-mode */ +#endif +} +#endif + + + +#if defined(__cplusplus) && ! defined(XSPROTO) +extern "C" +#endif + +XS(SWIG_init) { + dXSARGS; + int i; + (void)items; + + SWIG_InitializeModule(0); + + /* Install commands */ + for (i = 0; swig_commands[i].name; i++) { + /* Casts only needed for Perl < 5.10. */ +#ifdef __cplusplus + newXS(const_cast(swig_commands[i].name), swig_commands[i].wrapper, const_cast(__FILE__)); +#else + newXS((char*)swig_commands[i].name, swig_commands[i].wrapper, (char*)__FILE__); +#endif + } + + /* Install variables */ + for (i = 0; swig_variables[i].name; i++) { + SV *sv; + sv = get_sv(swig_variables[i].name, TRUE | 0x2 | GV_ADDMULTI); + if (swig_variables[i].type) { + SWIG_MakePtr(sv,(void *)1, *swig_variables[i].type,0); + } else { + sv_setiv(sv,(IV) 0); + } + swig_create_magic(sv, swig_variables[i].name, swig_variables[i].set, swig_variables[i].get); + } + + /* Install constant */ + for (i = 0; swig_constants[i].type; i++) { + SV *sv; + sv = get_sv(swig_constants[i].name, TRUE | 0x2 | GV_ADDMULTI); + switch(swig_constants[i].type) { + case SWIG_INT: + sv_setiv(sv, (IV) swig_constants[i].lvalue); + break; + case SWIG_FLOAT: + sv_setnv(sv, (double) swig_constants[i].dvalue); + break; + case SWIG_STRING: + sv_setpv(sv, (const char *) swig_constants[i].pvalue); + break; + case SWIG_POINTER: + SWIG_MakePtr(sv, swig_constants[i].pvalue, *(swig_constants[i].ptype),0); + break; + case SWIG_BINARY: + SWIG_MakePackedObj(sv, 
swig_constants[i].pvalue, swig_constants[i].lvalue, *(swig_constants[i].ptype)); + break; + default: + break; + } + SvREADONLY_on(sv); + } + + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv = get_sv((char*) SWIG_prefix "lg_Fatal", TRUE | 0x2 | GV_ADDMULTI); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_Fatal))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv = get_sv((char*) SWIG_prefix "lg_Error", TRUE | 0x2 | GV_ADDMULTI); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_Error))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv = get_sv((char*) SWIG_prefix "lg_Warn", TRUE | 0x2 | GV_ADDMULTI); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_Warn))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv = get_sv((char*) SWIG_prefix "lg_Info", TRUE | 0x2 | GV_ADDMULTI); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_Info))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv = get_sv((char*) SWIG_prefix "lg_Debug", TRUE | 0x2 | GV_ADDMULTI); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_Debug))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv = get_sv((char*) SWIG_prefix "lg_Trace", TRUE | 0x2 | GV_ADDMULTI); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_Trace))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv = get_sv((char*) SWIG_prefix "lg_None", TRUE | 0x2 | GV_ADDMULTI); + 
sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_None))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + SWIG_TypeClientData(SWIGTYPE_p_lg_errinfo, (void*) "clinkgrammar::lg_errinfo"); + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv = get_sv((char*) SWIG_prefix "VDAL", TRUE | 0x2 | GV_ADDMULTI); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(VDAL))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv = get_sv((char*) SWIG_prefix "CORPUS", TRUE | 0x2 | GV_ADDMULTI); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(CORPUS))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv = get_sv((char*) SWIG_prefix "NO_DISPLAY", TRUE | 0x2 | GV_ADDMULTI); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(NO_DISPLAY))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv = get_sv((char*) SWIG_prefix "MULTILINE", TRUE | 0x2 | GV_ADDMULTI); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(MULTILINE))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv = get_sv((char*) SWIG_prefix "BRACKET_TREE", TRUE | 0x2 | GV_ADDMULTI); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(BRACKET_TREE))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv = get_sv((char*) SWIG_prefix "SINGLE_LINE", TRUE | 0x2 | GV_ADDMULTI); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(SINGLE_LINE))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { + SV *sv 
= get_sv((char*) SWIG_prefix "MAX_STYLES", TRUE | 0x2 | GV_ADDMULTI); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(MAX_STYLES))); + SvREADONLY_on(sv); + } while(0) /*@SWIG@*/; + ST(0) = &PL_sv_yes; + XSRETURN(1); +} + diff -Nru link-grammar-5.3.16/bindings/perl/lg_perl_wrap.cc link-grammar-5.5.0/bindings/perl/lg_perl_wrap.cc --- link-grammar-5.3.16/bindings/perl/lg_perl_wrap.cc 2017-01-19 20:16:16.000000000 +0000 +++ link-grammar-5.5.0/bindings/perl/lg_perl_wrap.cc 2018-04-29 21:29:50.000000000 +0000 @@ -15,6 +15,30 @@ #define SWIG_CASTRANK_MODE + +#ifdef __cplusplus +/* SwigValueWrapper is described in swig.swg */ +template class SwigValueWrapper { + struct SwigMovePointer { + T *ptr; + SwigMovePointer(T *p) : ptr(p) { } + ~SwigMovePointer() { delete ptr; } + SwigMovePointer& operator=(SwigMovePointer& rhs) { T* oldptr = ptr; ptr = 0; delete oldptr; ptr = rhs.ptr; rhs.ptr = 0; return *this; } + } pointer; + SwigValueWrapper& operator=(const SwigValueWrapper& rhs); + SwigValueWrapper(const SwigValueWrapper& rhs); +public: + SwigValueWrapper() : pointer(0) { } + SwigValueWrapper& operator=(const T& t) { SwigMovePointer tmp(new T(t)); pointer = tmp; return *this; } + operator T&() const { return *pointer.ptr; } + T *operator&() { return pointer.ptr; } +}; + +template T SwigValueInit() { + return T(); +} +#endif + /* ----------------------------------------------------------------------------- * This section contains generic SWIG labels for method/variable * declarations/attributes, and other compiler dependent labels. 
@@ -1533,8 +1557,11 @@ #define SWIG_VERSION SWIGVERSION -#define SWIG_as_voidptr(a) (void *)((const void *)(a)) -#define SWIG_as_voidptrptr(a) ((void)SWIG_as_voidptr(*a),(void**)(a)) +#define SWIG_as_voidptr(a) const_cast< void * >(static_cast< const void * >(a)) +#define SWIG_as_voidptrptr(a) ((void)SWIG_as_voidptr(*a),reinterpret_cast< void** >(a)) + + +#include #ifdef __cplusplus @@ -1623,7 +1650,7 @@ if (cptr) { if (alloc) { if (*alloc == SWIG_NEWOBJ) { - *cptr = (char *)memcpy((char *)malloc((size)*sizeof(char)), cstr, sizeof(char)*(size)); + *cptr = reinterpret_cast< char* >(memcpy((new char[size]), cstr, sizeof(char)*(size))); } else { *cptr = cstr; *alloc = SWIG_OLDOBJ; @@ -1796,7 +1823,7 @@ if ((v < INT_MIN || v > INT_MAX)) { return SWIG_OverflowError; } else { - if (val) *val = (int)(v); + if (val) *val = static_cast< int >(v); } } return res; @@ -1922,12 +1949,12 @@ #endif unsigned long v; res = SWIG_AsVal_unsigned_SS_long SWIG_PERL_CALL_ARGS_2(obj, val ? &v : 0); - if (SWIG_IsOK(res) && val) *val = (size_t)(v); + if (SWIG_IsOK(res) && val) *val = static_cast< size_t >(v); #ifdef SWIG_LONG_LONG_AVAILABLE } else if (sizeof(size_t) <= sizeof(unsigned long long)) { unsigned long long v; res = SWIG_AsVal_unsigned_SS_long_SS_long SWIG_PERL_CALL_ARGS_2(obj, val ? 
&v : 0); - if (SWIG_IsOK(res) && val) *val = (size_t)(v); + if (SWIG_IsOK(res) && val) *val = static_cast< size_t >(v); } #endif return res; @@ -1978,9 +2005,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "lg_errinfo_severity_get" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo *)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); result = (lg_error_severity) ((arg1)->severity); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -2006,7 +2033,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "lg_errinfo_severity_label_get" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo *)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); result = (char *) ((arg1)->severity_label); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -2034,7 +2061,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "lg_errinfo_text_get" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo *)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); result = (char *) ((arg1)->text); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -2055,7 +2082,7 @@ if ((items < 0) || (items > 0)) { SWIG_croak("Usage: new_lg_errinfo();"); } - result = (lg_errinfo *)calloc(1, sizeof(lg_errinfo)); + result = (lg_errinfo *)new lg_errinfo(); ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_lg_errinfo, SWIG_OWNER | SWIG_SHADOW); argvi++ ; XSRETURN(argvi); fail: @@ -2082,6 +2109,24 @@ } +XS(_wrap_linkgrammar_get_configuration) { + { + int argvi = 0; + char *result = 0 ; + dXSARGS; + + if ((items < 0) || (items > 0)) { + SWIG_croak("Usage: linkgrammar_get_configuration();"); + } + result = (char *)linkgrammar_get_configuration(); + ST(argvi) = 
SWIG_FromCharPtr((const char *)result); argvi++ ; + XSRETURN(argvi); + fail: + SWIG_croak_null(); + } +} + + XS(_wrap_linkgrammar_get_dict_version) { { Dictionary arg1 = (Dictionary) 0 ; @@ -2098,7 +2143,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkgrammar_get_dict_version" "', argument " "1"" of type '" "Dictionary""'"); } - arg1 = (Dictionary)(argp1); + arg1 = reinterpret_cast< Dictionary >(argp1); result = (char *)linkgrammar_get_dict_version(arg1); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -2126,7 +2171,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkgrammar_get_dict_locale" "', argument " "1"" of type '" "Dictionary""'"); } - arg1 = (Dictionary)(argp1); + arg1 = reinterpret_cast< Dictionary >(argp1); result = (char *)linkgrammar_get_dict_locale(arg1); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -2155,13 +2200,13 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_create_lang" "', argument " "1"" of type '" "char const *""'"); } - arg1 = (char *)(buf1); + arg1 = reinterpret_cast< char * >(buf1); result = (Dictionary)dictionary_create_lang((char const *)arg1); ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Dictionary_s, 0 | 0); argvi++ ; - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; XSRETURN(argvi); fail: - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; SWIG_croak_null(); } } @@ -2201,7 +2246,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_get_lang" "', argument " "1"" of type '" "Dictionary""'"); } - arg1 = (Dictionary)(argp1); + arg1 = reinterpret_cast< Dictionary >(argp1); result = (char *)dictionary_get_lang(arg1); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -2228,7 +2273,7 @@ if (!SWIG_IsOK(res1)) { 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_delete" "', argument " "1"" of type '" "Dictionary""'"); } - arg1 = (Dictionary)(argp1); + arg1 = reinterpret_cast< Dictionary >(argp1); dictionary_delete(arg1); ST(argvi) = sv_newmortal(); @@ -2256,13 +2301,13 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_set_data_dir" "', argument " "1"" of type '" "char const *""'"); } - arg1 = (char *)(buf1); + arg1 = reinterpret_cast< char * >(buf1); dictionary_set_data_dir((char const *)arg1); ST(argvi) = sv_newmortal(); - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; XSRETURN(argvi); fail: - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; SWIG_croak_null(); } } @@ -2279,7 +2324,7 @@ } result = (char *)dictionary_get_data_dir(); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; - free((char*)result); + delete[] result; XSRETURN(argvi); fail: SWIG_croak_null(); @@ -2321,9 +2366,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_delete" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_delete(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -2351,12 +2396,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_verbosity" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_verbosity" "', argument " "2"" of type '" 
"int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_verbosity(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -2386,9 +2431,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_verbosity" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_verbosity(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -2416,12 +2461,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_linkage_limit" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_linkage_limit" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_linkage_limit(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -2451,9 +2496,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_linkage_limit" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_linkage_limit(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -2481,12 +2526,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_disjunct_cost" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = 
(Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_double SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_disjunct_cost" "', argument " "2"" of type '" "double""'"); } - arg2 = (double)(val2); + arg2 = static_cast< double >(val2); parse_options_set_disjunct_cost(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -2516,9 +2561,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_disjunct_cost" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (double)parse_options_get_disjunct_cost(arg1); - ST(argvi) = SWIG_From_double SWIG_PERL_CALL_ARGS_1((double)(result)); argvi++ ; + ST(argvi) = SWIG_From_double SWIG_PERL_CALL_ARGS_1(static_cast< double >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -2546,12 +2591,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_min_null_count" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_min_null_count" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_min_null_count(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -2581,9 +2626,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_min_null_count" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_min_null_count(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; 
+ ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -2611,12 +2656,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_max_null_count" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_max_null_count" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_max_null_count(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -2646,9 +2691,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_max_null_count" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_max_null_count(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -2676,12 +2721,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_islands_ok" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_islands_ok" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_islands_ok(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -2711,9 +2756,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" 
"parse_options_get_islands_ok" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_islands_ok(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -2741,12 +2786,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_short_length" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_short_length" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_short_length(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -2776,9 +2821,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_short_length" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_short_length(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -2806,12 +2851,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_max_memory" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_max_memory" "', 
argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_max_memory(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -2841,9 +2886,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_max_memory" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_max_memory(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -2871,12 +2916,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_max_parse_time" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_max_parse_time" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_max_parse_time(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -2906,9 +2951,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_max_parse_time" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_max_parse_time(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -2936,12 +2981,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_cost_model_type" "', argument " "1"" of type '" 
"Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_cost_model_type" "', argument " "2"" of type '" "Cost_Model_type""'"); } - arg2 = (Cost_Model_type)(val2); + arg2 = static_cast< Cost_Model_type >(val2); parse_options_set_cost_model_type(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -2971,9 +3016,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_cost_model_type" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (Cost_Model_type)parse_options_get_cost_model_type(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -2999,9 +3044,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_timer_expired" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_timer_expired(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -3027,9 +3072,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_memory_exhausted" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_memory_exhausted(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< 
int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -3055,9 +3100,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_resources_exhausted" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_resources_exhausted(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -3085,12 +3130,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_display_morphology" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_display_morphology" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_display_morphology(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -3120,9 +3165,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_display_morphology" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_display_morphology(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -3150,12 +3195,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_spell_guess" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options 
>(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_spell_guess" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_spell_guess(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -3185,9 +3230,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_spell_guess" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_spell_guess(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -3215,12 +3260,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_all_short_connectors" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_all_short_connectors" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_all_short_connectors(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -3250,9 +3295,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_all_short_connectors" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_all_short_connectors(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int 
>(result)); argvi++ ; XSRETURN(argvi); fail: @@ -3277,7 +3322,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_reset_resources" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); parse_options_reset_resources(arg1); ST(argvi) = sv_newmortal(); @@ -3307,12 +3352,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_use_sat_parser" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_use_sat_parser" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_use_sat_parser(arg1,arg2); ST(argvi) = sv_newmortal(); @@ -3342,9 +3387,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_use_sat_parser" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_use_sat_parser(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -3374,19 +3419,19 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_create" "', argument " "1"" of type '" "char const *""'"); } - arg1 = (char *)(buf1); + arg1 = reinterpret_cast< char * >(buf1); res2 = SWIG_ConvertPtr(ST(1), &argp2,SWIGTYPE_p_Dictionary_s, 0 | 0 ); if (!SWIG_IsOK(res2)) { SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "sentence_create" "', argument " "2"" of type '" "Dictionary""'"); } - arg2 = 
(Dictionary)(argp2); + arg2 = reinterpret_cast< Dictionary >(argp2); result = (Sentence)sentence_create((char const *)arg1,arg2); ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Sentence_s, 0 | 0); argvi++ ; - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; XSRETURN(argvi); fail: - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; SWIG_croak_null(); } @@ -3408,7 +3453,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_delete" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); sentence_delete(arg1); ST(argvi) = sv_newmortal(); @@ -3439,14 +3484,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_split" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); res2 = SWIG_ConvertPtr(ST(1), &argp2,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); if (!SWIG_IsOK(res2)) { SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "sentence_split" "', argument " "2"" of type '" "Parse_Options""'"); } - arg2 = (Parse_Options)(argp2); + arg2 = reinterpret_cast< Parse_Options >(argp2); result = (int)sentence_split(arg1,arg2); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); @@ -3477,14 +3522,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_parse" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); res2 = SWIG_ConvertPtr(ST(1), &argp2,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); if (!SWIG_IsOK(res2)) { SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "sentence_parse" "', argument " "2"" of type '" "Parse_Options""'"); } - arg2 = 
(Parse_Options)(argp2); + arg2 = reinterpret_cast< Parse_Options >(argp2); result = (int)sentence_parse(arg1,arg2); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); @@ -3512,9 +3557,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_length" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = (int)sentence_length(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -3540,9 +3585,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_null_count" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = (int)sentence_null_count(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -3568,9 +3613,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_linkages_found" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = (int)sentence_num_linkages_found(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -3596,9 +3641,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_valid_linkages" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = 
(int)sentence_num_valid_linkages(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -3624,9 +3669,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_linkages_post_processed" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = (int)sentence_num_linkages_post_processed(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -3655,14 +3700,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_violations" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "sentence_num_violations" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)sentence_num_violations(arg1,arg2); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); @@ -3693,14 +3738,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_disjunct_cost" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "sentence_disjunct_cost" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< 
int >(val2); result = (double)sentence_disjunct_cost(arg1,arg2); - ST(argvi) = SWIG_From_double SWIG_PERL_CALL_ARGS_1((double)(result)); argvi++ ; + ST(argvi) = SWIG_From_double SWIG_PERL_CALL_ARGS_1(static_cast< double >(result)); argvi++ ; XSRETURN(argvi); @@ -3731,14 +3776,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_link_cost" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "sentence_link_cost" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)sentence_link_cost(arg1,arg2); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); @@ -3772,17 +3817,17 @@ if (!SWIG_IsOK(ecode1)) { SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "linkage_create" "', argument " "1"" of type '" "int""'"); } - arg1 = (int)(val1); + arg1 = static_cast< int >(val1); res2 = SWIG_ConvertPtr(ST(1), &argp2,SWIGTYPE_p_Sentence_s, 0 | 0 ); if (!SWIG_IsOK(res2)) { SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "linkage_create" "', argument " "2"" of type '" "Sentence""'"); } - arg2 = (Sentence)(argp2); + arg2 = reinterpret_cast< Sentence >(argp2); res3 = SWIG_ConvertPtr(ST(2), &argp3,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); if (!SWIG_IsOK(res3)) { SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "linkage_create" "', argument " "3"" of type '" "Parse_Options""'"); } - arg3 = (Parse_Options)(argp3); + arg3 = reinterpret_cast< Parse_Options >(argp3); result = (Linkage)linkage_create(arg1,arg2,arg3); ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Linkage_s, 0 | 0); argvi++ ; @@ -3813,7 +3858,7 @@ if 
(!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_delete" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); linkage_delete(arg1); ST(argvi) = sv_newmortal(); @@ -3847,17 +3892,17 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_diagram" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_print_diagram" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); ecode3 = SWIG_AsVal_size_t SWIG_PERL_CALL_ARGS_2(ST(2), &val3); if (!SWIG_IsOK(ecode3)) { SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "linkage_print_diagram" "', argument " "3"" of type '" "size_t""'"); } - arg3 = (size_t)(val3); + arg3 = static_cast< size_t >(val3); result = (char *)linkage_print_diagram(arg1,arg2,arg3); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -3898,17 +3943,17 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_postscript" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_print_postscript" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); ecode3 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(2), &val3); if (!SWIG_IsOK(ecode3)) { SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "linkage_print_postscript" "', argument " "3"" of type '" "int""'"); } - arg3 = (int)(val3); + arg3 = static_cast< int >(val3); result = (char 
*)linkage_print_postscript(arg1,arg2,arg3); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -3943,7 +3988,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_links_and_domains" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_print_links_and_domains(arg1); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -3974,7 +4019,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_senses" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_print_senses(arg1); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -4008,12 +4053,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_constituent_tree" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_print_constituent_tree" "', argument " "2"" of type '" "ConstituentDisplayStyle""'"); } - arg2 = (ConstituentDisplayStyle)(val2); + arg2 = static_cast< ConstituentDisplayStyle >(val2); result = (char *)linkage_print_constituent_tree(arg1,arg2); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -4046,7 +4091,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_disjuncts" "', argument " "1"" of type '" "Linkage const""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_print_disjuncts(arg1); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -4074,7 +4119,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in 
method '" "linkage_print_pp_msgs" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_print_pp_msgs(arg1); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -4102,9 +4147,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_num_words" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (int)linkage_get_num_words(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -4130,9 +4175,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_num_links" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (int)linkage_get_num_links(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -4161,14 +4206,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_lword" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_lword" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)linkage_get_link_lword(arg1,arg2); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); @@ -4199,14 +4244,14 @@ if 
(!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_rword" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_rword" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)linkage_get_link_rword(arg1,arg2); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); @@ -4237,14 +4282,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_length" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_length" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)linkage_get_link_length(arg1,arg2); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); @@ -4275,12 +4320,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_label" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_label" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result 
= (char *)linkage_get_link_label(arg1,arg2); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -4313,12 +4358,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_llabel" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_llabel" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char *)linkage_get_link_llabel(arg1,arg2); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -4351,12 +4396,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_rlabel" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_rlabel" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char *)linkage_get_link_rlabel(arg1,arg2); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -4389,14 +4434,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_num_domains" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_num_domains" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)linkage_get_link_num_domains(arg1,arg2); - ST(argvi) = SWIG_From_int 
SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); @@ -4427,12 +4472,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_domain_names" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_domain_names" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char **)linkage_get_link_domain_names(arg1,arg2); ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_p_char, 0 | 0); argvi++ ; @@ -4462,7 +4507,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_words" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char **)linkage_get_words(arg1); ST(argvi) = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_p_char, 0 | 0); argvi++ ; @@ -4493,12 +4538,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char *)linkage_get_word(arg1,arg2); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -4512,6 +4557,158 @@ } +XS(_wrap_linkage_get_word_byte_start) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + 
int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_word_byte_start(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_byte_start" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_byte_start" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_byte_start(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_word_byte_end) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_word_byte_end(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_byte_end" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_byte_end" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_byte_end(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + 
+XS(_wrap_linkage_get_word_char_start) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_word_char_start(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_char_start" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_char_start" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_char_start(arg1,arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + +XS(_wrap_linkage_get_word_char_end) { + { + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + int argvi = 0; + int result; + dXSARGS; + + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: linkage_get_word_char_end(linkage,index);"); + } + res1 = SWIG_ConvertPtr(ST(0), &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_char_end" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int SWIG_PERL_CALL_ARGS_2(ST(1), &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_char_end" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_char_end(arg1,arg2); + ST(argvi) = SWIG_From_int 
SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + + + XSRETURN(argvi); + fail: + + + SWIG_croak_null(); + } +} + + XS(_wrap_linkage_unused_word_cost) { { Linkage arg1 = (Linkage) 0 ; @@ -4528,9 +4725,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_unused_word_cost" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (int)linkage_unused_word_cost(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -4556,9 +4753,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_disjunct_cost" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (double)linkage_disjunct_cost(arg1); - ST(argvi) = SWIG_From_double SWIG_PERL_CALL_ARGS_1((double)(result)); argvi++ ; + ST(argvi) = SWIG_From_double SWIG_PERL_CALL_ARGS_1(static_cast< double >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -4584,9 +4781,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_link_cost" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (int)linkage_link_cost(arg1); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -4612,9 +4809,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_corpus_cost" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (double)linkage_corpus_cost(arg1); - ST(argvi) = SWIG_From_double SWIG_PERL_CALL_ARGS_1((double)(result)); 
argvi++ ; + ST(argvi) = SWIG_From_double SWIG_PERL_CALL_ARGS_1(static_cast< double >(result)); argvi++ ; XSRETURN(argvi); fail: @@ -4640,7 +4837,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_violation_name" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_get_violation_name(arg1); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -4668,7 +4865,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "_lg_error_formatmsg" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo *)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); result = (char *)lg_error_formatmsg(arg1); ST(argvi) = SWIG_FromCharPtr((const char *)result); argvi++ ; @@ -4693,7 +4890,7 @@ SWIG_croak("Usage: lg_error_clearall();"); } result = (int)lg_error_clearall(); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: SWIG_croak_null(); @@ -4704,28 +4901,56 @@ XS(_wrap__prt_error) { { char *arg1 = (char *) 0 ; - void *arg2 = 0 ; + char *arg2 = (char *) 0 ; int res1 ; char *buf1 = 0 ; int alloc1 = 0 ; + int res2 ; + char *buf2 = 0 ; + int alloc2 = 0 ; int argvi = 0; int result; dXSARGS; - if (items < 1) { - SWIG_croak("Usage: _prt_error(char const *,...);"); + if ((items < 2) || (items > 2)) { + SWIG_croak("Usage: _prt_error(char const *,char const *);"); } res1 = SWIG_AsCharPtrAndSize(ST(0), &buf1, NULL, &alloc1); if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "_prt_error" "', argument " "1"" of type '" "char const *""'"); } - arg1 = (char *)(buf1); - result = (int)prt_error((char const *)arg1,arg2); - ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(result)); argvi++ ; - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + 
arg1 = reinterpret_cast< char * >(buf1); + res2 = SWIG_AsCharPtrAndSize(ST(1), &buf2, NULL, &alloc2); + if (!SWIG_IsOK(res2)) { + SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "_prt_error" "', argument " "2"" of type '" "char const *""'"); + } + arg2 = reinterpret_cast< char * >(buf2); + result = (int)prt_error((char const *)arg1,(char const *)arg2); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + if (alloc2 == SWIG_NEWOBJ) delete[] buf2; + XSRETURN(argvi); + fail: + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + if (alloc2 == SWIG_NEWOBJ) delete[] buf2; + SWIG_croak_null(); + } +} + + +XS(_wrap_lg_error_flush) { + { + int argvi = 0; + int result; + dXSARGS; + + if ((items < 0) || (items > 0)) { + SWIG_croak("Usage: lg_error_flush();"); + } + result = (int)lg_error_flush(); + ST(argvi) = SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(result)); argvi++ ; XSRETURN(argvi); fail: - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); SWIG_croak_null(); } } @@ -4736,10 +4961,10 @@ static swig_type_info _swigt__p_ConstituentDisplayStyle = {"_p_ConstituentDisplayStyle", "enum ConstituentDisplayStyle *|ConstituentDisplayStyle *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_Cost_Model_type = {"_p_Cost_Model_type", "enum Cost_Model_type *|Cost_Model_type *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_Dictionary_s = {"_p_Dictionary_s", "struct Dictionary_s *|Dictionary", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_Linkage_s = {"_p_Linkage_s", "struct Linkage_s *|Linkage", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_Parse_Options_s = {"_p_Parse_Options_s", "struct Parse_Options_s *|Parse_Options", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_Sentence_s = {"_p_Sentence_s", "Sentence|struct Sentence_s *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Dictionary_s = {"_p_Dictionary_s", "Dictionary_s *|Dictionary", 0, 0, (void*)0, 0}; 
+static swig_type_info _swigt__p_Linkage_s = {"_p_Linkage_s", "Linkage_s *|Linkage", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Parse_Options_s = {"_p_Parse_Options_s", "Parse_Options_s *|Parse_Options", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Sentence_s = {"_p_Sentence_s", "Sentence|Sentence_s *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_char = {"_p_char", "char *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_lg_errinfo = {"_p_lg_errinfo", "lg_errinfo *", 0, 0, (void*)"clinkgrammar::lg_errinfo", 0}; static swig_type_info _swigt__p_lg_error_severity = {"_p_lg_error_severity", "enum lg_error_severity *|lg_error_severity *", 0, 0, (void*)0, 0}; @@ -4804,6 +5029,7 @@ {"clinkgrammarc::lg_errinfo_text_get", _wrap_lg_errinfo_text_get}, {"clinkgrammarc::new_lg_errinfo", _wrap_new_lg_errinfo}, {"clinkgrammarc::linkgrammar_get_version", _wrap_linkgrammar_get_version}, +{"clinkgrammarc::linkgrammar_get_configuration", _wrap_linkgrammar_get_configuration}, {"clinkgrammarc::linkgrammar_get_dict_version", _wrap_linkgrammar_get_dict_version}, {"clinkgrammarc::linkgrammar_get_dict_locale", _wrap_linkgrammar_get_dict_locale}, {"clinkgrammarc::dictionary_create_lang", _wrap_dictionary_create_lang}, @@ -4879,6 +5105,10 @@ {"clinkgrammarc::linkage_get_link_domain_names", _wrap_linkage_get_link_domain_names}, {"clinkgrammarc::linkage_get_words", _wrap_linkage_get_words}, {"clinkgrammarc::linkage_get_word", _wrap_linkage_get_word}, +{"clinkgrammarc::linkage_get_word_byte_start", _wrap_linkage_get_word_byte_start}, +{"clinkgrammarc::linkage_get_word_byte_end", _wrap_linkage_get_word_byte_end}, +{"clinkgrammarc::linkage_get_word_char_start", _wrap_linkage_get_word_char_start}, +{"clinkgrammarc::linkage_get_word_char_end", _wrap_linkage_get_word_char_end}, {"clinkgrammarc::linkage_unused_word_cost", _wrap_linkage_unused_word_cost}, {"clinkgrammarc::linkage_disjunct_cost", _wrap_linkage_disjunct_cost}, {"clinkgrammarc::linkage_link_cost", 
_wrap_linkage_link_cost}, @@ -4887,6 +5117,7 @@ {"clinkgrammarc::_lg_error_formatmsg", _wrap__lg_error_formatmsg}, {"clinkgrammarc::lg_error_clearall", _wrap_lg_error_clearall}, {"clinkgrammarc::_prt_error", _wrap__prt_error}, +{"clinkgrammarc::lg_error_flush", _wrap_lg_error_flush}, {0,0} }; /* ----------------------------------------------------------------------------- @@ -5183,73 +5414,73 @@ /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = get_sv((char*) SWIG_prefix "lg_Fatal", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(lg_Fatal))); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_Fatal))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = get_sv((char*) SWIG_prefix "lg_Error", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(lg_Error))); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_Error))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = get_sv((char*) SWIG_prefix "lg_Warn", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(lg_Warn))); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_Warn))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = get_sv((char*) SWIG_prefix "lg_Info", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(lg_Info))); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_Info))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = get_sv((char*) SWIG_prefix "lg_Debug", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(lg_Debug))); + 
sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_Debug))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = get_sv((char*) SWIG_prefix "lg_Trace", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(lg_Trace))); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_Trace))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = get_sv((char*) SWIG_prefix "lg_None", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(lg_None))); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(lg_None))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; SWIG_TypeClientData(SWIGTYPE_p_lg_errinfo, (void*) "clinkgrammar::lg_errinfo"); /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = get_sv((char*) SWIG_prefix "VDAL", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(VDAL))); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(VDAL))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = get_sv((char*) SWIG_prefix "CORPUS", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(CORPUS))); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(CORPUS))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = get_sv((char*) SWIG_prefix "NO_DISPLAY", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(NO_DISPLAY))); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(NO_DISPLAY))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = 
get_sv((char*) SWIG_prefix "MULTILINE", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(MULTILINE))); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(MULTILINE))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = get_sv((char*) SWIG_prefix "BRACKET_TREE", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(BRACKET_TREE))); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(BRACKET_TREE))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = get_sv((char*) SWIG_prefix "SINGLE_LINE", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(SINGLE_LINE))); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(SINGLE_LINE))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; /*@SWIG:/usr/share/swig3.0/perl5/perltypemaps.swg,65,%set_constant@*/ do { SV *sv = get_sv((char*) SWIG_prefix "MAX_STYLES", TRUE | 0x2 | GV_ADDMULTI); - sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1((int)(MAX_STYLES))); + sv_setsv(sv, SWIG_From_int SWIG_PERL_CALL_ARGS_1(static_cast< int >(MAX_STYLES))); SvREADONLY_on(sv); } while(0) /*@SWIG@*/; ST(0) = &PL_sv_yes; diff -Nru link-grammar-5.3.16/bindings/perl/Makefile.in link-grammar-5.5.0/bindings/perl/Makefile.in --- link-grammar-5.3.16/bindings/perl/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/bindings/perl/Makefile.in 2018-04-29 21:40:02.000000000 +0000 @@ -227,6 +227,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -255,6 +256,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -263,13 
+265,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -336,12 +341,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/bindings/perl/README link-grammar-5.5.0/bindings/perl/README --- link-grammar-5.3.16/bindings/perl/README 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/bindings/perl/README 2017-08-13 22:11:23.000000000 +0000 @@ -11,6 +11,6 @@ The build of the perl bindings is disabled by default; there are c++11 incompatibilities in 5.18/CORE/perl.h that cause issues for various -compilers. You can enalbe the perl bindings by saying: +compilers. 
You can enable the perl bindings by saying: ./configure --enable-perl-bindings diff -Nru link-grammar-5.3.16/bindings/python/clinkgrammar.py link-grammar-5.5.0/bindings/python/clinkgrammar.py --- link-grammar-5.3.16/bindings/python/clinkgrammar.py 2017-01-19 18:45:13.000000000 +0000 +++ link-grammar-5.5.0/bindings/python/clinkgrammar.py 2018-04-26 02:53:16.000000000 +0000 @@ -148,6 +148,10 @@ return _clinkgrammar.linkgrammar_get_version() linkgrammar_get_version = _clinkgrammar.linkgrammar_get_version +def linkgrammar_get_configuration(): + return _clinkgrammar.linkgrammar_get_configuration() +linkgrammar_get_configuration = _clinkgrammar.linkgrammar_get_configuration + def linkgrammar_get_dict_version(dict): return _clinkgrammar.linkgrammar_get_dict_version(dict) linkgrammar_get_dict_version = _clinkgrammar.linkgrammar_get_dict_version @@ -448,6 +452,22 @@ return _clinkgrammar.linkage_get_word(linkage, w) linkage_get_word = _clinkgrammar.linkage_get_word +def linkage_get_word_byte_start(linkage, index): + return _clinkgrammar.linkage_get_word_byte_start(linkage, index) +linkage_get_word_byte_start = _clinkgrammar.linkage_get_word_byte_start + +def linkage_get_word_byte_end(linkage, index): + return _clinkgrammar.linkage_get_word_byte_end(linkage, index) +linkage_get_word_byte_end = _clinkgrammar.linkage_get_word_byte_end + +def linkage_get_word_char_start(linkage, index): + return _clinkgrammar.linkage_get_word_char_start(linkage, index) +linkage_get_word_char_start = _clinkgrammar.linkage_get_word_char_start + +def linkage_get_word_char_end(linkage, index): + return _clinkgrammar.linkage_get_word_char_end(linkage, index) +linkage_get_word_char_end = _clinkgrammar.linkage_get_word_char_end + def linkage_unused_word_cost(linkage): return _clinkgrammar.linkage_unused_word_cost(linkage) linkage_unused_word_cost = _clinkgrammar.linkage_unused_word_cost @@ -476,12 +496,16 @@ return _clinkgrammar.lg_error_clearall() lg_error_clearall = _clinkgrammar.lg_error_clearall 
-def _prt_error(arg1): - return _clinkgrammar._prt_error(arg1) +def _prt_error(arg1, arg2): + return _clinkgrammar._prt_error(arg1, arg2) _prt_error = _clinkgrammar._prt_error -def _py_error_default_handler(lge, pedh_data): - return _clinkgrammar._py_error_default_handler(lge, pedh_data) +def lg_error_flush(): + return _clinkgrammar.lg_error_flush() +lg_error_flush = _clinkgrammar.lg_error_flush + +def _py_error_default_handler(eh_lge, pedh_data): + return _clinkgrammar._py_error_default_handler(eh_lge, pedh_data) _py_error_default_handler = _clinkgrammar._py_error_default_handler def _py_error_set_handler(func_and_data): diff -Nru link-grammar-5.3.16/bindings/python/__init__.py link-grammar-5.5.0/bindings/python/__init__.py --- link-grammar-5.3.16/bindings/python/__init__.py 2017-04-15 21:51:08.000000000 +0000 +++ link-grammar-5.5.0/bindings/python/__init__.py 2018-04-29 21:40:31.000000000 +0000 @@ -1,6 +1,6 @@ # # __init__.py file for Link Grammar Python bindings # -__version__ = "5.3.16" +__version__ = "5.5.0" from .linkgrammar import * diff -Nru link-grammar-5.3.16/bindings/python/lg_python_wrap.cc link-grammar-5.5.0/bindings/python/lg_python_wrap.cc --- link-grammar-5.3.16/bindings/python/lg_python_wrap.cc 2017-01-19 18:45:13.000000000 +0000 +++ link-grammar-5.5.0/bindings/python/lg_python_wrap.cc 2018-04-26 02:53:16.000000000 +0000 @@ -15,6 +15,30 @@ #define SWIG_PYTHON_DIRECTOR_NO_VTABLE + +#ifdef __cplusplus +/* SwigValueWrapper is described in swig.swg */ +template class SwigValueWrapper { + struct SwigMovePointer { + T *ptr; + SwigMovePointer(T *p) : ptr(p) { } + ~SwigMovePointer() { delete ptr; } + SwigMovePointer& operator=(SwigMovePointer& rhs) { T* oldptr = ptr; ptr = 0; delete oldptr; ptr = rhs.ptr; rhs.ptr = 0; return *this; } + } pointer; + SwigValueWrapper& operator=(const SwigValueWrapper& rhs); + SwigValueWrapper(const SwigValueWrapper& rhs); +public: + SwigValueWrapper() : pointer(0) { } + SwigValueWrapper& operator=(const T& t) { 
SwigMovePointer tmp(new T(t)); pointer = tmp; return *this; } + operator T&() const { return *pointer.ptr; } + T *operator&() { return pointer.ptr; } +}; + +template T SwigValueInit() { + return T(); +} +#endif + /* ----------------------------------------------------------------------------- * This section contains generic SWIG labels for method/variable * declarations/attributes, and other compiler dependent labels. @@ -3021,8 +3045,81 @@ #define SWIG_VERSION SWIGVERSION -#define SWIG_as_voidptr(a) (void *)((const void *)(a)) -#define SWIG_as_voidptrptr(a) ((void)SWIG_as_voidptr(*a),(void**)(a)) +#define SWIG_as_voidptr(a) const_cast< void * >(static_cast< const void * >(a)) +#define SWIG_as_voidptrptr(a) ((void)SWIG_as_voidptr(*a),reinterpret_cast< void** >(a)) + + +#include + + +namespace swig { + class SwigPtr_PyObject { + protected: + PyObject *_obj; + + public: + SwigPtr_PyObject() :_obj(0) + { + } + + SwigPtr_PyObject(const SwigPtr_PyObject& item) : _obj(item._obj) + { + SWIG_PYTHON_THREAD_BEGIN_BLOCK; + Py_XINCREF(_obj); + SWIG_PYTHON_THREAD_END_BLOCK; + } + + SwigPtr_PyObject(PyObject *obj, bool initial_ref = true) :_obj(obj) + { + if (initial_ref) { + SWIG_PYTHON_THREAD_BEGIN_BLOCK; + Py_XINCREF(_obj); + SWIG_PYTHON_THREAD_END_BLOCK; + } + } + + SwigPtr_PyObject & operator=(const SwigPtr_PyObject& item) + { + SWIG_PYTHON_THREAD_BEGIN_BLOCK; + Py_XINCREF(item._obj); + Py_XDECREF(_obj); + _obj = item._obj; + SWIG_PYTHON_THREAD_END_BLOCK; + return *this; + } + + ~SwigPtr_PyObject() + { + SWIG_PYTHON_THREAD_BEGIN_BLOCK; + Py_XDECREF(_obj); + SWIG_PYTHON_THREAD_END_BLOCK; + } + + operator PyObject *() const + { + return _obj; + } + + PyObject *operator->() const + { + return _obj; + } + }; +} + + +namespace swig { + struct SwigVar_PyObject : SwigPtr_PyObject { + SwigVar_PyObject(PyObject* obj = 0) : SwigPtr_PyObject(obj, false) { } + + SwigVar_PyObject & operator = (PyObject* obj) + { + Py_XDECREF(_obj); + _obj = obj; + return *this; + } + }; +} @@ -3057,20 
+3154,20 @@ if (size > INT_MAX) { swig_type_info* pchar_descriptor = SWIG_pchar_descriptor(); return pchar_descriptor ? - SWIG_InternalNewPointerObj((char *)(carray), pchar_descriptor, 0) : SWIG_Py_Void(); + SWIG_InternalNewPointerObj(const_cast< char * >(carray), pchar_descriptor, 0) : SWIG_Py_Void(); } else { #if PY_VERSION_HEX >= 0x03000000 #if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) - return PyBytes_FromStringAndSize(carray, (Py_ssize_t)(size)); + return PyBytes_FromStringAndSize(carray, static_cast< Py_ssize_t >(size)); #else #if PY_VERSION_HEX >= 0x03010000 - return PyUnicode_DecodeUTF8(carray, (Py_ssize_t)(size), "surrogateescape"); + return PyUnicode_DecodeUTF8(carray, static_cast< Py_ssize_t >(size), "surrogateescape"); #else - return PyUnicode_FromStringAndSize(carray, (Py_ssize_t)(size)); + return PyUnicode_FromStringAndSize(carray, static_cast< Py_ssize_t >(size)); #endif #endif #else - return PyString_FromStringAndSize(carray, (Py_ssize_t)(size)); + return PyString_FromStringAndSize(carray, static_cast< Py_ssize_t >(size)); #endif } } else { @@ -3133,7 +3230,7 @@ if (*alloc == SWIG_NEWOBJ) #endif { - *cptr = (char *)memcpy((char *)malloc((len + 1)*sizeof(char)), cstr, sizeof(char)*(len + 1)); + *cptr = reinterpret_cast< char* >(memcpy((new char[len + 1]), cstr, sizeof(char)*(len + 1))); *alloc = SWIG_NEWOBJ; } else { *cptr = cstr; @@ -3171,7 +3268,7 @@ if (PyString_AsStringAndSize(obj, &cstr, &len) != -1) { if (cptr) { if (alloc) *alloc = SWIG_NEWOBJ; - *cptr = (char *)memcpy((char *)malloc((len + 1)*sizeof(char)), cstr, sizeof(char)*(len + 1)); + *cptr = reinterpret_cast< char* >(memcpy((new char[len + 1]), cstr, sizeof(char)*(len + 1))); } if (psize) *psize = len + 1; @@ -3346,7 +3443,7 @@ if ((v < INT_MIN || v > INT_MAX)) { return SWIG_OverflowError; } else { - if (val) *val = (int)(v); + if (val) *val = static_cast< int >(v); } } return res; @@ -3457,12 +3554,12 @@ #endif unsigned long v; res = SWIG_AsVal_unsigned_SS_long (obj, val ? 
&v : 0); - if (SWIG_IsOK(res) && val) *val = (size_t)(v); + if (SWIG_IsOK(res) && val) *val = static_cast< size_t >(v); #ifdef SWIG_LONG_LONG_AVAILABLE } else if (sizeof(size_t) <= sizeof(unsigned long long)) { unsigned long long v; res = SWIG_AsVal_unsigned_SS_long_SS_long (obj, val ? &v : 0); - if (SWIG_IsOK(res) && val) *val = (size_t)(v); + if (SWIG_IsOK(res) && val) *val = static_cast< size_t >(v); } #endif return res; @@ -3508,9 +3605,9 @@ } -void _py_error_default_handler(lg_errinfo *lge, int *pedh_data) +void _py_error_default_handler(lg_errinfo *eh_lge, int *pedh_data) { - default_error_handler(lge, (void *)pedh_data); + default_error_handler(eh_lge, (void *)pedh_data); } /** @@ -3568,6 +3665,7 @@ } void delete_lg_errinfo(lg_errinfo *lge) { + if (NULL == lge) return; /* Was None - nothing to free. */ free((void *)lge->severity_label); free((void *)lge->text); free((void *)lge); @@ -3589,9 +3687,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "lg_errinfo_severity_get" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo *)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); result = (lg_error_severity) ((arg1)->severity); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -3611,7 +3709,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "lg_errinfo_severity_label_get" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo *)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); result = (char *) ((arg1)->severity_label); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -3633,7 +3731,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "lg_errinfo_text_get" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo *)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); result = 
(char *) ((arg1)->text); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -3647,7 +3745,7 @@ lg_errinfo *result = 0 ; if (!PyArg_ParseTuple(args,(char *)":new_lg_errinfo")) SWIG_fail; - result = (lg_errinfo *)calloc(1, sizeof(lg_errinfo)); + result = (lg_errinfo *)new lg_errinfo(); resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_lg_errinfo, SWIG_POINTER_NEW | 0 ); return resultobj; fail: @@ -3675,6 +3773,19 @@ } +SWIGINTERN PyObject *_wrap_linkgrammar_get_configuration(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + char *result = 0 ; + + if (!PyArg_ParseTuple(args,(char *)":linkgrammar_get_configuration")) SWIG_fail; + result = (char *)linkgrammar_get_configuration(); + resultobj = SWIG_FromCharPtr((const char *)result); + return resultobj; +fail: + return NULL; +} + + SWIGINTERN PyObject *_wrap_linkgrammar_get_dict_version(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; Dictionary arg1 = (Dictionary) 0 ; @@ -3688,7 +3799,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkgrammar_get_dict_version" "', argument " "1"" of type '" "Dictionary""'"); } - arg1 = (Dictionary)(argp1); + arg1 = reinterpret_cast< Dictionary >(argp1); result = (char *)linkgrammar_get_dict_version(arg1); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -3710,7 +3821,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkgrammar_get_dict_locale" "', argument " "1"" of type '" "Dictionary""'"); } - arg1 = (Dictionary)(argp1); + arg1 = reinterpret_cast< Dictionary >(argp1); result = (char *)linkgrammar_get_dict_locale(arg1); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -3733,13 +3844,13 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_create_lang" "', argument " "1"" of type '" "char const *""'"); } - arg1 = (char *)(buf1); + arg1 
= reinterpret_cast< char * >(buf1); result = (Dictionary)dictionary_create_lang((char const *)arg1); resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Dictionary_s, 0 | 0 ); - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; return resultobj; fail: - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; return NULL; } @@ -3770,7 +3881,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_get_lang" "', argument " "1"" of type '" "Dictionary""'"); } - arg1 = (Dictionary)(argp1); + arg1 = reinterpret_cast< Dictionary >(argp1); result = (char *)dictionary_get_lang(arg1); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -3791,7 +3902,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_delete" "', argument " "1"" of type '" "Dictionary""'"); } - arg1 = (Dictionary)(argp1); + arg1 = reinterpret_cast< Dictionary >(argp1); dictionary_delete(arg1); resultobj = SWIG_Py_Void(); return resultobj; @@ -3813,13 +3924,13 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_set_data_dir" "', argument " "1"" of type '" "char const *""'"); } - arg1 = (char *)(buf1); + arg1 = reinterpret_cast< char * >(buf1); dictionary_set_data_dir((char const *)arg1); resultobj = SWIG_Py_Void(); - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; return resultobj; fail: - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; return NULL; } @@ -3831,7 +3942,7 @@ if (!PyArg_ParseTuple(args,(char *)":dictionary_get_data_dir")) SWIG_fail; result = (char *)dictionary_get_data_dir(); resultobj = SWIG_FromCharPtr((const char *)result); - free((char*)result); + delete[] result; return resultobj; fail: return NULL; @@ -3864,9 +3975,9 @@ if (!SWIG_IsOK(res1)) { 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_delete" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_delete(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -3889,12 +4000,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_verbosity" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_verbosity" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_verbosity(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -3916,9 +4027,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_verbosity" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_verbosity(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -3941,12 +4052,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_linkage_limit" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_linkage_limit" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); 
parse_options_set_linkage_limit(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -3968,9 +4079,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_linkage_limit" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_linkage_limit(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -3993,12 +4104,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_disjunct_cost" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_double(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_disjunct_cost" "', argument " "2"" of type '" "double""'"); } - arg2 = (double)(val2); + arg2 = static_cast< double >(val2); parse_options_set_disjunct_cost(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4020,9 +4131,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_disjunct_cost" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (double)parse_options_get_disjunct_cost(arg1); - resultobj = SWIG_From_double((double)(result)); + resultobj = SWIG_From_double(static_cast< double >(result)); return resultobj; fail: return NULL; @@ -4045,12 +4156,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_min_null_count" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if 
(!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_min_null_count" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_min_null_count(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4072,9 +4183,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_min_null_count" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_min_null_count(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4097,12 +4208,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_max_null_count" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_max_null_count" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_max_null_count(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4124,9 +4235,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_max_null_count" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_max_null_count(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4149,12 +4260,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_islands_ok" 
"', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_islands_ok" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_islands_ok(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4176,9 +4287,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_islands_ok" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_islands_ok(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4201,12 +4312,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_short_length" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_short_length" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_short_length(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4228,9 +4339,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_short_length" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_short_length(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; 
fail: return NULL; @@ -4253,12 +4364,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_max_memory" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_max_memory" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_max_memory(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4280,9 +4391,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_max_memory" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_max_memory(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4305,12 +4416,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_max_parse_time" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_max_parse_time" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_max_parse_time(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4332,9 +4443,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_max_parse_time" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = 
(int)parse_options_get_max_parse_time(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4357,12 +4468,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_cost_model_type" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_cost_model_type" "', argument " "2"" of type '" "Cost_Model_type""'"); } - arg2 = (Cost_Model_type)(val2); + arg2 = static_cast< Cost_Model_type >(val2); parse_options_set_cost_model_type(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4384,9 +4495,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_cost_model_type" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (Cost_Model_type)parse_options_get_cost_model_type(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4406,9 +4517,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_timer_expired" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_timer_expired(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4428,9 +4539,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_memory_exhausted" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = 
(Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_memory_exhausted(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4450,9 +4561,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_resources_exhausted" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_resources_exhausted(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4475,12 +4586,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_display_morphology" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_display_morphology" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_display_morphology(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4502,9 +4613,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_display_morphology" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_display_morphology(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4527,12 +4638,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_spell_guess" "', argument " "1"" 
of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_spell_guess" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_spell_guess(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4554,9 +4665,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_spell_guess" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_spell_guess(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4579,12 +4690,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_all_short_connectors" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_all_short_connectors" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_all_short_connectors(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4606,9 +4717,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_all_short_connectors" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_all_short_connectors(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int 
>(result)); return resultobj; fail: return NULL; @@ -4627,7 +4738,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_reset_resources" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); parse_options_reset_resources(arg1); resultobj = SWIG_Py_Void(); return resultobj; @@ -4652,12 +4763,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_use_sat_parser" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_use_sat_parser" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_use_sat_parser(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4679,9 +4790,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_use_sat_parser" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_use_sat_parser(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4706,18 +4817,18 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_create" "', argument " "1"" of type '" "char const *""'"); } - arg1 = (char *)(buf1); + arg1 = reinterpret_cast< char * >(buf1); res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_Dictionary_s, 0 | 0 ); if (!SWIG_IsOK(res2)) { SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "sentence_create" "', argument " "2"" of type '" "Dictionary""'"); } - arg2 = (Dictionary)(argp2); + arg2 = 
reinterpret_cast< Dictionary >(argp2); result = (Sentence)sentence_create((char const *)arg1,arg2); resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Sentence_s, 0 | 0 ); - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; return resultobj; fail: - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; return NULL; } @@ -4734,7 +4845,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_delete" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); sentence_delete(arg1); resultobj = SWIG_Py_Void(); return resultobj; @@ -4760,14 +4871,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_split" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); if (!SWIG_IsOK(res2)) { SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "sentence_split" "', argument " "2"" of type '" "Parse_Options""'"); } - arg2 = (Parse_Options)(argp2); + arg2 = reinterpret_cast< Parse_Options >(argp2); result = (int)sentence_split(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4791,14 +4902,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_parse" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); if (!SWIG_IsOK(res2)) { SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "sentence_parse" "', argument " "2"" of type '" "Parse_Options""'"); } - arg2 = (Parse_Options)(argp2); + arg2 = reinterpret_cast< Parse_Options 
>(argp2); result = (int)sentence_parse(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4818,9 +4929,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_length" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = (int)sentence_length(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4840,9 +4951,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_null_count" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = (int)sentence_null_count(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4862,9 +4973,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_linkages_found" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = (int)sentence_num_linkages_found(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4884,9 +4995,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_valid_linkages" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = (int)sentence_num_valid_linkages(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4906,9 +5017,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), 
"in method '" "sentence_num_linkages_post_processed" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = (int)sentence_num_linkages_post_processed(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4932,14 +5043,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_violations" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "sentence_num_violations" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)sentence_num_violations(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4963,14 +5074,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_disjunct_cost" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "sentence_disjunct_cost" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (double)sentence_disjunct_cost(arg1,arg2); - resultobj = SWIG_From_double((double)(result)); + resultobj = SWIG_From_double(static_cast< double >(result)); return resultobj; fail: return NULL; @@ -4994,14 +5105,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_link_cost" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence 
>(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "sentence_link_cost" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)sentence_link_cost(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5029,17 +5140,17 @@ if (!SWIG_IsOK(ecode1)) { SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "linkage_create" "', argument " "1"" of type '" "int""'"); } - arg1 = (int)(val1); + arg1 = static_cast< int >(val1); res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_Sentence_s, 0 | 0 ); if (!SWIG_IsOK(res2)) { SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "linkage_create" "', argument " "2"" of type '" "Sentence""'"); } - arg2 = (Sentence)(argp2); + arg2 = reinterpret_cast< Sentence >(argp2); res3 = SWIG_ConvertPtr(obj2, &argp3,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); if (!SWIG_IsOK(res3)) { SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "linkage_create" "', argument " "3"" of type '" "Parse_Options""'"); } - arg3 = (Parse_Options)(argp3); + arg3 = reinterpret_cast< Parse_Options >(argp3); result = (Linkage)linkage_create(arg1,arg2,arg3); resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Linkage_s, 0 | 0 ); return resultobj; @@ -5060,7 +5171,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_delete" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); linkage_delete(arg1); resultobj = SWIG_Py_Void(); return resultobj; @@ -5090,17 +5201,17 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_diagram" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, 
&val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_print_diagram" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); ecode3 = SWIG_AsVal_size_t(obj2, &val3); if (!SWIG_IsOK(ecode3)) { SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "linkage_print_diagram" "', argument " "3"" of type '" "size_t""'"); } - arg3 = (size_t)(val3); + arg3 = static_cast< size_t >(val3); result = (char *)linkage_print_diagram(arg1,arg2,arg3); resultobj = SWIG_FromCharPtr((const char *)result); { @@ -5133,17 +5244,17 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_postscript" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_print_postscript" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); ecode3 = SWIG_AsVal_int(obj2, &val3); if (!SWIG_IsOK(ecode3)) { SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "linkage_print_postscript" "', argument " "3"" of type '" "int""'"); } - arg3 = (int)(val3); + arg3 = static_cast< int >(val3); result = (char *)linkage_print_postscript(arg1,arg2,arg3); resultobj = SWIG_FromCharPtr((const char *)result); { @@ -5168,7 +5279,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_links_and_domains" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_print_links_and_domains(arg1); resultobj = SWIG_FromCharPtr((const char *)result); { @@ -5193,7 +5304,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_senses" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = 
(Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_print_senses(arg1); resultobj = SWIG_FromCharPtr((const char *)result); { @@ -5222,12 +5333,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_constituent_tree" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_print_constituent_tree" "', argument " "2"" of type '" "ConstituentDisplayStyle""'"); } - arg2 = (ConstituentDisplayStyle)(val2); + arg2 = static_cast< ConstituentDisplayStyle >(val2); result = (char *)linkage_print_constituent_tree(arg1,arg2); resultobj = SWIG_FromCharPtr((const char *)result); { @@ -5252,7 +5363,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_disjuncts" "', argument " "1"" of type '" "Linkage const""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_print_disjuncts(arg1); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5274,7 +5385,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_pp_msgs" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_print_pp_msgs(arg1); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5296,9 +5407,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_num_words" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (int)linkage_get_num_words(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return 
NULL; @@ -5318,9 +5429,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_num_links" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (int)linkage_get_num_links(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5344,14 +5455,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_lword" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_lword" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)linkage_get_link_lword(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5375,14 +5486,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_rword" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_rword" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)linkage_get_link_rword(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5406,14 +5517,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_length" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); 
+ arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_length" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)linkage_get_link_length(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5437,12 +5548,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_label" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_label" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char *)linkage_get_link_label(arg1,arg2); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5468,12 +5579,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_llabel" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_llabel" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char *)linkage_get_link_llabel(arg1,arg2); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5499,12 +5610,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_rlabel" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, 
&val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_rlabel" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char *)linkage_get_link_rlabel(arg1,arg2); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5530,14 +5641,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_num_domains" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_num_domains" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)linkage_get_link_num_domains(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5561,12 +5672,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_domain_names" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_domain_names" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char **)linkage_get_link_domain_names(arg1,arg2); resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_p_char, 0 | 0 ); return resultobj; @@ -5588,7 +5699,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_words" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char **)linkage_get_words(arg1); 
resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_p_char, 0 | 0 ); return resultobj; @@ -5614,12 +5725,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char *)linkage_get_word(arg1,arg2); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5628,6 +5739,130 @@ } +SWIGINTERN PyObject *_wrap_linkage_get_word_byte_start(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + PyObject * obj0 = 0 ; + PyObject * obj1 = 0 ; + int result; + + if (!PyArg_ParseTuple(args,(char *)"OO:linkage_get_word_byte_start",&obj0,&obj1)) SWIG_fail; + res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_byte_start" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int(obj1, &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_byte_start" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_byte_start(arg1,arg2); + resultobj = SWIG_From_int(static_cast< int >(result)); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_linkage_get_word_byte_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 
0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + PyObject * obj0 = 0 ; + PyObject * obj1 = 0 ; + int result; + + if (!PyArg_ParseTuple(args,(char *)"OO:linkage_get_word_byte_end",&obj0,&obj1)) SWIG_fail; + res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_byte_end" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int(obj1, &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_byte_end" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_byte_end(arg1,arg2); + resultobj = SWIG_From_int(static_cast< int >(result)); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_linkage_get_word_char_start(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + PyObject * obj0 = 0 ; + PyObject * obj1 = 0 ; + int result; + + if (!PyArg_ParseTuple(args,(char *)"OO:linkage_get_word_char_start",&obj0,&obj1)) SWIG_fail; + res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_char_start" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int(obj1, &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_char_start" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_char_start(arg1,arg2); + resultobj = SWIG_From_int(static_cast< int >(result)); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject 
*_wrap_linkage_get_word_char_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + PyObject * obj0 = 0 ; + PyObject * obj1 = 0 ; + int result; + + if (!PyArg_ParseTuple(args,(char *)"OO:linkage_get_word_char_end",&obj0,&obj1)) SWIG_fail; + res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_char_end" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int(obj1, &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_char_end" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_char_end(arg1,arg2); + resultobj = SWIG_From_int(static_cast< int >(result)); + return resultobj; +fail: + return NULL; +} + + SWIGINTERN PyObject *_wrap_linkage_unused_word_cost(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; Linkage arg1 = (Linkage) 0 ; @@ -5641,9 +5876,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_unused_word_cost" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (int)linkage_unused_word_cost(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5663,9 +5898,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_disjunct_cost" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (double)linkage_disjunct_cost(arg1); - resultobj = SWIG_From_double((double)(result)); + resultobj = 
SWIG_From_double(static_cast< double >(result)); return resultobj; fail: return NULL; @@ -5685,9 +5920,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_link_cost" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (int)linkage_link_cost(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5707,9 +5942,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_corpus_cost" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (double)linkage_corpus_cost(arg1); - resultobj = SWIG_From_double((double)(result)); + resultobj = SWIG_From_double(static_cast< double >(result)); return resultobj; fail: return NULL; @@ -5729,7 +5964,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_violation_name" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_get_violation_name(arg1); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5751,7 +5986,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "_lg_error_formatmsg" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo *)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); result = (char *)lg_error_formatmsg(arg1); resultobj = SWIG_FromCharPtr((const char *)result); { @@ -5769,50 +6004,60 @@ if (!PyArg_ParseTuple(args,(char *)":lg_error_clearall")) SWIG_fail; result = (int)lg_error_clearall(); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap__prt_error__varargs__(PyObject 
*SWIGUNUSEDPARM(self), PyObject *args, PyObject *varargs) { +SWIGINTERN PyObject *_wrap__prt_error(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; char *arg1 = (char *) 0 ; - void *arg2 = 0 ; + char *arg2 = (char *) 0 ; int res1 ; char *buf1 = 0 ; int alloc1 = 0 ; + int res2 ; + char *buf2 = 0 ; + int alloc2 = 0 ; PyObject * obj0 = 0 ; + PyObject * obj1 = 0 ; int result; - if (!PyArg_ParseTuple(args,(char *)"O:_prt_error",&obj0)) SWIG_fail; + if (!PyArg_ParseTuple(args,(char *)"OO:_prt_error",&obj0,&obj1)) SWIG_fail; res1 = SWIG_AsCharPtrAndSize(obj0, &buf1, NULL, &alloc1); if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "_prt_error" "', argument " "1"" of type '" "char const *""'"); } - arg1 = (char *)(buf1); - result = (int)prt_error((char const *)arg1,arg2); - resultobj = SWIG_From_int((int)(result)); - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + arg1 = reinterpret_cast< char * >(buf1); + res2 = SWIG_AsCharPtrAndSize(obj1, &buf2, NULL, &alloc2); + if (!SWIG_IsOK(res2)) { + SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "_prt_error" "', argument " "2"" of type '" "char const *""'"); + } + arg2 = reinterpret_cast< char * >(buf2); + result = (int)prt_error((char const *)arg1,(char const *)arg2); + resultobj = SWIG_From_int(static_cast< int >(result)); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + if (alloc2 == SWIG_NEWOBJ) delete[] buf2; return resultobj; fail: - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + if (alloc2 == SWIG_NEWOBJ) delete[] buf2; return NULL; } -SWIGINTERN PyObject *_wrap__prt_error(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj; - PyObject *varargs; - PyObject *newargs; +SWIGINTERN PyObject *_wrap_lg_error_flush(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + int result; - newargs = PyTuple_GetSlice(args,0,1); - varargs = PyTuple_GetSlice(args,1,PyTuple_Size(args)); - 
resultobj = _wrap__prt_error__varargs__(NULL,newargs,varargs); - Py_XDECREF(newargs); - Py_XDECREF(varargs); + if (!PyArg_ParseTuple(args,(char *)":lg_error_flush")) SWIG_fail; + result = (int)lg_error_flush(); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; +fail: + return NULL; } @@ -5820,19 +6065,24 @@ PyObject *resultobj = 0; lg_errinfo *arg1 = (lg_errinfo *) 0 ; int *arg2 = (int *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; + int arg20 ; PyObject * obj0 = 0 ; PyObject * obj1 = 0 ; if (!PyArg_ParseTuple(args,(char *)"OO:_py_error_default_handler",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_lg_errinfo, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "_py_error_default_handler" "', argument " "1"" of type '" "lg_errinfo *""'"); + { + void *argp1 = 0; + + if (Py_None == obj0) + SWIG_exception_fail(SWIG_TypeError, "in method '_py_error_default_handler', argument 1 (of type lg_errinfo *) must not be None."); + + int res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_lg_errinfo, 0); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '_py_error_default_handler', argument 1 of type 'lg_errinfo *'"); + } + arg1 = (lg_errinfo *)(argp1); } - arg1 = (lg_errinfo *)(argp1); { - int arg; int error = false; const char errmsg[] = "The default error handler data argument (arg 2) " "must be an integer (0 to lg_None) or None."; @@ -5850,17 +6100,17 @@ } else { - arg = (int)PyInt_AsLong(obj1); + arg20 = (int)PyInt_AsLong(obj1); } - if ((arg < 0) || (arg > lg_None)) + if ((arg20 < 0) || (arg20 > lg_None)) { SWIG_exception_fail(SWIG_ValueError, errmsg); error = true; } if (error) return NULL; - arg2 = &arg; + arg2 = &arg20; } } _py_error_default_handler(arg1,arg2); @@ -5915,7 +6165,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_lg_errinfo" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo 
*)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); delete_lg_errinfo(arg1); resultobj = SWIG_Py_Void(); return resultobj; @@ -5932,6 +6182,7 @@ { (char *)"new_lg_errinfo", _wrap_new_lg_errinfo, METH_VARARGS, NULL}, { (char *)"lg_errinfo_swigregister", lg_errinfo_swigregister, METH_VARARGS, NULL}, { (char *)"linkgrammar_get_version", _wrap_linkgrammar_get_version, METH_VARARGS, NULL}, + { (char *)"linkgrammar_get_configuration", _wrap_linkgrammar_get_configuration, METH_VARARGS, NULL}, { (char *)"linkgrammar_get_dict_version", _wrap_linkgrammar_get_dict_version, METH_VARARGS, NULL}, { (char *)"linkgrammar_get_dict_locale", _wrap_linkgrammar_get_dict_locale, METH_VARARGS, NULL}, { (char *)"dictionary_create_lang", _wrap_dictionary_create_lang, METH_VARARGS, NULL}, @@ -6007,6 +6258,10 @@ { (char *)"linkage_get_link_domain_names", _wrap_linkage_get_link_domain_names, METH_VARARGS, NULL}, { (char *)"linkage_get_words", _wrap_linkage_get_words, METH_VARARGS, NULL}, { (char *)"linkage_get_word", _wrap_linkage_get_word, METH_VARARGS, NULL}, + { (char *)"linkage_get_word_byte_start", _wrap_linkage_get_word_byte_start, METH_VARARGS, NULL}, + { (char *)"linkage_get_word_byte_end", _wrap_linkage_get_word_byte_end, METH_VARARGS, NULL}, + { (char *)"linkage_get_word_char_start", _wrap_linkage_get_word_char_start, METH_VARARGS, NULL}, + { (char *)"linkage_get_word_char_end", _wrap_linkage_get_word_char_end, METH_VARARGS, NULL}, { (char *)"linkage_unused_word_cost", _wrap_linkage_unused_word_cost, METH_VARARGS, NULL}, { (char *)"linkage_disjunct_cost", _wrap_linkage_disjunct_cost, METH_VARARGS, NULL}, { (char *)"linkage_link_cost", _wrap_linkage_link_cost, METH_VARARGS, NULL}, @@ -6015,6 +6270,7 @@ { (char *)"_lg_error_formatmsg", _wrap__lg_error_formatmsg, METH_VARARGS, NULL}, { (char *)"lg_error_clearall", _wrap_lg_error_clearall, METH_VARARGS, NULL}, { (char *)"_prt_error", _wrap__prt_error, METH_VARARGS, NULL}, + { (char *)"lg_error_flush", _wrap_lg_error_flush, 
METH_VARARGS, NULL}, { (char *)"_py_error_default_handler", _wrap__py_error_default_handler, METH_VARARGS, NULL}, { (char *)"_py_error_set_handler", _wrap__py_error_set_handler, METH_VARARGS, NULL}, { (char *)"_py_error_printall", _wrap__py_error_printall, METH_VARARGS, NULL}, @@ -6027,10 +6283,10 @@ static swig_type_info _swigt__p_ConstituentDisplayStyle = {"_p_ConstituentDisplayStyle", "enum ConstituentDisplayStyle *|ConstituentDisplayStyle *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_Cost_Model_type = {"_p_Cost_Model_type", "enum Cost_Model_type *|Cost_Model_type *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_Dictionary_s = {"_p_Dictionary_s", "struct Dictionary_s *|Dictionary", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_Linkage_s = {"_p_Linkage_s", "struct Linkage_s *|Linkage", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_Parse_Options_s = {"_p_Parse_Options_s", "struct Parse_Options_s *|Parse_Options", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_Sentence_s = {"_p_Sentence_s", "Sentence|struct Sentence_s *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Dictionary_s = {"_p_Dictionary_s", "Dictionary_s *|Dictionary", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Linkage_s = {"_p_Linkage_s", "Linkage_s *|Linkage", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Parse_Options_s = {"_p_Parse_Options_s", "Parse_Options_s *|Parse_Options", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Sentence_s = {"_p_Sentence_s", "Sentence|Sentence_s *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_char = {"_p_char", "char *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_int = {"_p_int", "int *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_lg_errinfo = {"_p_lg_errinfo", "lg_errinfo *", 0, 0, (void*)0, 0}; @@ -6773,20 +7029,20 @@ SWIG_InstallConstants(d,swig_const_table); - SWIG_Python_SetConstant(d, "lg_Fatal",SWIG_From_int((int)(lg_Fatal))); - SWIG_Python_SetConstant(d, 
"lg_Error",SWIG_From_int((int)(lg_Error))); - SWIG_Python_SetConstant(d, "lg_Warn",SWIG_From_int((int)(lg_Warn))); - SWIG_Python_SetConstant(d, "lg_Info",SWIG_From_int((int)(lg_Info))); - SWIG_Python_SetConstant(d, "lg_Debug",SWIG_From_int((int)(lg_Debug))); - SWIG_Python_SetConstant(d, "lg_Trace",SWIG_From_int((int)(lg_Trace))); - SWIG_Python_SetConstant(d, "lg_None",SWIG_From_int((int)(lg_None))); - SWIG_Python_SetConstant(d, "VDAL",SWIG_From_int((int)(VDAL))); - SWIG_Python_SetConstant(d, "CORPUS",SWIG_From_int((int)(CORPUS))); - SWIG_Python_SetConstant(d, "NO_DISPLAY",SWIG_From_int((int)(NO_DISPLAY))); - SWIG_Python_SetConstant(d, "MULTILINE",SWIG_From_int((int)(MULTILINE))); - SWIG_Python_SetConstant(d, "BRACKET_TREE",SWIG_From_int((int)(BRACKET_TREE))); - SWIG_Python_SetConstant(d, "SINGLE_LINE",SWIG_From_int((int)(SINGLE_LINE))); - SWIG_Python_SetConstant(d, "MAX_STYLES",SWIG_From_int((int)(MAX_STYLES))); + SWIG_Python_SetConstant(d, "lg_Fatal",SWIG_From_int(static_cast< int >(lg_Fatal))); + SWIG_Python_SetConstant(d, "lg_Error",SWIG_From_int(static_cast< int >(lg_Error))); + SWIG_Python_SetConstant(d, "lg_Warn",SWIG_From_int(static_cast< int >(lg_Warn))); + SWIG_Python_SetConstant(d, "lg_Info",SWIG_From_int(static_cast< int >(lg_Info))); + SWIG_Python_SetConstant(d, "lg_Debug",SWIG_From_int(static_cast< int >(lg_Debug))); + SWIG_Python_SetConstant(d, "lg_Trace",SWIG_From_int(static_cast< int >(lg_Trace))); + SWIG_Python_SetConstant(d, "lg_None",SWIG_From_int(static_cast< int >(lg_None))); + SWIG_Python_SetConstant(d, "VDAL",SWIG_From_int(static_cast< int >(VDAL))); + SWIG_Python_SetConstant(d, "CORPUS",SWIG_From_int(static_cast< int >(CORPUS))); + SWIG_Python_SetConstant(d, "NO_DISPLAY",SWIG_From_int(static_cast< int >(NO_DISPLAY))); + SWIG_Python_SetConstant(d, "MULTILINE",SWIG_From_int(static_cast< int >(MULTILINE))); + SWIG_Python_SetConstant(d, "BRACKET_TREE",SWIG_From_int(static_cast< int >(BRACKET_TREE))); + SWIG_Python_SetConstant(d, 
"SINGLE_LINE",SWIG_From_int(static_cast< int >(SINGLE_LINE))); + SWIG_Python_SetConstant(d, "MAX_STYLES",SWIG_From_int(static_cast< int >(MAX_STYLES))); #if PY_VERSION_HEX >= 0x03000000 return m; #else diff -Nru link-grammar-5.3.16/bindings/python/linkgrammar.py link-grammar-5.5.0/bindings/python/linkgrammar.py --- link-grammar-5.3.16/bindings/python/linkgrammar.py 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/bindings/python/linkgrammar.py 2018-04-23 19:49:07.000000000 +0000 @@ -80,8 +80,8 @@ def verbosity(self, value): if not isinstance(value, int): raise TypeError("Verbosity must be set to an integer") - if value not in range(0,15): - raise ValueError("Verbosity levels can be any integer between 0 and 15 inclusive") + if value not in range(0,120): + raise ValueError("Verbosity levels can be any integer between 0 and 120 inclusive") clg.parse_options_set_verbosity(self._obj, value) @property @@ -289,6 +289,10 @@ # return clg.lg_error_formatmsg(lgerror) @staticmethod + def flush(): + return clg.lg_error_flush() + + @staticmethod def printall(ehandler_func, ehandler_data=None): return clg._py_error_printall((ehandler_func, ehandler_data)) @@ -298,7 +302,19 @@ @staticmethod def message(msg): - return clg._prt_error(msg) + """Print a message through the LG error facility""" + # Propagate a possible ending "\n" into the format, from which the LG + # error facility determine if this is a partial or a complete message. 
+ if msg[-1:] == "\n": # a newline-ended complete message + _local_eol = "\n" + msg = msg[:-1] + elif msg[-2:] == "\n\\": # a newline-ended partial message + _local_eol = "" + msg = msg[:-1] + else: + _local_eol = "" # a partial message + + return clg._prt_error('%s'+_local_eol, msg) @staticmethod def _default_handler(errinfo, data): @@ -434,6 +450,18 @@ def constituent_tree(self, mode=1): return clg.linkage_print_constituent_tree(self._obj, mode) + def word_byte_start(self, w): + return clg.linkage_get_word_byte_start(self._obj, w) + + def word_byte_end(self, w): + return clg.linkage_get_word_byte_end(self._obj, w) + + def word_char_start(self, w): + return clg.linkage_get_word_char_start(self._obj, w) + + def word_char_end(self, w): + return clg.linkage_get_word_char_end(self._obj, w) + class LG_TimerExhausted(LG_Error): pass @@ -477,11 +505,12 @@ return clg.sentence_null_count(self._obj) class sentence_parse(object): - def __init__(self, sent): + def __init__(self, sent, parse_options): self.sent = sent self.num = 0 - self.rc = clg.sentence_parse(sent._obj, sent.parse_options._obj) - if clg.parse_options_timer_expired(sent.parse_options._obj): + self.parse_options = sent.parse_options if parse_options is None else parse_options + self.rc = clg.sentence_parse(sent._obj, self.parse_options._obj) + if clg.parse_options_timer_expired(self.parse_options._obj): raise LG_TimerExhausted() def __nonzero__(self): @@ -501,7 +530,7 @@ def next(self): if self.num == clg.sentence_num_valid_linkages(self.sent._obj): raise StopIteration() - linkage = Linkage(self.num, self.sent, self.sent.parse_options._obj) + linkage = Linkage(self.num, self.sent, self.parse_options._obj) if not linkage: # SAT sentinel value raise StopIteration() self.num += 1 @@ -509,5 +538,5 @@ __next__ = next # Account python3 - def parse(self): - return self.sentence_parse(self) + def parse(self, parse_options=None): + return self.sentence_parse(self, parse_options) diff -Nru 
link-grammar-5.3.16/bindings/python/Makefile.am link-grammar-5.5.0/bindings/python/Makefile.am --- link-grammar-5.3.16/bindings/python/Makefile.am 2017-02-12 11:46:33.000000000 +0000 +++ link-grammar-5.5.0/bindings/python/Makefile.am 2018-03-04 14:56:46.000000000 +0000 @@ -6,9 +6,8 @@ SWIG_SOURCES = ../swig/link_grammar.i SWIG_INCLUDES = ../../link-grammar/link-includes.h -# BUILT_C_SOURCES = $(top_builddir)/bindings/python/lg_python_wrap.cc -BUILT_C_SOURCES = ../../bindings/python/lg_python_wrap.cc -BUILT_PY_SOURCES = $(top_builddir)/bindings/python/clinkgrammar.py +built_c_sources = lg_python_wrap.cc +built_py_sources = $(top_builddir)/bindings/python/clinkgrammar.py # Over-ride the install location, so as to remove the dash in the # directory "link-grammar". Turns out python cannot tolerate dashes. @@ -29,8 +28,8 @@ # These are packaged in the tarball; make clean should not remove them. maintainer-clean-local: - -rm -f $(BUILT_C_SOURCES) - -rm -f $(BUILT_PY_SOURCES) + -rm -f $(built_c_sources) + -rm -f $(built_py_sources) # Don't remove __init__.py; it is built by configure! DISTCLEANFILES = \ @@ -38,12 +37,12 @@ if HAVE_SWIG # Swig builds these .... -$(BUILT_C_SOURCES) $(BUILT_PY_SOURCES): $(SWIG_INCLUDES) -$(BUILT_C_SOURCES) $(BUILT_PY_SOURCES): $(SWIG_SOURCES) +$(built_c_sources) $(built_py_sources): $(SWIG_INCLUDES) +$(built_c_sources) $(built_py_sources): $(SWIG_SOURCES) $(SWIG) -python -module clinkgrammar -I$(top_srcdir)/link-grammar -o $@ $< else -$(BUILT_C_SOURCES) $(BUILT_PY_SOURCES): - touch $(BUILT_C_SOURCES) $(BUILT_PY_SOURCES) +$(built_c_sources) $(built_py_sources): + touch $(built_c_sources) $(built_py_sources) endif # The la MUST have the same name as the pm, @@ -52,7 +51,7 @@ # interfaces. 
pkgpyexec_LTLIBRARIES = _clinkgrammar.la -_clinkgrammar_la_SOURCES = $(BUILT_C_SOURCES) $(SWIG_SOURCES) +_clinkgrammar_la_SOURCES = $(built_c_sources) $(SWIG_SOURCES) # $(top_builddir) to pick up autogen'ed link-grammar/link-features.h _clinkgrammar_la_CPPFLAGS = \ @@ -61,10 +60,11 @@ -I$(top_srcdir) \ -I$(top_builddir) -# On Cygwin, a DLL with version is named name-major.dll with no symlink to -# it of an unversioned name (at least up and including libtool 2.4.6). -# This is bad for Python modules, as they must have an identifier name. -if OS_CYGWIN +# On Cygwin and MinGW, a DLL with version is named name-major.dll with no +# symlink to it of an unversioned name (at least up and including libtool +# 2.4.6). This is bad for Python modules, as they must have an identifier +# name. +if OS_WIN32 AVOID_VERSION = -avoid-version endif _clinkgrammar_la_LDFLAGS = \ diff -Nru link-grammar-5.3.16/bindings/python/Makefile.in link-grammar-5.5.0/bindings/python/Makefile.in --- link-grammar-5.3.16/bindings/python/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/bindings/python/Makefile.in 2018-04-29 21:40:02.000000000 +0000 @@ -141,9 +141,7 @@ _clinkgrammar_la_DEPENDENCIES = \ $(top_builddir)/link-grammar/liblink-grammar.la \ $(am__DEPENDENCIES_1) -am__dirstamp = $(am__leading_dot)dirstamp -am__objects_1 = \ - ../../bindings/python/_clinkgrammar_la-lg_python_wrap.lo +am__objects_1 = _clinkgrammar_la-lg_python_wrap.lo am__objects_2 = am__clinkgrammar_la_OBJECTS = $(am__objects_1) $(am__objects_2) _clinkgrammar_la_OBJECTS = $(am__clinkgrammar_la_OBJECTS) @@ -228,6 +226,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -256,6 +255,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -264,13 +264,16 @@ INSTALL_SCRIPT = 
@INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -337,12 +340,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -410,9 +413,8 @@ top_srcdir = @top_srcdir@ SWIG_SOURCES = ../swig/link_grammar.i SWIG_INCLUDES = ../../link-grammar/link-includes.h -# BUILT_C_SOURCES = $(top_builddir)/bindings/python/lg_python_wrap.cc -BUILT_C_SOURCES = ../../bindings/python/lg_python_wrap.cc -BUILT_PY_SOURCES = $(top_builddir)/bindings/python/clinkgrammar.py +built_c_sources = lg_python_wrap.cc +built_py_sources = $(top_builddir)/bindings/python/clinkgrammar.py # Files that get installed in $pkgpythondir pkgpython_PYTHON = \ @@ -431,7 +433,7 @@ # We're calling this clinkgrammar, since its a direct map of the C # interfaces. pkgpyexec_LTLIBRARIES = _clinkgrammar.la -_clinkgrammar_la_SOURCES = $(BUILT_C_SOURCES) $(SWIG_SOURCES) +_clinkgrammar_la_SOURCES = $(built_c_sources) $(SWIG_SOURCES) # $(top_builddir) to pick up autogen'ed link-grammar/link-features.h _clinkgrammar_la_CPPFLAGS = \ @@ -441,10 +443,11 @@ -I$(top_builddir) -# On Cygwin, a DLL with version is named name-major.dll with no symlink to -# it of an unversioned name (at least up and including libtool 2.4.6). -# This is bad for Python modules, as they must have an identifier name. 
-@OS_CYGWIN_TRUE@AVOID_VERSION = -avoid-version +# On Cygwin and MinGW, a DLL with version is named name-major.dll with no +# symlink to it of an unversioned name (at least up and including libtool +# 2.4.6). This is bad for Python modules, as they must have an identifier +# name. +@OS_WIN32_TRUE@AVOID_VERSION = -avoid-version _clinkgrammar_la_LDFLAGS = \ -version-info @VERSION_INFO@ $(AVOID_VERSION) \ $(PYTHON2_LDFLAGS) -module -no-undefined @@ -529,28 +532,17 @@ echo rm -f $${locs}; \ rm -f $${locs}; \ } -../../bindings/python/$(am__dirstamp): - @$(MKDIR_P) ../../bindings/python - @: > ../../bindings/python/$(am__dirstamp) -../../bindings/python/$(DEPDIR)/$(am__dirstamp): - @$(MKDIR_P) ../../bindings/python/$(DEPDIR) - @: > ../../bindings/python/$(DEPDIR)/$(am__dirstamp) -../../bindings/python/_clinkgrammar_la-lg_python_wrap.lo: \ - ../../bindings/python/$(am__dirstamp) \ - ../../bindings/python/$(DEPDIR)/$(am__dirstamp) _clinkgrammar.la: $(_clinkgrammar_la_OBJECTS) $(_clinkgrammar_la_DEPENDENCIES) $(EXTRA__clinkgrammar_la_DEPENDENCIES) $(AM_V_CXXLD)$(_clinkgrammar_la_LINK) -rpath $(pkgpyexecdir) $(_clinkgrammar_la_OBJECTS) $(_clinkgrammar_la_LIBADD) $(LIBS) mostlyclean-compile: -rm -f *.$(OBJEXT) - -rm -f ../../bindings/python/*.$(OBJEXT) - -rm -f ../../bindings/python/*.lo distclean-compile: -rm -f *.tab.c -@AMDEP_TRUE@@am__include@ @am__quote@../../bindings/python/$(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Plo@am__quote@ .cc.o: @am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\ @@ -576,19 +568,18 @@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $< -../../bindings/python/_clinkgrammar_la-lg_python_wrap.lo: ../../bindings/python/lg_python_wrap.cc -@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) 
--tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(_clinkgrammar_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT ../../bindings/python/_clinkgrammar_la-lg_python_wrap.lo -MD -MP -MF ../../bindings/python/$(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Tpo -c -o ../../bindings/python/_clinkgrammar_la-lg_python_wrap.lo `test -f '../../bindings/python/lg_python_wrap.cc' || echo '$(srcdir)/'`../../bindings/python/lg_python_wrap.cc -@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) ../../bindings/python/$(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Tpo ../../bindings/python/$(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Plo -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='../../bindings/python/lg_python_wrap.cc' object='../../bindings/python/_clinkgrammar_la-lg_python_wrap.lo' libtool=yes @AMDEPBACKSLASH@ +_clinkgrammar_la-lg_python_wrap.lo: lg_python_wrap.cc +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(_clinkgrammar_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT _clinkgrammar_la-lg_python_wrap.lo -MD -MP -MF $(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Tpo -c -o _clinkgrammar_la-lg_python_wrap.lo `test -f 'lg_python_wrap.cc' || echo '$(srcdir)/'`lg_python_wrap.cc +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Tpo $(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='lg_python_wrap.cc' object='_clinkgrammar_la-lg_python_wrap.lo' libtool=yes @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(_clinkgrammar_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o 
../../bindings/python/_clinkgrammar_la-lg_python_wrap.lo `test -f '../../bindings/python/lg_python_wrap.cc' || echo '$(srcdir)/'`../../bindings/python/lg_python_wrap.cc +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(_clinkgrammar_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o _clinkgrammar_la-lg_python_wrap.lo `test -f 'lg_python_wrap.cc' || echo '$(srcdir)/'`lg_python_wrap.cc mostlyclean-libtool: -rm -f *.lo clean-libtool: -rm -rf .libs _libs - -rm -rf ../../bindings/python/.libs ../../bindings/python/_libs install-pkgpythonPYTHON: $(pkgpython_PYTHON) @$(NORMAL_INSTALL) @list='$(pkgpython_PYTHON)'; dlist=; list2=; test -n "$(pkgpythondir)" || list=; \ @@ -753,8 +744,6 @@ distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) - -rm -f ../../bindings/python/$(DEPDIR)/$(am__dirstamp) - -rm -f ../../bindings/python/$(am__dirstamp) -test -z "$(DISTCLEANFILES)" || rm -f $(DISTCLEANFILES) maintainer-clean-generic: @@ -766,7 +755,7 @@ mostlyclean-am distclean: distclean-am - -rm -rf ../../bindings/python/$(DEPDIR) + -rm -rf ./$(DEPDIR) -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-tags @@ -812,7 +801,7 @@ installcheck-am: maintainer-clean: maintainer-clean-am - -rm -rf ../../bindings/python/$(DEPDIR) + -rm -rf ./$(DEPDIR) -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic \ maintainer-clean-local @@ -861,15 +850,15 @@ # These are packaged in the tarball; make clean should not remove them. maintainer-clean-local: - -rm -f $(BUILT_C_SOURCES) - -rm -f $(BUILT_PY_SOURCES) + -rm -f $(built_c_sources) + -rm -f $(built_py_sources) # Swig builds these .... 
-@HAVE_SWIG_TRUE@$(BUILT_C_SOURCES) $(BUILT_PY_SOURCES): $(SWIG_INCLUDES) -@HAVE_SWIG_TRUE@$(BUILT_C_SOURCES) $(BUILT_PY_SOURCES): $(SWIG_SOURCES) +@HAVE_SWIG_TRUE@$(built_c_sources) $(built_py_sources): $(SWIG_INCLUDES) +@HAVE_SWIG_TRUE@$(built_c_sources) $(built_py_sources): $(SWIG_SOURCES) @HAVE_SWIG_TRUE@ $(SWIG) -python -module clinkgrammar -I$(top_srcdir)/link-grammar -o $@ $< -@HAVE_SWIG_FALSE@$(BUILT_C_SOURCES) $(BUILT_PY_SOURCES): -@HAVE_SWIG_FALSE@ touch $(BUILT_C_SOURCES) $(BUILT_PY_SOURCES) +@HAVE_SWIG_FALSE@$(built_c_sources) $(built_py_sources): +@HAVE_SWIG_FALSE@ touch $(built_c_sources) $(built_py_sources) # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. diff -Nru link-grammar-5.3.16/bindings/python/README.md link-grammar-5.5.0/bindings/python/README.md --- link-grammar-5.3.16/bindings/python/README.md 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/bindings/python/README.md 2018-04-22 14:57:33.000000000 +0000 @@ -1,12 +1,9 @@ # Python2 bindings for Link Grammar -Description -=========== This directory contains a Python 2.7 interface to the Link Grammar -C library. It is also used to generate the Python 3.4 bindings, in -the `python3` directory. - +C library. Some of the files here are also used to generate the +Python 3 bindings, in the `python3` directory. Testing ======= -See the python-examples directory for unit tests and example usage. +See the `python-examples` directory for unit tests and example usage. 
diff -Nru link-grammar-5.3.16/bindings/python3/clinkgrammar.py link-grammar-5.5.0/bindings/python3/clinkgrammar.py --- link-grammar-5.3.16/bindings/python3/clinkgrammar.py 2017-01-19 18:45:21.000000000 +0000 +++ link-grammar-5.5.0/bindings/python3/clinkgrammar.py 2018-04-26 02:53:24.000000000 +0000 @@ -148,6 +148,10 @@ return _clinkgrammar.linkgrammar_get_version() linkgrammar_get_version = _clinkgrammar.linkgrammar_get_version +def linkgrammar_get_configuration() -> "char const *": + return _clinkgrammar.linkgrammar_get_configuration() +linkgrammar_get_configuration = _clinkgrammar.linkgrammar_get_configuration + def linkgrammar_get_dict_version(dict: 'Dictionary') -> "char const *": return _clinkgrammar.linkgrammar_get_dict_version(dict) linkgrammar_get_dict_version = _clinkgrammar.linkgrammar_get_dict_version @@ -448,6 +452,22 @@ return _clinkgrammar.linkage_get_word(linkage, w) linkage_get_word = _clinkgrammar.linkage_get_word +def linkage_get_word_byte_start(linkage: 'Linkage', index: 'int') -> "int": + return _clinkgrammar.linkage_get_word_byte_start(linkage, index) +linkage_get_word_byte_start = _clinkgrammar.linkage_get_word_byte_start + +def linkage_get_word_byte_end(linkage: 'Linkage', index: 'int') -> "int": + return _clinkgrammar.linkage_get_word_byte_end(linkage, index) +linkage_get_word_byte_end = _clinkgrammar.linkage_get_word_byte_end + +def linkage_get_word_char_start(linkage: 'Linkage', index: 'int') -> "int": + return _clinkgrammar.linkage_get_word_char_start(linkage, index) +linkage_get_word_char_start = _clinkgrammar.linkage_get_word_char_start + +def linkage_get_word_char_end(linkage: 'Linkage', index: 'int') -> "int": + return _clinkgrammar.linkage_get_word_char_end(linkage, index) +linkage_get_word_char_end = _clinkgrammar.linkage_get_word_char_end + def linkage_unused_word_cost(linkage: 'Linkage') -> "int": return _clinkgrammar.linkage_unused_word_cost(linkage) linkage_unused_word_cost = _clinkgrammar.linkage_unused_word_cost @@ -476,12 
+496,16 @@ return _clinkgrammar.lg_error_clearall() lg_error_clearall = _clinkgrammar.lg_error_clearall -def _prt_error(arg1: 'char const *') -> "int": - return _clinkgrammar._prt_error(arg1) +def _prt_error(arg1: 'char const *', arg2: 'char const *') -> "int": + return _clinkgrammar._prt_error(arg1, arg2) _prt_error = _clinkgrammar._prt_error -def _py_error_default_handler(lge: 'lg_errinfo', pedh_data: 'int *') -> "void": - return _clinkgrammar._py_error_default_handler(lge, pedh_data) +def lg_error_flush() -> "int": + return _clinkgrammar.lg_error_flush() +lg_error_flush = _clinkgrammar.lg_error_flush + +def _py_error_default_handler(eh_lge: 'lg_errinfo', pedh_data: 'int *') -> "void": + return _clinkgrammar._py_error_default_handler(eh_lge, pedh_data) _py_error_default_handler = _clinkgrammar._py_error_default_handler def _py_error_set_handler(func_and_data: 'PyObject *') -> "PyObject *": diff -Nru link-grammar-5.3.16/bindings/python3/__init__.py link-grammar-5.5.0/bindings/python3/__init__.py --- link-grammar-5.3.16/bindings/python3/__init__.py 2017-04-15 21:51:08.000000000 +0000 +++ link-grammar-5.5.0/bindings/python3/__init__.py 2018-04-29 21:40:31.000000000 +0000 @@ -1,6 +1,6 @@ # # __init__.py file for Link Grammar Python bindings # -__version__ = "5.3.16" +__version__ = "5.5.0" from .linkgrammar import * diff -Nru link-grammar-5.3.16/bindings/python3/lg_python_wrap.cc link-grammar-5.5.0/bindings/python3/lg_python_wrap.cc --- link-grammar-5.3.16/bindings/python3/lg_python_wrap.cc 2017-01-19 18:45:21.000000000 +0000 +++ link-grammar-5.5.0/bindings/python3/lg_python_wrap.cc 2018-04-26 02:53:24.000000000 +0000 @@ -15,6 +15,30 @@ #define SWIG_PYTHON_DIRECTOR_NO_VTABLE + +#ifdef __cplusplus +/* SwigValueWrapper is described in swig.swg */ +template class SwigValueWrapper { + struct SwigMovePointer { + T *ptr; + SwigMovePointer(T *p) : ptr(p) { } + ~SwigMovePointer() { delete ptr; } + SwigMovePointer& operator=(SwigMovePointer& rhs) { T* oldptr = ptr; ptr = 0; 
delete oldptr; ptr = rhs.ptr; rhs.ptr = 0; return *this; } + } pointer; + SwigValueWrapper& operator=(const SwigValueWrapper& rhs); + SwigValueWrapper(const SwigValueWrapper& rhs); +public: + SwigValueWrapper() : pointer(0) { } + SwigValueWrapper& operator=(const T& t) { SwigMovePointer tmp(new T(t)); pointer = tmp; return *this; } + operator T&() const { return *pointer.ptr; } + T *operator&() { return pointer.ptr; } +}; + +template T SwigValueInit() { + return T(); +} +#endif + /* ----------------------------------------------------------------------------- * This section contains generic SWIG labels for method/variable * declarations/attributes, and other compiler dependent labels. @@ -3021,8 +3045,81 @@ #define SWIG_VERSION SWIGVERSION -#define SWIG_as_voidptr(a) (void *)((const void *)(a)) -#define SWIG_as_voidptrptr(a) ((void)SWIG_as_voidptr(*a),(void**)(a)) +#define SWIG_as_voidptr(a) const_cast< void * >(static_cast< const void * >(a)) +#define SWIG_as_voidptrptr(a) ((void)SWIG_as_voidptr(*a),reinterpret_cast< void** >(a)) + + +#include + + +namespace swig { + class SwigPtr_PyObject { + protected: + PyObject *_obj; + + public: + SwigPtr_PyObject() :_obj(0) + { + } + + SwigPtr_PyObject(const SwigPtr_PyObject& item) : _obj(item._obj) + { + SWIG_PYTHON_THREAD_BEGIN_BLOCK; + Py_XINCREF(_obj); + SWIG_PYTHON_THREAD_END_BLOCK; + } + + SwigPtr_PyObject(PyObject *obj, bool initial_ref = true) :_obj(obj) + { + if (initial_ref) { + SWIG_PYTHON_THREAD_BEGIN_BLOCK; + Py_XINCREF(_obj); + SWIG_PYTHON_THREAD_END_BLOCK; + } + } + + SwigPtr_PyObject & operator=(const SwigPtr_PyObject& item) + { + SWIG_PYTHON_THREAD_BEGIN_BLOCK; + Py_XINCREF(item._obj); + Py_XDECREF(_obj); + _obj = item._obj; + SWIG_PYTHON_THREAD_END_BLOCK; + return *this; + } + + ~SwigPtr_PyObject() + { + SWIG_PYTHON_THREAD_BEGIN_BLOCK; + Py_XDECREF(_obj); + SWIG_PYTHON_THREAD_END_BLOCK; + } + + operator PyObject *() const + { + return _obj; + } + + PyObject *operator->() const + { + return _obj; + } + }; +} 
+ + +namespace swig { + struct SwigVar_PyObject : SwigPtr_PyObject { + SwigVar_PyObject(PyObject* obj = 0) : SwigPtr_PyObject(obj, false) { } + + SwigVar_PyObject & operator = (PyObject* obj) + { + Py_XDECREF(_obj); + _obj = obj; + return *this; + } + }; +} @@ -3057,20 +3154,20 @@ if (size > INT_MAX) { swig_type_info* pchar_descriptor = SWIG_pchar_descriptor(); return pchar_descriptor ? - SWIG_InternalNewPointerObj((char *)(carray), pchar_descriptor, 0) : SWIG_Py_Void(); + SWIG_InternalNewPointerObj(const_cast< char * >(carray), pchar_descriptor, 0) : SWIG_Py_Void(); } else { #if PY_VERSION_HEX >= 0x03000000 #if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) - return PyBytes_FromStringAndSize(carray, (Py_ssize_t)(size)); + return PyBytes_FromStringAndSize(carray, static_cast< Py_ssize_t >(size)); #else #if PY_VERSION_HEX >= 0x03010000 - return PyUnicode_DecodeUTF8(carray, (Py_ssize_t)(size), "surrogateescape"); + return PyUnicode_DecodeUTF8(carray, static_cast< Py_ssize_t >(size), "surrogateescape"); #else - return PyUnicode_FromStringAndSize(carray, (Py_ssize_t)(size)); + return PyUnicode_FromStringAndSize(carray, static_cast< Py_ssize_t >(size)); #endif #endif #else - return PyString_FromStringAndSize(carray, (Py_ssize_t)(size)); + return PyString_FromStringAndSize(carray, static_cast< Py_ssize_t >(size)); #endif } } else { @@ -3133,7 +3230,7 @@ if (*alloc == SWIG_NEWOBJ) #endif { - *cptr = (char *)memcpy((char *)malloc((len + 1)*sizeof(char)), cstr, sizeof(char)*(len + 1)); + *cptr = reinterpret_cast< char* >(memcpy((new char[len + 1]), cstr, sizeof(char)*(len + 1))); *alloc = SWIG_NEWOBJ; } else { *cptr = cstr; @@ -3171,7 +3268,7 @@ if (PyString_AsStringAndSize(obj, &cstr, &len) != -1) { if (cptr) { if (alloc) *alloc = SWIG_NEWOBJ; - *cptr = (char *)memcpy((char *)malloc((len + 1)*sizeof(char)), cstr, sizeof(char)*(len + 1)); + *cptr = reinterpret_cast< char* >(memcpy((new char[len + 1]), cstr, sizeof(char)*(len + 1))); } if (psize) *psize = len + 1; @@ -3346,7 +3443,7 
@@ if ((v < INT_MIN || v > INT_MAX)) { return SWIG_OverflowError; } else { - if (val) *val = (int)(v); + if (val) *val = static_cast< int >(v); } } return res; @@ -3457,12 +3554,12 @@ #endif unsigned long v; res = SWIG_AsVal_unsigned_SS_long (obj, val ? &v : 0); - if (SWIG_IsOK(res) && val) *val = (size_t)(v); + if (SWIG_IsOK(res) && val) *val = static_cast< size_t >(v); #ifdef SWIG_LONG_LONG_AVAILABLE } else if (sizeof(size_t) <= sizeof(unsigned long long)) { unsigned long long v; res = SWIG_AsVal_unsigned_SS_long_SS_long (obj, val ? &v : 0); - if (SWIG_IsOK(res) && val) *val = (size_t)(v); + if (SWIG_IsOK(res) && val) *val = static_cast< size_t >(v); } #endif return res; @@ -3508,9 +3605,9 @@ } -void _py_error_default_handler(lg_errinfo *lge, int *pedh_data) +void _py_error_default_handler(lg_errinfo *eh_lge, int *pedh_data) { - default_error_handler(lge, (void *)pedh_data); + default_error_handler(eh_lge, (void *)pedh_data); } /** @@ -3568,6 +3665,7 @@ } void delete_lg_errinfo(lg_errinfo *lge) { + if (NULL == lge) return; /* Was None - nothing to free. 
*/ free((void *)lge->severity_label); free((void *)lge->text); free((void *)lge); @@ -3589,9 +3687,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "lg_errinfo_severity_get" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo *)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); result = (lg_error_severity) ((arg1)->severity); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -3611,7 +3709,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "lg_errinfo_severity_label_get" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo *)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); result = (char *) ((arg1)->severity_label); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -3633,7 +3731,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "lg_errinfo_text_get" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo *)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); result = (char *) ((arg1)->text); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -3647,7 +3745,7 @@ lg_errinfo *result = 0 ; if (!PyArg_ParseTuple(args,(char *)":new_lg_errinfo")) SWIG_fail; - result = (lg_errinfo *)calloc(1, sizeof(lg_errinfo)); + result = (lg_errinfo *)new lg_errinfo(); resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_lg_errinfo, SWIG_POINTER_NEW | 0 ); return resultobj; fail: @@ -3675,6 +3773,19 @@ } +SWIGINTERN PyObject *_wrap_linkgrammar_get_configuration(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + char *result = 0 ; + + if (!PyArg_ParseTuple(args,(char *)":linkgrammar_get_configuration")) SWIG_fail; + result = (char *)linkgrammar_get_configuration(); + resultobj = SWIG_FromCharPtr((const char *)result); + return 
resultobj; +fail: + return NULL; +} + + SWIGINTERN PyObject *_wrap_linkgrammar_get_dict_version(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; Dictionary arg1 = (Dictionary) 0 ; @@ -3688,7 +3799,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkgrammar_get_dict_version" "', argument " "1"" of type '" "Dictionary""'"); } - arg1 = (Dictionary)(argp1); + arg1 = reinterpret_cast< Dictionary >(argp1); result = (char *)linkgrammar_get_dict_version(arg1); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -3710,7 +3821,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkgrammar_get_dict_locale" "', argument " "1"" of type '" "Dictionary""'"); } - arg1 = (Dictionary)(argp1); + arg1 = reinterpret_cast< Dictionary >(argp1); result = (char *)linkgrammar_get_dict_locale(arg1); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -3733,13 +3844,13 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_create_lang" "', argument " "1"" of type '" "char const *""'"); } - arg1 = (char *)(buf1); + arg1 = reinterpret_cast< char * >(buf1); result = (Dictionary)dictionary_create_lang((char const *)arg1); resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Dictionary_s, 0 | 0 ); - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; return resultobj; fail: - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; return NULL; } @@ -3770,7 +3881,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_get_lang" "', argument " "1"" of type '" "Dictionary""'"); } - arg1 = (Dictionary)(argp1); + arg1 = reinterpret_cast< Dictionary >(argp1); result = (char *)dictionary_get_lang(arg1); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -3791,7 +3902,7 @@ if 
(!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_delete" "', argument " "1"" of type '" "Dictionary""'"); } - arg1 = (Dictionary)(argp1); + arg1 = reinterpret_cast< Dictionary >(argp1); dictionary_delete(arg1); resultobj = SWIG_Py_Void(); return resultobj; @@ -3813,13 +3924,13 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "dictionary_set_data_dir" "', argument " "1"" of type '" "char const *""'"); } - arg1 = (char *)(buf1); + arg1 = reinterpret_cast< char * >(buf1); dictionary_set_data_dir((char const *)arg1); resultobj = SWIG_Py_Void(); - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; return resultobj; fail: - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; return NULL; } @@ -3831,7 +3942,7 @@ if (!PyArg_ParseTuple(args,(char *)":dictionary_get_data_dir")) SWIG_fail; result = (char *)dictionary_get_data_dir(); resultobj = SWIG_FromCharPtr((const char *)result); - free((char*)result); + delete[] result; return resultobj; fail: return NULL; @@ -3864,9 +3975,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_delete" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_delete(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -3889,12 +4000,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_verbosity" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_verbosity" "', argument " "2"" 
of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_verbosity(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -3916,9 +4027,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_verbosity" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_verbosity(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -3941,12 +4052,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_linkage_limit" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_linkage_limit" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_linkage_limit(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -3968,9 +4079,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_linkage_limit" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_linkage_limit(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -3993,12 +4104,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_disjunct_cost" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = 
SWIG_AsVal_double(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_disjunct_cost" "', argument " "2"" of type '" "double""'"); } - arg2 = (double)(val2); + arg2 = static_cast< double >(val2); parse_options_set_disjunct_cost(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4020,9 +4131,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_disjunct_cost" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (double)parse_options_get_disjunct_cost(arg1); - resultobj = SWIG_From_double((double)(result)); + resultobj = SWIG_From_double(static_cast< double >(result)); return resultobj; fail: return NULL; @@ -4045,12 +4156,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_min_null_count" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_min_null_count" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_min_null_count(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4072,9 +4183,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_min_null_count" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_min_null_count(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4097,12 +4208,12 @@ if (!SWIG_IsOK(res1)) { 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_max_null_count" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_max_null_count" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_max_null_count(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4124,9 +4235,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_max_null_count" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_max_null_count(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4149,12 +4260,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_islands_ok" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_islands_ok" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_islands_ok(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4176,9 +4287,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_islands_ok" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_islands_ok(arg1); - resultobj = 
SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4201,12 +4312,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_short_length" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_short_length" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_short_length(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4228,9 +4339,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_short_length" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_short_length(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4253,12 +4364,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_max_memory" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_max_memory" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_max_memory(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4280,9 +4391,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_max_memory" "', argument " "1"" of type '" "Parse_Options""'"); 
} - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_max_memory(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4305,12 +4416,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_max_parse_time" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_max_parse_time" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_max_parse_time(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4332,9 +4443,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_max_parse_time" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_max_parse_time(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4357,12 +4468,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_cost_model_type" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_cost_model_type" "', argument " "2"" of type '" "Cost_Model_type""'"); } - arg2 = (Cost_Model_type)(val2); + arg2 = static_cast< Cost_Model_type >(val2); parse_options_set_cost_model_type(arg1,arg2); resultobj = 
SWIG_Py_Void(); return resultobj; @@ -4384,9 +4495,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_cost_model_type" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (Cost_Model_type)parse_options_get_cost_model_type(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4406,9 +4517,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_timer_expired" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_timer_expired(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4428,9 +4539,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_memory_exhausted" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_memory_exhausted(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4450,9 +4561,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_resources_exhausted" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_resources_exhausted(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4475,12 +4586,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method 
'" "parse_options_set_display_morphology" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_display_morphology" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_display_morphology(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4502,9 +4613,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_display_morphology" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_display_morphology(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4527,12 +4638,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_spell_guess" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_spell_guess" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_spell_guess(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4554,9 +4665,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_spell_guess" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_spell_guess(arg1); - resultobj = SWIG_From_int((int)(result)); + 
resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4579,12 +4690,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_all_short_connectors" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_all_short_connectors" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_all_short_connectors(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4606,9 +4717,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_all_short_connectors" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_all_short_connectors(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4627,7 +4738,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_reset_resources" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); parse_options_reset_resources(arg1); resultobj = SWIG_Py_Void(); return resultobj; @@ -4652,12 +4763,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_set_use_sat_parser" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "parse_options_set_use_sat_parser" 
"', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); parse_options_set_use_sat_parser(arg1,arg2); resultobj = SWIG_Py_Void(); return resultobj; @@ -4679,9 +4790,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "parse_options_get_use_sat_parser" "', argument " "1"" of type '" "Parse_Options""'"); } - arg1 = (Parse_Options)(argp1); + arg1 = reinterpret_cast< Parse_Options >(argp1); result = (int)parse_options_get_use_sat_parser(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4706,18 +4817,18 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_create" "', argument " "1"" of type '" "char const *""'"); } - arg1 = (char *)(buf1); + arg1 = reinterpret_cast< char * >(buf1); res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_Dictionary_s, 0 | 0 ); if (!SWIG_IsOK(res2)) { SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "sentence_create" "', argument " "2"" of type '" "Dictionary""'"); } - arg2 = (Dictionary)(argp2); + arg2 = reinterpret_cast< Dictionary >(argp2); result = (Sentence)sentence_create((char const *)arg1,arg2); resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Sentence_s, 0 | 0 ); - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; return resultobj; fail: - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; return NULL; } @@ -4734,7 +4845,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_delete" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); sentence_delete(arg1); resultobj = SWIG_Py_Void(); return resultobj; @@ -4760,14 +4871,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_split" 
"', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); if (!SWIG_IsOK(res2)) { SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "sentence_split" "', argument " "2"" of type '" "Parse_Options""'"); } - arg2 = (Parse_Options)(argp2); + arg2 = reinterpret_cast< Parse_Options >(argp2); result = (int)sentence_split(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4791,14 +4902,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_parse" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); if (!SWIG_IsOK(res2)) { SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "sentence_parse" "', argument " "2"" of type '" "Parse_Options""'"); } - arg2 = (Parse_Options)(argp2); + arg2 = reinterpret_cast< Parse_Options >(argp2); result = (int)sentence_parse(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4818,9 +4929,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_length" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = (int)sentence_length(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4840,9 +4951,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_null_count" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence 
>(argp1); result = (int)sentence_null_count(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4862,9 +4973,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_linkages_found" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = (int)sentence_num_linkages_found(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4884,9 +4995,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_valid_linkages" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = (int)sentence_num_valid_linkages(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4906,9 +5017,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_linkages_post_processed" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); result = (int)sentence_num_linkages_post_processed(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4932,14 +5043,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_num_violations" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "sentence_num_violations" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + 
arg2 = static_cast< int >(val2); result = (int)sentence_num_violations(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -4963,14 +5074,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_disjunct_cost" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "sentence_disjunct_cost" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (double)sentence_disjunct_cost(arg1,arg2); - resultobj = SWIG_From_double((double)(result)); + resultobj = SWIG_From_double(static_cast< double >(result)); return resultobj; fail: return NULL; @@ -4994,14 +5105,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "sentence_link_cost" "', argument " "1"" of type '" "Sentence""'"); } - arg1 = (Sentence)(argp1); + arg1 = reinterpret_cast< Sentence >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "sentence_link_cost" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)sentence_link_cost(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5029,17 +5140,17 @@ if (!SWIG_IsOK(ecode1)) { SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "linkage_create" "', argument " "1"" of type '" "int""'"); } - arg1 = (int)(val1); + arg1 = static_cast< int >(val1); res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_Sentence_s, 0 | 0 ); if (!SWIG_IsOK(res2)) { SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "linkage_create" "', argument " 
"2"" of type '" "Sentence""'"); } - arg2 = (Sentence)(argp2); + arg2 = reinterpret_cast< Sentence >(argp2); res3 = SWIG_ConvertPtr(obj2, &argp3,SWIGTYPE_p_Parse_Options_s, 0 | 0 ); if (!SWIG_IsOK(res3)) { SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "linkage_create" "', argument " "3"" of type '" "Parse_Options""'"); } - arg3 = (Parse_Options)(argp3); + arg3 = reinterpret_cast< Parse_Options >(argp3); result = (Linkage)linkage_create(arg1,arg2,arg3); resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Linkage_s, 0 | 0 ); return resultobj; @@ -5060,7 +5171,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_delete" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); linkage_delete(arg1); resultobj = SWIG_Py_Void(); return resultobj; @@ -5090,17 +5201,17 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_diagram" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_print_diagram" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); ecode3 = SWIG_AsVal_size_t(obj2, &val3); if (!SWIG_IsOK(ecode3)) { SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "linkage_print_diagram" "', argument " "3"" of type '" "size_t""'"); } - arg3 = (size_t)(val3); + arg3 = static_cast< size_t >(val3); result = (char *)linkage_print_diagram(arg1,arg2,arg3); resultobj = SWIG_FromCharPtr((const char *)result); { @@ -5133,17 +5244,17 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_postscript" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = 
SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_print_postscript" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); ecode3 = SWIG_AsVal_int(obj2, &val3); if (!SWIG_IsOK(ecode3)) { SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "linkage_print_postscript" "', argument " "3"" of type '" "int""'"); } - arg3 = (int)(val3); + arg3 = static_cast< int >(val3); result = (char *)linkage_print_postscript(arg1,arg2,arg3); resultobj = SWIG_FromCharPtr((const char *)result); { @@ -5168,7 +5279,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_links_and_domains" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_print_links_and_domains(arg1); resultobj = SWIG_FromCharPtr((const char *)result); { @@ -5193,7 +5304,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_senses" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_print_senses(arg1); resultobj = SWIG_FromCharPtr((const char *)result); { @@ -5222,12 +5333,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_constituent_tree" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_print_constituent_tree" "', argument " "2"" of type '" "ConstituentDisplayStyle""'"); } - arg2 = (ConstituentDisplayStyle)(val2); + arg2 = static_cast< ConstituentDisplayStyle >(val2); result = (char *)linkage_print_constituent_tree(arg1,arg2); resultobj = SWIG_FromCharPtr((const char *)result); { @@ 
-5252,7 +5363,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_disjuncts" "', argument " "1"" of type '" "Linkage const""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_print_disjuncts(arg1); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5274,7 +5385,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_print_pp_msgs" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_print_pp_msgs(arg1); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5296,9 +5407,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_num_words" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (int)linkage_get_num_words(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5318,9 +5429,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_num_links" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (int)linkage_get_num_links(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5344,14 +5455,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_lword" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_lword" "', argument " "2"" of 
type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)linkage_get_link_lword(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5375,14 +5486,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_rword" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_rword" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)linkage_get_link_rword(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5406,14 +5517,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_length" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_length" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (int)linkage_get_link_length(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5437,12 +5548,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_label" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method 
'" "linkage_get_link_label" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char *)linkage_get_link_label(arg1,arg2); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5468,12 +5579,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_llabel" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_llabel" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char *)linkage_get_link_llabel(arg1,arg2); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5499,12 +5610,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_rlabel" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_rlabel" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char *)linkage_get_link_rlabel(arg1,arg2); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5530,14 +5641,14 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_num_domains" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_num_domains" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); 
result = (int)linkage_get_link_num_domains(arg1,arg2); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5561,12 +5672,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_link_domain_names" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_link_domain_names" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char **)linkage_get_link_domain_names(arg1,arg2); resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_p_char, 0 | 0 ); return resultobj; @@ -5588,7 +5699,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_words" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char **)linkage_get_words(arg1); resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_p_char, 0 | 0 ); return resultobj; @@ -5614,12 +5725,12 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word" "', argument " "2"" of type '" "int""'"); } - arg2 = (int)(val2); + arg2 = static_cast< int >(val2); result = (char *)linkage_get_word(arg1,arg2); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5628,6 +5739,130 @@ } +SWIGINTERN PyObject *_wrap_linkage_get_word_byte_start(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject 
*resultobj = 0; + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + PyObject * obj0 = 0 ; + PyObject * obj1 = 0 ; + int result; + + if (!PyArg_ParseTuple(args,(char *)"OO:linkage_get_word_byte_start",&obj0,&obj1)) SWIG_fail; + res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_byte_start" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int(obj1, &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_byte_start" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_byte_start(arg1,arg2); + resultobj = SWIG_From_int(static_cast< int >(result)); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_linkage_get_word_byte_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + PyObject * obj0 = 0 ; + PyObject * obj1 = 0 ; + int result; + + if (!PyArg_ParseTuple(args,(char *)"OO:linkage_get_word_byte_end",&obj0,&obj1)) SWIG_fail; + res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_byte_end" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int(obj1, &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_byte_end" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_byte_end(arg1,arg2); + resultobj = SWIG_From_int(static_cast< int >(result)); + return 
resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_linkage_get_word_char_start(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + PyObject * obj0 = 0 ; + PyObject * obj1 = 0 ; + int result; + + if (!PyArg_ParseTuple(args,(char *)"OO:linkage_get_word_char_start",&obj0,&obj1)) SWIG_fail; + res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_char_start" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int(obj1, &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_char_start" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_char_start(arg1,arg2); + resultobj = SWIG_From_int(static_cast< int >(result)); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_linkage_get_word_char_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + Linkage arg1 = (Linkage) 0 ; + int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; + PyObject * obj0 = 0 ; + PyObject * obj1 = 0 ; + int result; + + if (!PyArg_ParseTuple(args,(char *)"OO:linkage_get_word_char_end",&obj0,&obj1)) SWIG_fail; + res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Linkage_s, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_word_char_end" "', argument " "1"" of type '" "Linkage""'"); + } + arg1 = reinterpret_cast< Linkage >(argp1); + ecode2 = SWIG_AsVal_int(obj1, &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "linkage_get_word_char_end" "', argument " "2"" of type '" "int""'"); + } 
+ arg2 = static_cast< int >(val2); + result = (int)linkage_get_word_char_end(arg1,arg2); + resultobj = SWIG_From_int(static_cast< int >(result)); + return resultobj; +fail: + return NULL; +} + + SWIGINTERN PyObject *_wrap_linkage_unused_word_cost(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; Linkage arg1 = (Linkage) 0 ; @@ -5641,9 +5876,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_unused_word_cost" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (int)linkage_unused_word_cost(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5663,9 +5898,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_disjunct_cost" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (double)linkage_disjunct_cost(arg1); - resultobj = SWIG_From_double((double)(result)); + resultobj = SWIG_From_double(static_cast< double >(result)); return resultobj; fail: return NULL; @@ -5685,9 +5920,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_link_cost" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (int)linkage_link_cost(arg1); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; @@ -5707,9 +5942,9 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_corpus_cost" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (double)linkage_corpus_cost(arg1); - resultobj = SWIG_From_double((double)(result)); + resultobj = 
SWIG_From_double(static_cast< double >(result)); return resultobj; fail: return NULL; @@ -5729,7 +5964,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "linkage_get_violation_name" "', argument " "1"" of type '" "Linkage""'"); } - arg1 = (Linkage)(argp1); + arg1 = reinterpret_cast< Linkage >(argp1); result = (char *)linkage_get_violation_name(arg1); resultobj = SWIG_FromCharPtr((const char *)result); return resultobj; @@ -5751,7 +5986,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "_lg_error_formatmsg" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo *)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); result = (char *)lg_error_formatmsg(arg1); resultobj = SWIG_FromCharPtr((const char *)result); { @@ -5769,50 +6004,60 @@ if (!PyArg_ParseTuple(args,(char *)":lg_error_clearall")) SWIG_fail; result = (int)lg_error_clearall(); - resultobj = SWIG_From_int((int)(result)); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap__prt_error__varargs__(PyObject *SWIGUNUSEDPARM(self), PyObject *args, PyObject *varargs) { +SWIGINTERN PyObject *_wrap__prt_error(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; char *arg1 = (char *) 0 ; - void *arg2 = 0 ; + char *arg2 = (char *) 0 ; int res1 ; char *buf1 = 0 ; int alloc1 = 0 ; + int res2 ; + char *buf2 = 0 ; + int alloc2 = 0 ; PyObject * obj0 = 0 ; + PyObject * obj1 = 0 ; int result; - if (!PyArg_ParseTuple(args,(char *)"O:_prt_error",&obj0)) SWIG_fail; + if (!PyArg_ParseTuple(args,(char *)"OO:_prt_error",&obj0,&obj1)) SWIG_fail; res1 = SWIG_AsCharPtrAndSize(obj0, &buf1, NULL, &alloc1); if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "_prt_error" "', argument " "1"" of type '" "char const *""'"); } - arg1 = (char *)(buf1); - result = (int)prt_error((char const *)arg1,arg2); - resultobj = 
SWIG_From_int((int)(result)); - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + arg1 = reinterpret_cast< char * >(buf1); + res2 = SWIG_AsCharPtrAndSize(obj1, &buf2, NULL, &alloc2); + if (!SWIG_IsOK(res2)) { + SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "_prt_error" "', argument " "2"" of type '" "char const *""'"); + } + arg2 = reinterpret_cast< char * >(buf2); + result = (int)prt_error((char const *)arg1,(char const *)arg2); + resultobj = SWIG_From_int(static_cast< int >(result)); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + if (alloc2 == SWIG_NEWOBJ) delete[] buf2; return resultobj; fail: - if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); + if (alloc1 == SWIG_NEWOBJ) delete[] buf1; + if (alloc2 == SWIG_NEWOBJ) delete[] buf2; return NULL; } -SWIGINTERN PyObject *_wrap__prt_error(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj; - PyObject *varargs; - PyObject *newargs; +SWIGINTERN PyObject *_wrap_lg_error_flush(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + int result; - newargs = PyTuple_GetSlice(args,0,1); - varargs = PyTuple_GetSlice(args,1,PyTuple_Size(args)); - resultobj = _wrap__prt_error__varargs__(NULL,newargs,varargs); - Py_XDECREF(newargs); - Py_XDECREF(varargs); + if (!PyArg_ParseTuple(args,(char *)":lg_error_flush")) SWIG_fail; + result = (int)lg_error_flush(); + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; +fail: + return NULL; } @@ -5820,19 +6065,24 @@ PyObject *resultobj = 0; lg_errinfo *arg1 = (lg_errinfo *) 0 ; int *arg2 = (int *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; + int arg20 ; PyObject * obj0 = 0 ; PyObject * obj1 = 0 ; if (!PyArg_ParseTuple(args,(char *)"OO:_py_error_default_handler",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_lg_errinfo, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "_py_error_default_handler" "', argument " "1"" of type '" "lg_errinfo *""'"); + { + void 
*argp1 = 0; + + if (Py_None == obj0) + SWIG_exception_fail(SWIG_TypeError, "in method '_py_error_default_handler', argument 1 (of type lg_errinfo *) must not be None."); + + int res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_lg_errinfo, 0); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '_py_error_default_handler', argument 1 of type 'lg_errinfo *'"); + } + arg1 = (lg_errinfo *)(argp1); } - arg1 = (lg_errinfo *)(argp1); { - int arg; int error = false; const char errmsg[] = "The default error handler data argument (arg 2) " "must be an integer (0 to lg_None) or None."; @@ -5850,17 +6100,17 @@ } else { - arg = (int)PyInt_AsLong(obj1); + arg20 = (int)PyInt_AsLong(obj1); } - if ((arg < 0) || (arg > lg_None)) + if ((arg20 < 0) || (arg20 > lg_None)) { SWIG_exception_fail(SWIG_ValueError, errmsg); error = true; } if (error) return NULL; - arg2 = &arg; + arg2 = &arg20; } } _py_error_default_handler(arg1,arg2); @@ -5915,7 +6165,7 @@ if (!SWIG_IsOK(res1)) { SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_lg_errinfo" "', argument " "1"" of type '" "lg_errinfo *""'"); } - arg1 = (lg_errinfo *)(argp1); + arg1 = reinterpret_cast< lg_errinfo * >(argp1); delete_lg_errinfo(arg1); resultobj = SWIG_Py_Void(); return resultobj; @@ -5932,6 +6182,7 @@ { (char *)"new_lg_errinfo", _wrap_new_lg_errinfo, METH_VARARGS, NULL}, { (char *)"lg_errinfo_swigregister", lg_errinfo_swigregister, METH_VARARGS, NULL}, { (char *)"linkgrammar_get_version", _wrap_linkgrammar_get_version, METH_VARARGS, NULL}, + { (char *)"linkgrammar_get_configuration", _wrap_linkgrammar_get_configuration, METH_VARARGS, NULL}, { (char *)"linkgrammar_get_dict_version", _wrap_linkgrammar_get_dict_version, METH_VARARGS, NULL}, { (char *)"linkgrammar_get_dict_locale", _wrap_linkgrammar_get_dict_locale, METH_VARARGS, NULL}, { (char *)"dictionary_create_lang", _wrap_dictionary_create_lang, METH_VARARGS, NULL}, @@ -6007,6 +6258,10 @@ { (char *)"linkage_get_link_domain_names", 
_wrap_linkage_get_link_domain_names, METH_VARARGS, NULL}, { (char *)"linkage_get_words", _wrap_linkage_get_words, METH_VARARGS, NULL}, { (char *)"linkage_get_word", _wrap_linkage_get_word, METH_VARARGS, NULL}, + { (char *)"linkage_get_word_byte_start", _wrap_linkage_get_word_byte_start, METH_VARARGS, NULL}, + { (char *)"linkage_get_word_byte_end", _wrap_linkage_get_word_byte_end, METH_VARARGS, NULL}, + { (char *)"linkage_get_word_char_start", _wrap_linkage_get_word_char_start, METH_VARARGS, NULL}, + { (char *)"linkage_get_word_char_end", _wrap_linkage_get_word_char_end, METH_VARARGS, NULL}, { (char *)"linkage_unused_word_cost", _wrap_linkage_unused_word_cost, METH_VARARGS, NULL}, { (char *)"linkage_disjunct_cost", _wrap_linkage_disjunct_cost, METH_VARARGS, NULL}, { (char *)"linkage_link_cost", _wrap_linkage_link_cost, METH_VARARGS, NULL}, @@ -6015,6 +6270,7 @@ { (char *)"_lg_error_formatmsg", _wrap__lg_error_formatmsg, METH_VARARGS, NULL}, { (char *)"lg_error_clearall", _wrap_lg_error_clearall, METH_VARARGS, NULL}, { (char *)"_prt_error", _wrap__prt_error, METH_VARARGS, NULL}, + { (char *)"lg_error_flush", _wrap_lg_error_flush, METH_VARARGS, NULL}, { (char *)"_py_error_default_handler", _wrap__py_error_default_handler, METH_VARARGS, NULL}, { (char *)"_py_error_set_handler", _wrap__py_error_set_handler, METH_VARARGS, NULL}, { (char *)"_py_error_printall", _wrap__py_error_printall, METH_VARARGS, NULL}, @@ -6027,10 +6283,10 @@ static swig_type_info _swigt__p_ConstituentDisplayStyle = {"_p_ConstituentDisplayStyle", "enum ConstituentDisplayStyle *|ConstituentDisplayStyle *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_Cost_Model_type = {"_p_Cost_Model_type", "enum Cost_Model_type *|Cost_Model_type *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_Dictionary_s = {"_p_Dictionary_s", "struct Dictionary_s *|Dictionary", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_Linkage_s = {"_p_Linkage_s", "struct Linkage_s *|Linkage", 0, 0, (void*)0, 0}; -static 
swig_type_info _swigt__p_Parse_Options_s = {"_p_Parse_Options_s", "struct Parse_Options_s *|Parse_Options", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_Sentence_s = {"_p_Sentence_s", "Sentence|struct Sentence_s *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Dictionary_s = {"_p_Dictionary_s", "Dictionary_s *|Dictionary", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Linkage_s = {"_p_Linkage_s", "Linkage_s *|Linkage", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Parse_Options_s = {"_p_Parse_Options_s", "Parse_Options_s *|Parse_Options", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_Sentence_s = {"_p_Sentence_s", "Sentence|Sentence_s *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_char = {"_p_char", "char *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_int = {"_p_int", "int *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_lg_errinfo = {"_p_lg_errinfo", "lg_errinfo *", 0, 0, (void*)0, 0}; @@ -6773,20 +7029,20 @@ SWIG_InstallConstants(d,swig_const_table); - SWIG_Python_SetConstant(d, "lg_Fatal",SWIG_From_int((int)(lg_Fatal))); - SWIG_Python_SetConstant(d, "lg_Error",SWIG_From_int((int)(lg_Error))); - SWIG_Python_SetConstant(d, "lg_Warn",SWIG_From_int((int)(lg_Warn))); - SWIG_Python_SetConstant(d, "lg_Info",SWIG_From_int((int)(lg_Info))); - SWIG_Python_SetConstant(d, "lg_Debug",SWIG_From_int((int)(lg_Debug))); - SWIG_Python_SetConstant(d, "lg_Trace",SWIG_From_int((int)(lg_Trace))); - SWIG_Python_SetConstant(d, "lg_None",SWIG_From_int((int)(lg_None))); - SWIG_Python_SetConstant(d, "VDAL",SWIG_From_int((int)(VDAL))); - SWIG_Python_SetConstant(d, "CORPUS",SWIG_From_int((int)(CORPUS))); - SWIG_Python_SetConstant(d, "NO_DISPLAY",SWIG_From_int((int)(NO_DISPLAY))); - SWIG_Python_SetConstant(d, "MULTILINE",SWIG_From_int((int)(MULTILINE))); - SWIG_Python_SetConstant(d, "BRACKET_TREE",SWIG_From_int((int)(BRACKET_TREE))); - SWIG_Python_SetConstant(d, "SINGLE_LINE",SWIG_From_int((int)(SINGLE_LINE))); - 
SWIG_Python_SetConstant(d, "MAX_STYLES",SWIG_From_int((int)(MAX_STYLES))); + SWIG_Python_SetConstant(d, "lg_Fatal",SWIG_From_int(static_cast< int >(lg_Fatal))); + SWIG_Python_SetConstant(d, "lg_Error",SWIG_From_int(static_cast< int >(lg_Error))); + SWIG_Python_SetConstant(d, "lg_Warn",SWIG_From_int(static_cast< int >(lg_Warn))); + SWIG_Python_SetConstant(d, "lg_Info",SWIG_From_int(static_cast< int >(lg_Info))); + SWIG_Python_SetConstant(d, "lg_Debug",SWIG_From_int(static_cast< int >(lg_Debug))); + SWIG_Python_SetConstant(d, "lg_Trace",SWIG_From_int(static_cast< int >(lg_Trace))); + SWIG_Python_SetConstant(d, "lg_None",SWIG_From_int(static_cast< int >(lg_None))); + SWIG_Python_SetConstant(d, "VDAL",SWIG_From_int(static_cast< int >(VDAL))); + SWIG_Python_SetConstant(d, "CORPUS",SWIG_From_int(static_cast< int >(CORPUS))); + SWIG_Python_SetConstant(d, "NO_DISPLAY",SWIG_From_int(static_cast< int >(NO_DISPLAY))); + SWIG_Python_SetConstant(d, "MULTILINE",SWIG_From_int(static_cast< int >(MULTILINE))); + SWIG_Python_SetConstant(d, "BRACKET_TREE",SWIG_From_int(static_cast< int >(BRACKET_TREE))); + SWIG_Python_SetConstant(d, "SINGLE_LINE",SWIG_From_int(static_cast< int >(SINGLE_LINE))); + SWIG_Python_SetConstant(d, "MAX_STYLES",SWIG_From_int(static_cast< int >(MAX_STYLES))); #if PY_VERSION_HEX >= 0x03000000 return m; #else diff -Nru link-grammar-5.3.16/bindings/python3/Makefile.am link-grammar-5.5.0/bindings/python3/Makefile.am --- link-grammar-5.3.16/bindings/python3/Makefile.am 2017-02-12 11:47:34.000000000 +0000 +++ link-grammar-5.5.0/bindings/python3/Makefile.am 2018-03-04 14:56:46.000000000 +0000 @@ -8,9 +8,8 @@ SWIG_SOURCES = ../swig/link_grammar.i SWIG_INCLUDES = ../../link-grammar/link-includes.h -# BUILT_C_SOURCES = $(top_builddir)/bindings/python3/lg_python_wrap.cc -BUILT_C_SOURCES = ../../bindings/python3/lg_python_wrap.cc -BUILT_PY_SOURCES = $(top_builddir)/bindings/python3/clinkgrammar.py +built_c_sources = lg_python_wrap.cc +built_py_sources = 
$(top_builddir)/bindings/python3/clinkgrammar.py # Over-ride the install location, so as to remove the dash in the # directory "link-grammar". Turns out python cannot tolerate dashes. @@ -20,8 +19,8 @@ pkgpyexecdir=$(python3dir)/linkgrammar # Files that get installed in $pkgpythondir -pkgpython_PYTHON = \ - ../../bindings/python/linkgrammar.py \ +pkgpython_PYTHON = \ + ../python/linkgrammar.py \ $(top_builddir)/bindings/python3/__init__.py \ $(top_builddir)/bindings/python3/clinkgrammar.py @@ -35,13 +34,13 @@ # The make uninstall target should remove directories we created. uninstall-hook: - -rmdir $(pkgpypathdir)/__pycache__ - -rmdir $(pkgpypathdir) + -rmdir $(pkgpythondir)/__pycache__ + -rmdir $(pkgpythondir) # These are packaged in the tarball; make clean should not remove them. maintainer-clean-local: - -rm -f $(BUILT_C_SOURCES) - -rm -f $(BUILT_PY_SOURCES) + -rm -f $(built_c_sources) + -rm -f $(built_py_sources) # Don't remove __init__.py; it is built by configure! DISTCLEANFILES = \ @@ -49,12 +48,12 @@ if HAVE_SWIG # Swig builds these .... -$(BUILT_C_SOURCES) $(BUILT_PY_SOURCES): $(SWIG_INCLUDES) -$(BUILT_C_SOURCES) $(BUILT_PY_SOURCES): $(SWIG_SOURCES) +$(built_c_sources) $(built_py_sources): $(SWIG_INCLUDES) +$(built_c_sources) $(built_py_sources): $(SWIG_SOURCES) $(SWIG) -python -py3 -module clinkgrammar -I$(top_srcdir)/link-grammar -o $@ $< else -$(BUILT_C_SOURCES) $(BUILT_PY_SOURCES): - touch $(BUILT_C_SOURCES) $(BUILT_PY_SOURCES) +$(built_c_sources) $(built_py_sources): + touch $(built_c_sources) $(built_py_sources) endif # The la MUST have the same name as the pm, @@ -63,7 +62,7 @@ # interfaces. 
pkgpyexec_LTLIBRARIES = _clinkgrammar.la -_clinkgrammar_la_SOURCES = $(BUILT_C_SOURCES) $(SWIG_SOURCES) +_clinkgrammar_la_SOURCES = $(built_c_sources) $(SWIG_SOURCES) # $(top_builddir) to pick up autogen'ed link-grammar/link-features.h _clinkgrammar_la_CPPFLAGS = \ @@ -72,10 +71,11 @@ -I$(top_srcdir) \ -I$(top_builddir) -# On Cygwin, a DLL with version is named name-major.dll with no symlink to -# it of an unversioned name (at least up and including libtool 2.4.6). -# This is bad for Python modules, as they must have an identifier name. -if OS_CYGWIN +# On Cygwin and MinGW, a DLL with version is named name-major.dll with no +# symlink to it of an unversioned name (at least up and including libtool +# 2.4.6). This is bad for Python modules, as they must have an identifier +# name. +if OS_WIN32 AVOID_VERSION = -avoid-version endif _clinkgrammar_la_LDFLAGS = \ diff -Nru link-grammar-5.3.16/bindings/python3/Makefile.in link-grammar-5.5.0/bindings/python3/Makefile.in --- link-grammar-5.3.16/bindings/python3/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/bindings/python3/Makefile.in 2018-04-29 21:40:02.000000000 +0000 @@ -141,9 +141,7 @@ _clinkgrammar_la_DEPENDENCIES = \ $(top_builddir)/link-grammar/liblink-grammar.la \ $(am__DEPENDENCIES_1) -am__dirstamp = $(am__leading_dot)dirstamp -am__objects_1 = \ - ../../bindings/python3/_clinkgrammar_la-lg_python_wrap.lo +am__objects_1 = _clinkgrammar_la-lg_python_wrap.lo am__objects_2 = am__clinkgrammar_la_OBJECTS = $(am__objects_1) $(am__objects_2) _clinkgrammar_la_OBJECTS = $(am__clinkgrammar_la_OBJECTS) @@ -228,6 +226,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -256,6 +255,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -264,13 +264,16 @@ INSTALL_SCRIPT = 
@INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -337,12 +340,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -410,13 +413,12 @@ top_srcdir = @top_srcdir@ SWIG_SOURCES = ../swig/link_grammar.i SWIG_INCLUDES = ../../link-grammar/link-includes.h -# BUILT_C_SOURCES = $(top_builddir)/bindings/python3/lg_python_wrap.cc -BUILT_C_SOURCES = ../../bindings/python3/lg_python_wrap.cc -BUILT_PY_SOURCES = $(top_builddir)/bindings/python3/clinkgrammar.py +built_c_sources = lg_python_wrap.cc +built_py_sources = $(top_builddir)/bindings/python3/clinkgrammar.py # Files that get installed in $pkgpythondir pkgpython_PYTHON = \ - ../../bindings/python/linkgrammar.py \ + ../python/linkgrammar.py \ $(top_builddir)/bindings/python3/__init__.py \ $(top_builddir)/bindings/python3/clinkgrammar.py @@ -439,7 +441,7 @@ # We're calling this clinkgrammar, since its a direct map of the C # interfaces. 
pkgpyexec_LTLIBRARIES = _clinkgrammar.la -_clinkgrammar_la_SOURCES = $(BUILT_C_SOURCES) $(SWIG_SOURCES) +_clinkgrammar_la_SOURCES = $(built_c_sources) $(SWIG_SOURCES) # $(top_builddir) to pick up autogen'ed link-grammar/link-features.h _clinkgrammar_la_CPPFLAGS = \ @@ -449,10 +451,11 @@ -I$(top_builddir) -# On Cygwin, a DLL with version is named name-major.dll with no symlink to -# it of an unversioned name (at least up and including libtool 2.4.6). -# This is bad for Python modules, as they must have an identifier name. -@OS_CYGWIN_TRUE@AVOID_VERSION = -avoid-version +# On Cygwin and MinGW, a DLL with version is named name-major.dll with no +# symlink to it of an unversioned name (at least up and including libtool +# 2.4.6). This is bad for Python modules, as they must have an identifier +# name. +@OS_WIN32_TRUE@AVOID_VERSION = -avoid-version _clinkgrammar_la_LDFLAGS = \ -version-info @VERSION_INFO@ $(AVOID_VERSION) \ $(PYTHON3_LDFLAGS) -module -no-undefined @@ -535,28 +538,17 @@ echo rm -f $${locs}; \ rm -f $${locs}; \ } -../../bindings/python3/$(am__dirstamp): - @$(MKDIR_P) ../../bindings/python3 - @: > ../../bindings/python3/$(am__dirstamp) -../../bindings/python3/$(DEPDIR)/$(am__dirstamp): - @$(MKDIR_P) ../../bindings/python3/$(DEPDIR) - @: > ../../bindings/python3/$(DEPDIR)/$(am__dirstamp) -../../bindings/python3/_clinkgrammar_la-lg_python_wrap.lo: \ - ../../bindings/python3/$(am__dirstamp) \ - ../../bindings/python3/$(DEPDIR)/$(am__dirstamp) _clinkgrammar.la: $(_clinkgrammar_la_OBJECTS) $(_clinkgrammar_la_DEPENDENCIES) $(EXTRA__clinkgrammar_la_DEPENDENCIES) $(AM_V_CXXLD)$(_clinkgrammar_la_LINK) -rpath $(pkgpyexecdir) $(_clinkgrammar_la_OBJECTS) $(_clinkgrammar_la_LIBADD) $(LIBS) mostlyclean-compile: -rm -f *.$(OBJEXT) - -rm -f ../../bindings/python3/*.$(OBJEXT) - -rm -f ../../bindings/python3/*.lo distclean-compile: -rm -f *.tab.c -@AMDEP_TRUE@@am__include@ @am__quote@../../bindings/python3/$(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Plo@am__quote@ 
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Plo@am__quote@ .cc.o: @am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\ @@ -582,19 +574,18 @@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $< -../../bindings/python3/_clinkgrammar_la-lg_python_wrap.lo: ../../bindings/python3/lg_python_wrap.cc -@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(_clinkgrammar_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT ../../bindings/python3/_clinkgrammar_la-lg_python_wrap.lo -MD -MP -MF ../../bindings/python3/$(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Tpo -c -o ../../bindings/python3/_clinkgrammar_la-lg_python_wrap.lo `test -f '../../bindings/python3/lg_python_wrap.cc' || echo '$(srcdir)/'`../../bindings/python3/lg_python_wrap.cc -@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) ../../bindings/python3/$(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Tpo ../../bindings/python3/$(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Plo -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='../../bindings/python3/lg_python_wrap.cc' object='../../bindings/python3/_clinkgrammar_la-lg_python_wrap.lo' libtool=yes @AMDEPBACKSLASH@ +_clinkgrammar_la-lg_python_wrap.lo: lg_python_wrap.cc +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(_clinkgrammar_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT _clinkgrammar_la-lg_python_wrap.lo -MD -MP -MF $(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Tpo -c -o _clinkgrammar_la-lg_python_wrap.lo `test -f 'lg_python_wrap.cc' || echo '$(srcdir)/'`lg_python_wrap.cc +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Tpo 
$(DEPDIR)/_clinkgrammar_la-lg_python_wrap.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='lg_python_wrap.cc' object='_clinkgrammar_la-lg_python_wrap.lo' libtool=yes @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(_clinkgrammar_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ../../bindings/python3/_clinkgrammar_la-lg_python_wrap.lo `test -f '../../bindings/python3/lg_python_wrap.cc' || echo '$(srcdir)/'`../../bindings/python3/lg_python_wrap.cc +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(_clinkgrammar_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o _clinkgrammar_la-lg_python_wrap.lo `test -f 'lg_python_wrap.cc' || echo '$(srcdir)/'`lg_python_wrap.cc mostlyclean-libtool: -rm -f *.lo clean-libtool: -rm -rf .libs _libs - -rm -rf ../../bindings/python3/.libs ../../bindings/python3/_libs install-pkgpypathPYTHON: $(pkgpypath_PYTHON) @$(NORMAL_INSTALL) @list='$(pkgpypath_PYTHON)'; dlist=; list2=; test -n "$(pkgpypathdir)" || list=; \ @@ -807,8 +798,6 @@ distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . 
= "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) - -rm -f ../../bindings/python3/$(DEPDIR)/$(am__dirstamp) - -rm -f ../../bindings/python3/$(am__dirstamp) -test -z "$(DISTCLEANFILES)" || rm -f $(DISTCLEANFILES) maintainer-clean-generic: @@ -820,7 +809,7 @@ mostlyclean-am distclean: distclean-am - -rm -rf ../../bindings/python3/$(DEPDIR) + -rm -rf ./$(DEPDIR) -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-tags @@ -866,7 +855,7 @@ installcheck-am: maintainer-clean: maintainer-clean-am - -rm -rf ../../bindings/python3/$(DEPDIR) + -rm -rf ./$(DEPDIR) -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic \ maintainer-clean-local @@ -912,20 +901,20 @@ # The make uninstall target should remove directories we created. uninstall-hook: - -rmdir $(pkgpypathdir)/__pycache__ - -rmdir $(pkgpypathdir) + -rmdir $(pkgpythondir)/__pycache__ + -rmdir $(pkgpythondir) # These are packaged in the tarball; make clean should not remove them. maintainer-clean-local: - -rm -f $(BUILT_C_SOURCES) - -rm -f $(BUILT_PY_SOURCES) + -rm -f $(built_c_sources) + -rm -f $(built_py_sources) # Swig builds these .... -@HAVE_SWIG_TRUE@$(BUILT_C_SOURCES) $(BUILT_PY_SOURCES): $(SWIG_INCLUDES) -@HAVE_SWIG_TRUE@$(BUILT_C_SOURCES) $(BUILT_PY_SOURCES): $(SWIG_SOURCES) +@HAVE_SWIG_TRUE@$(built_c_sources) $(built_py_sources): $(SWIG_INCLUDES) +@HAVE_SWIG_TRUE@$(built_c_sources) $(built_py_sources): $(SWIG_SOURCES) @HAVE_SWIG_TRUE@ $(SWIG) -python -py3 -module clinkgrammar -I$(top_srcdir)/link-grammar -o $@ $< -@HAVE_SWIG_FALSE@$(BUILT_C_SOURCES) $(BUILT_PY_SOURCES): -@HAVE_SWIG_FALSE@ touch $(BUILT_C_SOURCES) $(BUILT_PY_SOURCES) +@HAVE_SWIG_FALSE@$(built_c_sources) $(built_py_sources): +@HAVE_SWIG_FALSE@ touch $(built_c_sources) $(built_py_sources) # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. 
diff -Nru link-grammar-5.3.16/bindings/python3/README.md link-grammar-5.5.0/bindings/python3/README.md --- link-grammar-5.3.16/bindings/python3/README.md 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/bindings/python3/README.md 2018-04-22 14:57:33.000000000 +0000 @@ -1,12 +1,10 @@ # Python3 bindings for Link Grammar -Description -=========== -This directory contains a Python 3.4 interface to the Link Grammar +This directory contains a Python3 interface to the Link Grammar C library. The interface generated uses the configuration files in the `python` directory. Testing ======= -See the python-examples directory for unit tests and example usage. +See the `python-examples` directory for unit tests and example usage. diff -Nru link-grammar-5.3.16/bindings/python-examples/lg_testutils.py link-grammar-5.5.0/bindings/python-examples/lg_testutils.py --- link-grammar-5.3.16/bindings/python-examples/lg_testutils.py 2016-09-26 21:50:34.000000000 +0000 +++ link-grammar-5.5.0/bindings/python-examples/lg_testutils.py 2018-04-22 14:57:33.000000000 +0000 @@ -68,9 +68,11 @@ original_class.original_parse = original_class.parse - def parse(self): + def parse(self, parse_options=None): """A decoration for the original Sentence.parse""" - linkages = self.original_parse() + # parse() has an optional single argument for parse options. If it is not given, + # call original_parse() also without arguments in order to test it that way. 
+ linkages = self.original_parse() if parse_options is None else self.original_parse(parse_options) return eqcost_soretd_parse(linkages) original_class.parse = parse diff -Nru link-grammar-5.3.16/bindings/python-examples/Makefile.in link-grammar-5.5.0/bindings/python-examples/Makefile.in --- link-grammar-5.3.16/bindings/python-examples/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/bindings/python-examples/Makefile.in 2018-04-29 21:40:02.000000000 +0000 @@ -334,6 +334,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -362,6 +363,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -370,13 +372,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -443,12 +448,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/bindings/python-examples/parses-en.txt link-grammar-5.5.0/bindings/python-examples/parses-en.txt --- link-grammar-5.3.16/bindings/python-examples/parses-en.txt 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/bindings/python-examples/parses-en.txt 2018-03-11 
23:13:19.000000000 +0000 @@ -5,9 +5,9 @@ Ithis is a test O O +----->WV----->+---Osm--+ -O +---Wd---+-Ss*b+ +Ds**c+ +O +-->Wd---+-Ss*b+ +Ds**c+ O | | | | | -OLEFT-WALL this.p is.v a test.n +OLEFT-WALL this.p is.v a test.n O C(S (NP this.p) C (VP is.v @@ -16,22 +16,32 @@ N O O +----->WV----->+---Ost--+ -O +---Wd---+-Ss*b+ +Ds**c+ +O +-->Wd---+-Ss*b+ +Ds**c+ O | | | | | -OLEFT-WALL this.p is.v a test.n +OLEFT-WALL this.p is.v a test.n O Ithe quick brown fox jumped over the lazy dog O -O +---------------->WV---------------->+ -O +-------------Wd------------+ | +O +---------------->WV---------------->+ +O +------------>Wd------------+ | O | +--------Ds**x-------+ | +-------Js------+ O | | +-------A------+ | | +---Ds**x---+ -O | | | +---A--+---Ss---+--MVp--+ | +--A--+ +O | | | +---A--+--Ss*s--+--MVp--+ | +--A--+ O | | | | | | | | | | -OLEFT-WALL the quick.a brown.a fox.n jumped.v-d over the lazy.a dog.n +OLEFT-WALL the quick.a brown.a fox.n jumped.v-d over the lazy.a dog.n O C(S (NP the quick.a brown.a fox.n) C (VP jumped.v-d C (PP over C (NP the lazy.a dog.n)))) C + +% Prefixes + +IY'gotta do it this way +O +O +---->WV---->+ +------MVa-----+ +O +->Wd--+-Sp*i+--I*t--+Osm+ +-Dsu-+ +O | | | | | | | +OLEFT-WALL y' gotta.v-d do.v it this.d way.n +O diff -Nru link-grammar-5.3.16/bindings/python-examples/parses-lt.txt link-grammar-5.5.0/bindings/python-examples/parses-lt.txt --- link-grammar-5.3.16/bindings/python-examples/parses-lt.txt 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/bindings/python-examples/parses-lt.txt 2017-10-30 23:05:56.000000000 +0000 @@ -7,7 +7,7 @@ O +------->T------>+--------Xp-------+ O | +<-DV*vp<+---->DG--->+ | O | | | | | -OLEFT-WALL aš.i skaitau.tevp knygą.avg1 . +OLEFT-WALL aš.i skaitau.tevp knygą.avg1 . O ISkaitau knygą broliui. @@ -16,6 +16,6 @@ O +---------->DN--------->+ | O +---->T---->+---->DG--->+ | | O | | | | | -OLEFT-WALL skaitau.tevp knygą.avg1 broliui.pvn1 . +OLEFT-WALL skaitau.tevp knygą.avg1 broliui.pvn1 . 
O diff -Nru link-grammar-5.3.16/bindings/python-examples/parses-quotes-en.txt link-grammar-5.5.0/bindings/python-examples/parses-quotes-en.txt --- link-grammar-5.3.16/bindings/python-examples/parses-quotes-en.txt 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/bindings/python-examples/parses-quotes-en.txt 2017-10-30 23:05:56.000000000 +0000 @@ -10,29 +10,29 @@ Ithey have told of the soldiers' fear O O +--------->WV-------->+ +----------Jp----------+ -O +---Wd--+--Sp-+---PP--+-OFj-+ +--Dmc--+--YP-+D*u*c+ +O +-->Wd--+--Sp-+---PP--+-OFj-+ +--Dmc--+--YP-+D*u*c+ O | | | | | | | | | -OLEFT-WALL they have.v told.v-d of the soldiers.n ' fear.n +OLEFT-WALL they have.v told.v-d of the soldiers.n ' fear.n O Ihe said `this is a backtick test` O O +------------------QUc-----------------+ O +------>WV----->+--------Osm-------+ | -O +---->WV---->+----Wd---+ | +-----Ds**x-----+ | -O +--Wd--+--Ss-+-QUd+ +-Ss*b+ | +---AN---+ | +O +---->WV---->+--->Wd---+ | +-----Ds**x-----+ | +O +->Wd--+--Ss-+-QUd+ +-Ss*b+ | +---AN---+ | O | | | | | | | | | | -OLEFT-WALL he said.q-d ` this.p is.v a backtick.n test.n ` +OLEFT-WALL he said.q-d ` this.p is.v a backtick.n test.n ` O I“What are you doing?” she asked. O -O +---------------QUc--------------+ -O +--------------Xc--------------+ | -O +----------->WV---------->+ | | -O | +------Bsw------+ | | -O +----Wq---+ +---Pg*b---+ | +----CP---+ +O +---------------QUc--------------+ +O +--------------Xc--------------+ | +O +----------->WV---------->+ | | +O | +------Bsw------+ | | +O +--->Wq---+ +---Pg*b---+ | +----CP---+ O +-QUd-+ +-Rw-+SIpx+ | | | +--Ss--+--Xp-+ O | | | | | | | | | | | -OLEFT-WALL “ what are.v you doing.v ? ” she asked.q-d . -O +OLEFT-WALL “ what are.v you doing.v ? ” she asked.q-d . 
+O diff -Nru link-grammar-5.3.16/bindings/python-examples/parses-sat-en.txt link-grammar-5.5.0/bindings/python-examples/parses-sat-en.txt --- link-grammar-5.3.16/bindings/python-examples/parses-sat-en.txt 2016-09-26 21:50:34.000000000 +0000 +++ link-grammar-5.5.0/bindings/python-examples/parses-sat-en.txt 2017-10-30 23:05:56.000000000 +0000 @@ -9,7 +9,7 @@ O +---Osm--+ O +-Ss*b+ +Ds**c+ O | | | | -Othis.p is.v a test.n +Othis.p is.v a test.n O C(S (NP this.p) C (VP is.v diff -Nru link-grammar-5.3.16/bindings/python-examples/README.md link-grammar-5.5.0/bindings/python-examples/README.md --- link-grammar-5.3.16/bindings/python-examples/README.md 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/bindings/python-examples/README.md 2018-04-22 14:57:33.000000000 +0000 @@ -1,28 +1,46 @@ Python bindings for Link Grammar ================================ +This directory contains an example program, and a unit test for the +python bindings to Link Grammar. -Description ------------ -A Link Grammar library test is implemented in `tests.py`. -An example program `example.py` is provided. +The example programs `example.py` and `sentence-check.py` illustrates +how the to use the Link Grammar Python bindings. + +A unit test for the Link Grammar Python bindings can be found in +in `tests.py`. Configuring (if needed) ----------------------- -### For Python2 - $ configure --enable-python-bindings -### For Python3 - $ configure --enable-python3-bindings - +The python bindings will be built by default, if the required python +system libraries are detected on the build system. Thus, no special +configuration should be needed. However, configure can be forced with +the following commands. + +### For Python2 and Python3 + `$ ./configure --enable-python-bindings` +(This is the default if Python development packages are installed.) + +### For Python2 or Python3 only + `$ ./configure --enable-python-bindings=2` +Or:
+ `$ ./configure --enable-python-bindings=3` + +### To disable the Python bindings + `$ ./configure --disable-python-bindings` +(This is the default if no Python is installed.) How to use ---------- -(See below under **Testing the installation** for directions on how to set -`PYTHONPATH` in case it is needed.) +The python bindings will be installed automatically into default system +locations, and no additional steps should be needed to use python. +However, in some cases, therere might be a need to manually set the +`PYTHONPATH` environment variable. See the discussion below, in +the section **Testing the installation** . Parsing simple sentences: ``` -$ python +`$ python` >>> from linkgrammar import Sentence, ParseOptions, Dictionary >>> sent = Sentence("This is a simple sentence.", Dictionary(), ParseOptions()) @@ -40,41 +58,44 @@ | | | | | | | LEFT-WALL this.p is.v a simple.a sentence.n . ``` -Additional examples can be found in `examples.py`. +Additional examples can be found in `examples.py` and `sentence-cehck.py`. Testing ------- -The test collection `tests.py` should run 56 tests, none of them should fail. -However, 3 tests will get skipped if the library is not configured with a -speller, and one test will get skipped if the library is not configured with -the SAT solver (this is the status for now on native Windows). - -The following shows how to issue the tests on systems other then natives -Windows/MinGW (for testing on native Windows see msvc14/README under -"Running Python programs"). - -Note: For less verbosity of the `make` command output you can use the `-s` -flag of make. +The test collection `tests.py` should run 76 tests; none of them should +fail. However, 3 tests will be skipped, if the library is not configured +with a spell guesser, and one test will be skipped if the library is not +configured with the SAT solver (this is currently the case for native +Windows builds). + +The test procedure is outlined below. 
For native Windows/MinGW, see +the `msvc14/README.md` file: +[Running Python programs in Windows](/msvc14/README.md#running-python-programs). ### Testing the build directory The following is assumed: + **$SRC_DIR** - Link Grammar source directory. + **$BUILD_DIR** - Link Grammar build directory. -#### By `make` +#### Using `make` +The tests can be run using the `make` command, as follows: ``` $ cd $BUILD_DIR/bindings/python-examples $ make [-s] check ``` -The results of tests.py are in the current directory under in the file +The `make` command can be made less verbose by using the `-s` flag. + +The test results are saved in the current directory, in the file `tests.log`. -Note: To run also the tests in the **$SRC_DIR/tests/** directory, issue -`make check` directly from **$BUILD_DIR**. +To run the tests in the **$SRC_DIR/tests/** directory, issue `make check` +directly from **$BUILD_DIR**. #### Manually -To run tests.py manually, or to run `example.py`, you have to set the -`PYTHONPATH` environment variable as follows: +To run `tests.py` manually, or to run `example.py`, without installing +the bindings, the `PYTHONPATH` environment variable must be set: ``` PYTHONPATH=$SRC_DIR/bindings/python:$BUILD_DIR/bindings/python:$BUILD_DIR/bindings/python/.libs ``` @@ -87,7 +108,7 @@ ### Testing the installation This can be done only after `make install`. -#### By `make` +#### Using `make` ``` $ cd $BUILD_DIR/bindings/python-examples $ make [-s] installcheck @@ -100,17 +121,18 @@ Python's **linkgrammar** module, e.g.: ``` -PYTHONPATH=/usr/local/lib/python2.7/site-packages +PYTHONPATH=/usr/local/lib/python2.7/dist-packages ``` (Export it, or prepend it to the `python` command.) -
-Note: This is not needed if the package has been configured to install to the -OS standard system locations. - -**NOTE:** Make sure you invoke `tests.py` from a directory from which it cannot -find the `data` directory in **$SRCDIR/.** ! This will enforce it to use the -system-installed data directory. Two directory levels under **$SRCDIR**, as -shown below, is fine for that purpose. + +Setting the `PYTHONPATH` is not needed if the default package +configuration is used. The default configuration installs the python +bindings into the standard operating system locations. + +To correctly test the system installation, make sure that `tests.py` is +invoked from a directory from which the **$SRCDIR/data.** directory +cannot be found. This is needed to ensure that the system-installed data +directory is used. For example: ``` $ cd $SRCDIR/binding/python-examples diff -Nru link-grammar-5.3.16/bindings/python-examples/sentence-check.py link-grammar-5.5.0/bindings/python-examples/sentence-check.py --- link-grammar-5.3.16/bindings/python-examples/sentence-check.py 2017-01-03 20:43:00.000000000 +0000 +++ link-grammar-5.5.0/bindings/python-examples/sentence-check.py 2018-04-22 14:57:33.000000000 +0000 @@ -22,31 +22,57 @@ import sys import re import itertools +import argparse from linkgrammar import (Sentence, ParseOptions, Dictionary, - LG_TimerExhausted, Clinkgrammar as clg) -print("Version:", clg.linkgrammar_get_version()) + LG_Error, LG_TimerExhausted, Clinkgrammar as clg) def nsuffix(q): return '' if q == 1 else 's' +class Formatter(argparse.HelpFormatter): + """ Display the "lang" argument as a first one, as in link-parser. 
""" + def _format_usage(self, usage, actions, groups, prefix): + usage_message = super(Formatter, self)._format_usage(usage, actions, groups, prefix) + return re.sub(r'(usage: \S+) (.*) \[lang]', r'\1 [lang] \2', str(usage_message)) + #-----------------------------------------------------------------------------# DISPLAY_GUESSES = True # Display regex and POS guesses -DEBUG_POSITION = True # Debug word position -po = ParseOptions(verbosity=0) # 1=more verbose; 2=trace; >5=debug -lgdict = Dictionary('en') +args = argparse.ArgumentParser(formatter_class=Formatter) +args.add_argument('lang', nargs='?', default='en', + help="language or dictionary location") +args.add_argument("-v", "--verbosity", type=int,default=0, + choices=range(0,199), metavar='[0-199]', + help= "1: Basic verbosity; 2-4: Trace; >5: Debug") +args.add_argument("-p", "--position", action="store_true", + help="show word sentence position") +args.add_argument("-nm", "--no-morphology", dest='morphology', action='store_false', + help="do not display morphology") + +arg = args.parse_args() + +try: + lgdict = Dictionary(arg.lang) +except LG_Error: + # The default error handler will print the error message + args.print_usage() + sys.exit(2) + +po = ParseOptions(verbosity=arg.verbosity) po.max_null_count = 999 # > allowed maximum number of words po.max_parse_time = 10 # actual parse timeout may be about twice bigger -po.spell_guess = 0 # spell guesses are not handled in this demo +po.spell_guess = True if DISPLAY_GUESSES else False +po.display_morphology = arg.morphology +print("Enter sentences:") # iter(): avoid python2 input buffering for sentence_text in iter(sys.stdin.readline, ''): if sentence_text.strip() == '': continue - sent = Sentence(sentence_text, lgdict, po) + sent = Sentence(str(sentence_text), lgdict, po) try: linkages = sent.parse() except LG_TimerExhausted: @@ -75,9 +101,15 @@ break # Show results with unlinked words or guesses - if not guess_found and null_count == 0: + if not arg.position 
and not guess_found and null_count == 0: continue + + if arg.position: + for p in range (0, len(sentence_text)): + print(p%10, end="") + print() + print('Sentence has {} unlinked word{}:'.format( null_count, nsuffix(null_count))) result_no = 0 @@ -88,4 +120,19 @@ continue result_no += 1 uniqe_parse[str(words)] = True - print("{}: {}".format(result_no, ' '.join(words))) + + if arg.position: + words_char = [] + words_byte = [] + wi = 0 + for w in words: + if sys.version_info < (3, 0): + words[wi] = words[wi].decode('utf-8') + words_char.append(words[wi] + str((linkage.word_char_start(wi), linkage.word_char_end(wi)))) + words_byte.append(words[wi] + str((linkage.word_byte_start(wi), linkage.word_byte_end(wi)))) + wi += 1 + + print(u"{}: {}".format(result_no, ' '.join(words_char))) + print(u"{}: {}".format(result_no, ' '.join(words_byte))) + else: + print("{}: {}".format(result_no, ' '.join(words))) diff -Nru link-grammar-5.3.16/bindings/python-examples/tests.py link-grammar-5.5.0/bindings/python-examples/tests.py --- link-grammar-5.3.16/bindings/python-examples/tests.py 2017-02-17 21:27:03.000000000 +0000 +++ link-grammar-5.5.0/bindings/python-examples/tests.py 2018-04-23 19:49:07.000000000 +0000 @@ -3,10 +3,16 @@ """Python link-grammar test script""" from __future__ import print_function -import sys, os +import sys, os, re import locale import unittest +# assertRaisesRegexp and assertRegexpMatches have been renamed in +# unittest for python 3, but not in python 2 (at least yet). 
+if hasattr(unittest.TestCase, 'assertRaisesRegex'): + unittest.TestCase.assertRaisesRegexp = unittest.TestCase.assertRaisesRegex + unittest.TestCase.assertRegexpMatches = unittest.TestCase.assertRegex + import lg_testutils # Found in the same directory of this test script # Show information on this program run @@ -16,21 +22,24 @@ print('{}={}'.format(v, os.environ.get(v))) #=== - from linkgrammar import (Sentence, Linkage, ParseOptions, Link, Dictionary, LG_Error, LG_DictionaryError, LG_TimerExhausted, Clinkgrammar as clg) +print(clg.linkgrammar_get_configuration()) # Show the location and version of the bindings modules -for module in 'linkgrammar', '_clinkgrammar', 'lg_testutils': - if module in sys.modules: - print("Using", sys.modules[module], end='') - if hasattr(sys.modules[module], '__version__'): - print(' version', sys.modules[module].__version__, end='') - print() - else: - print("Warning: Module", module, "not loaded.") +for imported_module in 'linkgrammar$', 'clinkgrammar', '_clinkgrammar', 'lg_testutils': + module_found = False + for module in sys.modules: + if re.search(r'^(linkgrammar\.)?'+imported_module, module): + print("Using", sys.modules[module], end='') + if hasattr(sys.modules[module], '__version__'): + print(' version', sys.modules[module].__version__, end='') + print() + module_found = True + if not module_found: + print("Warning: Module", imported_module, "not loaded.") sys.stdout.flush() #=== @@ -58,8 +67,15 @@ class AADictionaryTestCase(unittest.TestCase): def test_open_nonexistent_dictionary(self): - self.assertRaises(LG_DictionaryError, Dictionary, 'No such language test 1') - self.assertRaises(LG_Error, Dictionary, 'No such language test 2') + dummy_lang = "No such language test " + + save_stderr = divert_start(2) + self.assertRaises(LG_DictionaryError, Dictionary, dummy_lang + '1') + self.assertIn(dummy_lang + '1', save_stderr.divert_end()) + + save_stderr = divert_start(2) + self.assertRaises(LG_Error, Dictionary, dummy_lang + '2') 
+ self.assertIn(dummy_lang + '2', save_stderr.divert_end()) class BParseOptionsTestCase(unittest.TestCase): def test_setting_verbosity(self): @@ -72,7 +88,7 @@ def test_setting_verbosity_to_not_allow_value_raises_value_error(self): po = ParseOptions() - self.assertRaises(ValueError, setattr, po, "verbosity", 16) + self.assertRaises(ValueError, setattr, po, "verbosity", 121) def test_setting_verbosity_to_non_integer_raises_type_error(self): po = ParseOptions() @@ -233,12 +249,11 @@ @classmethod def setUpClass(cls): cls.d = Dictionary() - cls.po = None @classmethod def tearDownClass(cls): del cls.d - del cls.po + del cls.parse_sent def parse_sent(self, text, po=ParseOptions()): return list(Sentence(text, self.d, po).parse()) @@ -320,11 +335,10 @@ self.assertEqual(linkage.word(4), '_regex_ive[!].a') def test_timer_exhausted_exception(self): - self.po = ParseOptions(max_parse_time=1) self.assertRaises(LG_TimerExhausted, self.parse_sent, "This should take more than one second to parse! " * 20, - self.po) + ParseOptions(max_parse_time=1)) # The tests here are are numbered since their order is important. # They depend on the result and state of the previous ones as follows: @@ -401,13 +415,16 @@ #LG_Error.set_handler(self.__class__.handler["default"], "bad param") #with self.assertRaises(TypeError): # try: - # Dictionary("a visible dummy dict name (bad param test)") + # Dictionary("a dummy dict name (bad param test)") # except LG_Error: # pass # So test it directly. 
- self.assertRaises(LG_Error, Dictionary, "a visible dummy dict name (bad param test)") + + dummy_lang = "a dummy dict name (bad param test)" + self.assertRaises(LG_Error, Dictionary, dummy_lang) LG_Error.printall(self.error_handler_test, self) # grab a valid errinfo + #self.assertIn(dummy_lang, save_stderr.divert_end()) self.assertRaisesRegexp(TypeError, "must be an integer", self.__class__.handler["default"], self.errinfo, "bad param") @@ -416,9 +433,12 @@ self.errinfo, clg.lg_None+1) self.assertRaises(ValueError, self.__class__.handler["default"], self.errinfo, -1) + try: self.param_ok = False + save_stdout = divert_start(1) # Note: Handler parameter is stdout self.__class__.handler["default"](self.errinfo, 1) + self.assertIn(dummy_lang, save_stdout.divert_end()) self.param_ok = True except (TypeError, ValueError): self.assertTrue(self.param_ok) @@ -466,6 +486,24 @@ self.assertEqual(self.numerr, 0) self.assertFalse(hasattr(self, "gotit")) + def test_41_flush(self): + # Here the error handler is still set to None. + # First validate that nothing gets flushed (no error is buffered at this point). + self.flushed = LG_Error.flush() + self.assertEqual(self.flushed, False) + # Now generate a partial error message that is still buffered. + LG_Error.message("This is a partial error message.") + # Validate that it is still hidden. + self.numerr = LG_Error.printall(self.error_handler_test, self) + self.assertEqual(self.numerr, 0) + self.assertFalse(hasattr(self, "gotit")) + # Flush it. + self.flushed = LG_Error.flush() + self.assertEqual(self.flushed, True) + self.numerr = LG_Error.printall(self.error_handler_test, self) + self.assertEqual(self.numerr, 1) + self.assertRegexpMatches(self.errinfo.text, "partial") + def test_50_set_orig_error_handler(self): # Set the error handler back to the default handler. # The error message is now visible (but we cannot test that). 
@@ -474,8 +512,12 @@ for _ in range(0, 1+self.testleaks): self.__class__.handler["previous"] = LG_Error.set_handler(self.__class__.handler["default"]) self.assertEqual(self.__class__.handler["previous"].__name__, "_default_handler") + self.errinfo = "dummy" - self.assertRaises(LG_Error, Dictionary, "a visible dummy dict name (default handler test)") + dummy_lang = "a dummy dict name (default handler test)" + save_stderr = divert_start(2) + self.assertRaises(LG_Error, Dictionary, dummy_lang) + self.assertIn(dummy_lang, save_stderr.divert_end()) self.assertEqual(self.errinfo, "dummy") class FSATsolverTestCase(unittest.TestCase): @@ -491,7 +533,7 @@ class HEnglishLinkageTestCase(unittest.TestCase): @classmethod def setUpClass(cls): - cls.d, cls.po = Dictionary(), ParseOptions() + cls.d, cls.po = Dictionary(), ParseOptions(linkage_limit=300) @classmethod def tearDownClass(cls): @@ -517,7 +559,7 @@ self.assertEqual(linkage.link(1), Link(linkage, 1, 'LEFT-WALL','hWV','dWV','is.v')) self.assertEqual(linkage.link(2), - Link(linkage, 2, 'LEFT-WALL','Wd','Wd','this.p')) + Link(linkage, 2, 'LEFT-WALL','hWd','Wd','this.p')) self.assertEqual(linkage.link(3), Link(linkage, 3, 'this.p','Ss*b','Ss','is.v')) self.assertEqual(linkage.link(4), @@ -646,18 +688,18 @@ "\n +---------------------------->WV---------------------------->+ |" "\n | +-----------------------Sp-----------------------+ |" "\n | | +------------VJlpi------------+ |" -"\n +-----Wd----+ +---E---+---I---+----Op----+ +VJrpi+---Op--+ |" +"\n +---->Wd----+ +---E---+---I---+----Op----+ +VJrpi+---Op--+ |" "\n | | | | | | | | | |" -"\nLEFT-WALL scientists.n sometimes may.v repeat.v experiments.n or.j-v use.v groups.n . " +"\nLEFT-WALL scientists.n sometimes may.v repeat.v experiments.n or.j-v use.v groups.n ." "\n\n") sent = 'I enjoy eating bass.' 
linkage = self.parse_sent(sent)[0] self.assertEqual(linkage.diagram(), "\n +-----------------Xp----------------+" "\n +---->WV---->+ |" -"\n +--Wd--+-Sp*i+---Pg---+---Ou---+ |" +"\n +->Wd--+-Sp*i+---Pg---+---Ou---+ |" "\n | | | | | |" -"\nLEFT-WALL I.p enjoy.v eating.v bass.n-u . " +"\nLEFT-WALL I.p enjoy.v eating.v bass.n-u ." "\n\n") @@ -665,9 +707,9 @@ linkage = self.parse_sent(sent)[0] self.assertEqual(linkage.diagram(), "\n +--->WV--->+ +---------Js--------+" -"\n +--Wd--+Spx+--Pp-+ +--DD--+---GN---+" +"\n +->Wd--+Spx+--Pp-+ +--DD--+---GN---+" "\n | | | | | | |" -"\nLEFT-WALL we are.v from the planet.n Gorpon[!] " +"\nLEFT-WALL we are.v from the planet.n Gorpon[!]" "\n\n") @@ -692,7 +734,7 @@ # This sentence has no complete linkage. Validate that the library # doesn't mangle parsing with null-count>0 due to power_prune()'s # connector-discard optimization at null-count==0. Without commit - # "Allow calling chart_parse() with and w/o nulls", the number of + # "Allow calling classic_parse() with and w/o nulls", the number of # linkages here is 1 instead of 2 and the unused_word_cost is 5. 
self.po = ParseOptions(min_null_count=0, max_null_count=999) linkages = Sentence('about people attended', self.d, self.po).parse() @@ -710,6 +752,16 @@ # | | #[about] people.p attended.v-d + def test_2_step_parsing_with_null_links(self): + self.po = ParseOptions(min_null_count=0, max_null_count=0) + + sent = Sentence('about people attended', self.d, self.po) + linkages = sent.parse() + self.assertEqual(len(linkages), 0) + self.po = ParseOptions(min_null_count=1, max_null_count=999) + linkages = sent.parse(self.po) + self.assertEqual(len(linkages), 2) + self.assertEqual(linkages.next().unused_word_cost(), 1) class ZENConstituentsCase(unittest.TestCase): @classmethod @@ -840,16 +892,26 @@ testfile = clg.test_data_srcdir + "parses-" + desc + clg.dictionary_get_lang(lgdict._obj) + ".txt" parses = open(testfile, "rb") diagram = None + constituents = None sent = None lineno = 0 - opcode_detected = 0 # function sanity check + + # Function code and file format sanity check + self.opcode_detected = 0 + def validate_opcode(ctxt=self, O=False, C=False): + ctxt.opcode_detected += 1 + if O: + self.assertFalse(diagram, "at {}:{}: Unfinished diagram entry".format(testfile, lineno)) + if C: + self.assertFalse(constituents, "at {}:{}: Unfinished constituents entry".format(testfile, lineno)) + for line in parses: lineno += 1 if sys.version_info > (3, 0): line = line.decode('utf-8') # Lines starting with I are the input sentences if 'I' == line[0]: - opcode_detected += 1 + validate_opcode(O=True, C=True) sent = line[1:] diagram = "" constituents = "" @@ -859,7 +921,7 @@ # Generate the next linkage of the last input sentence if 'N' == line[0]: - opcode_detected += 1 + validate_opcode(O=True, C=True) diagram = "" constituents = "" linkage = next(linkages, None) @@ -868,27 +930,59 @@ # Lines starting with O are the parse diagram # It ends with an empty line if 'O' == line[0]: - opcode_detected += 1 + validate_opcode(C=True) diagram += line[1:] if '\n' == line[1] and 1 < len(diagram): 
self.assertEqual(linkage.diagram(), diagram, "at {}:{}".format(testfile, lineno)) + diagram = None # Lines starting with C are the constituent output (type 1) # It ends with an empty line if 'C' == line[0]: - opcode_detected += 1 + validate_opcode(O=True) if '\n' == line[1] and 1 < len(constituents): self.assertEqual(linkage.constituent_tree(), constituents, "at {}:{}".format(testfile, lineno)) - constituents += line[1:] + constituents = None + else: + constituents += line[1:] parses.close() - self.assertGreaterEqual(opcode_detected, 2, "Nothing has been done for " + testfile) + validate_opcode(O=True, C=True) + self.assertGreaterEqual(self.opcode_detected, 2, "Nothing has been done for " + testfile) def warning(*msg): progname = os.path.basename(sys.argv[0]) print("{}: Warning:".format(progname), *msg, file=sys.stderr) +import tempfile + +class divert_start(object): + """ Output diversion. """ + def __init__(self, fd): + """ Divert a file descriptor. + The created object is used for restoring the original file descriptor. + """ + self.fd = fd + self.savedfd = os.dup(fd) + (newfd, self.filename) = tempfile.mkstemp(text=False) + os.dup2(newfd, fd) + os.close(newfd) + + def divert_end(self): + """ Restore a previous diversion and return its content. """ + if not self.filename: + return "" + os.lseek(self.fd, os.SEEK_SET, 0) + content = os.read(self.fd, 1024) # 1024 is more than needed + os.dup2(self.savedfd, self.fd) + os.close(self.savedfd) + os.unlink(self.filename) + self.filename = None + return str(content) + + __del__ = divert_end + # Decorate Sentence.parse with eqcost_soretd_parse. 
lg_testutils.add_eqcost_linkage_order(Sentence) diff -Nru link-grammar-5.3.16/bindings/swig/link_grammar.i link-grammar-5.5.0/bindings/swig/link_grammar.i --- link-grammar-5.3.16/bindings/swig/link_grammar.i 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/bindings/swig/link_grammar.i 2018-04-22 14:57:33.000000000 +0000 @@ -26,6 +26,7 @@ const char * linkgrammar_get_version(void); +const char * linkgrammar_get_configuration(void); const char * linkgrammar_get_dict_version(Dictionary dict); const char * linkgrammar_get_dict_locale(Dictionary dict); @@ -182,6 +183,10 @@ const char ** linkage_get_words(Linkage linkage); //const char * linkage_get_disjunct(Linkage linkage, int w); const char * linkage_get_word(Linkage linkage, int w); +int linkage_get_word_byte_start(Linkage linkage, int index); +int linkage_get_word_byte_end(Linkage linkage, int index); +int linkage_get_word_char_start(Linkage linkage, int index); +int linkage_get_word_char_end(Linkage linkage, int index); int linkage_unused_word_cost(Linkage linkage); double linkage_disjunct_cost(Linkage linkage); @@ -195,7 +200,10 @@ char * lg_error_formatmsg(lg_errinfo *lge); int lg_error_clearall(void); %rename(_prt_error) prt_error; -int prt_error(const char * , ...); +/* For security, the first argument should always contain a single "%s" + * (e.g. "%s\n"), and the second one should always be a C string. */ +int prt_error(const char *, const char *); +bool lg_error_flush(void); /* * void *lg_error_set_handler_data(void *); * A wrapper to this function is complex and is not implemented here. 
However, @@ -257,11 +265,24 @@ } %} +%typemap(in) lg_errinfo *eh_lge +{ + void *argp1 = 0; + + if (Py_None == $input) + SWIG_exception_fail(SWIG_TypeError, "in method '_py_error_default_handler', argument 1 (of type lg_errinfo *) must not be None."); + + int res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_lg_errinfo, 0); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '_py_error_default_handler', argument 1 of type 'lg_errinfo *'"); + } + arg1 = (lg_errinfo *)(argp1); +} + /* The second argument of the default callback can be NULL or a severity_level integer. Validate that and convert it to C int. */ -%typemap(in) int *pedh_data +%typemap(in) int *pedh_data (int arg) { - int arg; bool error = false; const char errmsg[] = "The default error handler data argument (arg 2) " "must be an integer (0 to lg_None) or None."; @@ -294,9 +315,9 @@ } %inline %{ -void _py_error_default_handler(lg_errinfo *lge, int *pedh_data) +void _py_error_default_handler(lg_errinfo *eh_lge, int *pedh_data) { - default_error_handler(lge, (void *)pedh_data); + default_error_handler(eh_lge, (void *)pedh_data); } /** @@ -354,6 +375,7 @@ } void delete_lg_errinfo(lg_errinfo *lge) { + if (NULL == lge) return; /* Was None - nothing to free. */ free((void *)lge->severity_label); free((void *)lge->text); free((void *)lge); diff -Nru link-grammar-5.3.16/ChangeLog link-grammar-5.5.0/ChangeLog --- link-grammar-5.3.16/ChangeLog 2017-04-15 21:43:39.000000000 +0000 +++ link-grammar-5.5.0/ChangeLog 2018-04-29 21:33:08.000000000 +0000 @@ -1,8 +1,78 @@ +Version 5.5.0 (29 April 2018) + * Fix accidental API breakage that impacts OpenCog. + * Fix memory leak when parsing with null links. + * Python bindings: Add an optional parse-option argument to parse(). + * Add an extended version API and use it in "link-parser --version". + * Fix spurious errors if the last dict line is a comment. + * Fix garbage report if EOF encountered in a quoted dict word. 
+ * Fix garbage report if whitespace encountered in a quoted dict word. + * Add a per-command help in link-parser. + * Add a command line completion in link-parser. + * Enable build of word-graph printing support by default. + * Add idiom lookup in link-parser's dict lookup command (!!idiom_here). + * Improve handling of quoted words (e.g. single words in "scare quotes"). + * Fix random selection of linkages so that it's actually random. + +Version 5.4.4 (11 March 2018) + * Dictionary loading now thread safe. + * Fix post-nominal modifiers used with pronouns. + * Fix comparative openers. + * Fix given-name single-letter abbreviations. + * Fix conjoined questions and conjoined WH-statements. + * Fix conditional sentences. + * Fix misc comparatives. + * Fix crash on invalid UTF-8 input. + * Fix many predicative adjective uses. + * Fix many paraphrasing-type constructions. + * Minor cleanup of word-lists. + * New dict definition LENGTH-LIMIT-n to limit connector link length to n. + * Speed up parsing of Russian by factor of 2x. + * Add assorted technical vocabulary (#680) + * Fix conjoined infinitives. + +Version 5.4.3 (4 January 2018) + * Fix man page installation (actually broken from 5.3.0). + * Add "thither" to the English dictionary. + * Fix printing inf loop for very narrow screen widths. + * Some Windows code clean up. + * Remove trailing blanks from the linkage diagram. + * Fix square area and cubic volume measurements (English dict). + * Fix assorted exclamations and responses (English dict). + * Fix displaying random linkages on Windows. + * Fix unit tokenization to remove ambiguity. + * Fix utf8-related bug on Windows that could affect printing. + * Add missing affix file, needed for the 'any' language. + +Version 5.4.2 (19 October 2017) + * Fix man page build (broken in 5.4.1) + +Version 5.4.1 (18 October 2017) + * Fix man page installation (broken in 5.3.8). + * Add affix-class MPUNC for splitting at intra-word punctuation. 
+ * Fix crash when there is no PP info. + * Fix a stack buffer overflow. + * Eliminate hard-wired linkage diagram size limitations. + * Fix an unintended clipping of the linkage-limit option to 250000. + +Version 5.4.0 (26 July 2017) + * Fix for missing locale info in Windows XP. + * Empty out the post-processing tables for the any, ady, amy languages + * Remove left_print_string() from the API. + * Recover pp_lexer.l from ancient version 2.2! + * Fix unusual crash in post-processing for the "any" language. + * Remove three deprecated post-processing functions from API. + * Major reorganization of code base into more modular directories. + * Revive the sqlite3 dictionary into operational form. + * Add double-quotes to splittable punctuation for the "any" language. + * Add API functions to get linkage word positions in the sentence. + * Fix printing of diagrams containing Chinese or other wide glyphs. + * Fix `make distclean` when ant not installed. + Version 5.3.16 (15 April 2017) * Fix python3 unit tests. * Restore tty state after ctrl-C, ctrl-Z of the app. * Check for before use (in Java bindings). - * Fix handling of punctuation in the "any" langauge. + * Fix handling of punctuation in the "any" language. Version 5.3.15 (12 Feb 2017) * Fix Windows compilation; the new wcwidth files were omitted. diff -Nru link-grammar-5.3.16/configure link-grammar-5.5.0/configure --- link-grammar-5.3.16/configure 2017-04-15 21:49:51.000000000 +0000 +++ link-grammar-5.5.0/configure 2018-04-29 21:40:04.000000000 +0000 @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for link-grammar 5.3.16. +# Generated by GNU Autoconf 2.69 for link-grammar 5.5.0. # # Report bugs to . 
# @@ -198,6 +198,7 @@ as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1 +test \$(( 1 + 1 )) = 2 || exit 1 test -n \"\${ZSH_VERSION+set}\${BASH_VERSION+set}\" || ( ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' @@ -205,8 +206,7 @@ ECHO=\$ECHO\$ECHO\$ECHO\$ECHO\$ECHO\$ECHO PATH=/empty FPATH=/empty; export PATH FPATH test \"X\`printf %s \$ECHO\`\" = \"X\$ECHO\" \\ - || test \"X\`print -r -- \$ECHO\`\" = \"X\$ECHO\" ) || exit 1 -test \$(( 1 + 1 )) = 2 || exit 1" + || test \"X\`print -r -- \$ECHO\`\" = \"X\$ECHO\" ) || exit 1" if (eval "$as_required") 2>/dev/null; then : as_have_required=yes else @@ -590,8 +590,8 @@ # Identity of this package. 
PACKAGE_NAME='link-grammar' PACKAGE_TARNAME='link-grammar' -PACKAGE_VERSION='5.3.16' -PACKAGE_STRING='link-grammar 5.3.16' +PACKAGE_VERSION='5.5.0' +PACKAGE_STRING='link-grammar 5.5.0' PACKAGE_BUGREPORT='link-grammar@googlegroups.com' PACKAGE_URL='' @@ -636,9 +636,14 @@ am__EXEEXT_TRUE LTLIBOBJS LIBOBJS +LG_DEFS LINK_CFLAGS WARN_CXXFLAGS WARN_CFLAGS +HAVE_SWIG_FALSE +HAVE_SWIG_TRUE +SWIG_LIB +SWIG PYTHON3 LG_PYDIR HAVE_PYTHON3_FALSE @@ -672,17 +677,14 @@ PERL_EXT_LIB PERL_CFLAGS PERL_EXT_INC -HAVE_SWIG_FALSE -HAVE_SWIG_TRUE -SWIGfound -SWIG_LIB -SWIG HAVE_ANT_FALSE HAVE_ANT_TRUE ANTfound -JAVA_CPPFLAGS HAVE_JAVA_FALSE HAVE_JAVA_TRUE +JAVA_CPPFLAGS +_ACJNI_JAVAC +JAVA_SRCDIR REGEX_LIBS PCRE_CFLAGS PCRE_LIBS @@ -712,10 +714,6 @@ WITH_ANYSPLIT_TRUE WITH_CORPUS_FALSE WITH_CORPUS_TRUE -WITH_VITERBI_FALSE -WITH_VITERBI_TRUE -LIBGC_LIBS -LIBGC_CFLAGS WITH_SAT_SOLVER_FALSE WITH_SAT_SOLVER_TRUE LIBMINISAT_BUNDLED_FALSE @@ -723,10 +721,9 @@ ZLIB_CPPFLAGS MINISAT_INCLUDES MINISAT_LIBS +HOST_OS OS_X_FALSE OS_X_TRUE -OS_CYGWIN_FALSE -OS_CYGWIN_TRUE OS_WIN32_FALSE OS_WIN32_TRUE ALLOCA @@ -744,18 +741,22 @@ RANLIB ac_ct_AR AR -DLLTOOL -OBJDUMP -LN_S NM ac_ct_DUMPBIN DUMPBIN LD FGREP -EGREP -GREP SED LIBTOOL +OBJDUMP +DLLTOOL +AS +EGREP +GREP +LN_S +LEXLIB +LEX_OUTPUT_ROOT +LEX CPP am__fastdepCXX_FALSE am__fastdepCXX_TRUE @@ -884,7 +885,6 @@ enable_perl_bindings enable_sat_solver with_zlib -enable_viterbi enable_corpus_stats enable_wordgraph_display enable_aspell @@ -911,8 +911,6 @@ PKG_CONFIG PKG_CONFIG_PATH PKG_CONFIG_LIBDIR -LIBGC_CFLAGS -LIBGC_LIBS SQLITE3_CFLAGS SQLITE3_LIBS HUNSPELL_CFLAGS @@ -1473,7 +1471,7 @@ # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures link-grammar 5.3.16 to adapt to many kinds of systems. +\`configure' configures link-grammar 5.5.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1544,7 +1542,7 @@ if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of link-grammar 5.3.16:";; + short | recursive ) echo "Configuration of link-grammar 5.5.0:";; esac cat <<\_ACEOF @@ -1587,13 +1585,12 @@ ARG=yes (default): Use the system minisat library if possible ARG=bundled: Use the bundled minisat library - --enable-viterbi enable use of the Viterbi parser --enable-corpus-stats use corpus statistics --enable-wordgraph-display enable graphical display of the Wordgraph --disable-aspell Build without ASpell support (default is enabled) --disable-hunspell Build without HunSpell support (default is enabled) --disable-editline disable use of editline - --enable-regex-tokenizer enable use of regex word splitter + --enable-regex-tokenizer enable use of regex word splitter (experimental) Optional Packages: --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] @@ -1633,9 +1630,6 @@ directories to add to pkg-config's search path PKG_CONFIG_LIBDIR path overriding pkg-config's built-in search path - LIBGC_CFLAGS - C compiler flags for LIBGC, overriding pkg-config - LIBGC_LIBS linker flags for LIBGC, overriding pkg-config SQLITE3_CFLAGS C compiler flags for SQLITE3, overriding pkg-config SQLITE3_LIBS @@ -1723,7 +1717,7 @@ test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -link-grammar configure 5.3.16 +link-grammar configure 5.5.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -1896,37 +1890,6 @@ } # ac_fn_c_try_link -# ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES -# ------------------------------------------------------- -# Tests whether HEADER exists and can be compiled using the include files in -# INCLUDES, setting the cache variable VAR accordingly. 
-ac_fn_c_check_header_compile () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 -$as_echo_n "checking for $2... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -$4 -#include <$2> -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - eval "$3=yes" -else - eval "$3=no" -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi -eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - -} # ac_fn_c_check_header_compile - # ac_fn_c_try_run LINENO # ---------------------- # Try to link conftest.$ac_ext, and return whether this succeeded. Assumes @@ -1969,6 +1932,37 @@ } # ac_fn_c_try_run +# ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES +# ------------------------------------------------------- +# Tests whether HEADER exists and can be compiled using the include files in +# INCLUDES, setting the cache variable VAR accordingly. +ac_fn_c_check_header_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$4 +#include <$2> +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + eval "$3=yes" +else + eval "$3=no" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_check_header_compile + # ac_fn_c_check_func LINENO FUNC VAR # ---------------------------------- # Tests whether FUNC exists, setting the cache variable VAR accordingly @@ -2404,7 +2398,7 @@ This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by link-grammar $as_me 5.3.16, which was +It was created by link-grammar $as_me 5.5.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2755,8 +2749,8 @@ PACKAGE=link-grammar LINK_MAJOR_VERSION=5 -LINK_MINOR_VERSION=3 -LINK_MICRO_VERSION=16 +LINK_MINOR_VERSION=5 +LINK_MICRO_VERSION=0 MAJOR_VERSION_PLUS_MINOR_VERSION=`expr $LINK_MAJOR_VERSION + $LINK_MINOR_VERSION` VERSION=$LINK_MAJOR_VERSION.$LINK_MINOR_VERSION.$LINK_MICRO_VERSION # Version info for libraries = CURRENT:REVISION:AGE @@ -3282,7 +3276,7 @@ # Define the identity of the package. PACKAGE='link-grammar' - VERSION='5.3.16' + VERSION='5.5.0' cat >>confdefs.h <<_ACEOF @@ -5078,180 +5072,211 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu -case `pwd` in - *\ * | *\ *) - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Libtool does not cope well with whitespace in \`pwd\`" >&5 -$as_echo "$as_me: WARNING: Libtool does not cope well with whitespace in \`pwd\`" >&2;} ;; -esac - - - -macro_version='2.4.6' -macro_revision='2.4.6' - - - - - - - - - - - - - -ltmain=$ac_aux_dir/ltmain.sh - -# Backslashify metacharacters that are still active within -# double-quoted strings. -sed_quote_subst='s/\(["`$\\]\)/\\\1/g' - -# Same as above, but do not quote variable references. 
-double_quote_subst='s/\(["`\\]\)/\\\1/g' - -# Sed substitution to delay expansion of an escaped shell variable in a -# double_quote_subst'ed string. -delay_variable_subst='s/\\\\\\\\\\\$/\\\\\\$/g' - -# Sed substitution to delay expansion of an escaped single quote. -delay_single_quote_subst='s/'\''/'\'\\\\\\\'\''/g' - -# Sed substitution to avoid accidental globbing in evaled expressions -no_glob_subst='s/\*/\\\*/g' - -ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' -ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO -ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO$ECHO +for ac_prog in flex lex +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_LEX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$LEX"; then + ac_cv_prog_LEX="$LEX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_LEX="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to print strings" >&5 -$as_echo_n "checking how to print strings... " >&6; } -# Test print first, because it will be a builtin if present. 
-if test "X`( print -r -- -n ) 2>/dev/null`" = X-n && \ - test "X`print -r -- $ECHO 2>/dev/null`" = "X$ECHO"; then - ECHO='print -r --' -elif test "X`printf %s $ECHO 2>/dev/null`" = "X$ECHO"; then - ECHO='printf %s\n' +fi +fi +LEX=$ac_cv_prog_LEX +if test -n "$LEX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LEX" >&5 +$as_echo "$LEX" >&6; } else - # Use this function as a fallback that always works. - func_fallback_echo () - { - eval 'cat <<_LTECHO_EOF -$1 -_LTECHO_EOF' - } - ECHO='func_fallback_echo' + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } fi -# func_echo_all arg... -# Invoke $ECHO with all args, space-separated. -func_echo_all () + + test -n "$LEX" && break +done +test -n "$LEX" || LEX=":" + +if test "x$LEX" != "x:"; then + cat >conftest.l <<_ACEOF +%% +a { ECHO; } +b { REJECT; } +c { yymore (); } +d { yyless (1); } +e { /* IRIX 6.5 flex 2.5.4 underquotes its yyless argument. */ + yyless ((input () != 0)); } +f { unput (yytext[0]); } +. { BEGIN INITIAL; } +%% +#ifdef YYTEXT_POINTER +extern char *yytext; +#endif +int +main (void) { - $ECHO "" + return ! yylex () + ! yywrap (); } - -case $ECHO in - printf*) { $as_echo "$as_me:${as_lineno-$LINENO}: result: printf" >&5 -$as_echo "printf" >&6; } ;; - print*) { $as_echo "$as_me:${as_lineno-$LINENO}: result: print -r" >&5 -$as_echo "print -r" >&6; } ;; - *) { $as_echo "$as_me:${as_lineno-$LINENO}: result: cat" >&5 -$as_echo "cat" >&6; } ;; +_ACEOF +{ { ac_try="$LEX conftest.l" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$LEX conftest.l") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking lex output file root" >&5 +$as_echo_n "checking lex output file root... 
" >&6; } +if ${ac_cv_prog_lex_root+:} false; then : + $as_echo_n "(cached) " >&6 +else +if test -f lex.yy.c; then + ac_cv_prog_lex_root=lex.yy +elif test -f lexyy.c; then + ac_cv_prog_lex_root=lexyy +else + as_fn_error $? "cannot find output from $LEX; giving up" "$LINENO" 5 +fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_lex_root" >&5 +$as_echo "$ac_cv_prog_lex_root" >&6; } +LEX_OUTPUT_ROOT=$ac_cv_prog_lex_root +if test -z "${LEXLIB+set}"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking lex library" >&5 +$as_echo_n "checking lex library... " >&6; } +if ${ac_cv_lib_lex+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_save_LIBS=$LIBS + ac_cv_lib_lex='none needed' + for ac_lib in '' -lfl -ll; do + LIBS="$ac_lib $ac_save_LIBS" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +`cat $LEX_OUTPUT_ROOT.c` +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_lex=$ac_lib +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + test "$ac_cv_lib_lex" != 'none needed' && break + done + LIBS=$ac_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_lex" >&5 +$as_echo "$ac_cv_lib_lex" >&6; } + test "$ac_cv_lib_lex" != 'none needed' && LEXLIB=$ac_cv_lib_lex +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether yytext is a pointer" >&5 +$as_echo_n "checking whether yytext is a pointer... " >&6; } +if ${ac_cv_prog_lex_yytext_pointer+:} false; then : + $as_echo_n "(cached) " >&6 +else + # POSIX says lex can declare yytext either as a pointer or an array; the +# default is implementation-dependent. Figure out which it is, since +# not all implementations provide the %pointer and %array declarations. +ac_cv_prog_lex_yytext_pointer=no +ac_save_LIBS=$LIBS +LIBS="$LEXLIB $ac_save_LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + #define YYTEXT_POINTER 1 +`cat $LEX_OUTPUT_ROOT.c` +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_prog_lex_yytext_pointer=yes +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_lex_yytext_pointer" >&5 +$as_echo "$ac_cv_prog_lex_yytext_pointer" >&6; } +if test $ac_cv_prog_lex_yytext_pointer = yes; then +$as_echo "#define YYTEXT_POINTER 1" >>confdefs.h +fi +rm -f conftest.l $LEX_OUTPUT_ROOT.c +fi +if test "$LEX" = :; then + LEX=${am_missing_run}flex +fi +#AC_PROG_LIBTOOL +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ln -s works" >&5 +$as_echo_n "checking whether ln -s works... " >&6; } +LN_S=$as_ln_s +if test "$LN_S" = "ln -s"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no, using $LN_S" >&5 +$as_echo "no, using $LN_S" >&6; } +fi - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a sed that does not truncate output" >&5 -$as_echo_n "checking for a sed that does not truncate output... " >&6; } -if ${ac_cv_path_SED+:} false; then : +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} sets \$(MAKE)" >&5 +$as_echo_n "checking whether ${MAKE-make} sets \$(MAKE)... 
" >&6; } +set x ${MAKE-make} +ac_make=`$as_echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'` +if eval \${ac_cv_prog_make_${ac_make}_set+:} false; then : $as_echo_n "(cached) " >&6 else - ac_script=s/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/ - for ac_i in 1 2 3 4 5 6 7; do - ac_script="$ac_script$as_nl$ac_script" - done - echo "$ac_script" 2>/dev/null | sed 99q >conftest.sed - { ac_script=; unset ac_script;} - if test -z "$SED"; then - ac_path_SED_found=false - # Loop through the user's path and test for each of PROGNAME-LIST - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_prog in sed gsed; do - for ac_exec_ext in '' $ac_executable_extensions; do - ac_path_SED="$as_dir/$ac_prog$ac_exec_ext" - as_fn_executable_p "$ac_path_SED" || continue -# Check for GNU ac_path_SED and select it if it is found. - # Check for GNU $ac_path_SED -case `"$ac_path_SED" --version 2>&1` in -*GNU*) - ac_cv_path_SED="$ac_path_SED" ac_path_SED_found=:;; -*) - ac_count=0 - $as_echo_n 0123456789 >"conftest.in" - while : - do - cat "conftest.in" "conftest.in" >"conftest.tmp" - mv "conftest.tmp" "conftest.in" - cp "conftest.in" "conftest.nl" - $as_echo '' >> "conftest.nl" - "$ac_path_SED" -f conftest.sed < "conftest.nl" >"conftest.out" 2>/dev/null || break - diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break - as_fn_arith $ac_count + 1 && ac_count=$as_val - if test $ac_count -gt ${ac_path_SED_max-0}; then - # Best one so far, save it but keep looking for a better one - ac_cv_path_SED="$ac_path_SED" - ac_path_SED_max=$ac_count - fi - # 10*(2^10) chars as input seems more than enough - test $ac_count -gt 10 && break - done - rm -f conftest.in conftest.tmp conftest.nl conftest.out;; + cat >conftest.make <<\_ACEOF +SHELL = /bin/sh +all: + @echo '@@@%%%=$(MAKE)=@@@%%%' +_ACEOF +# GNU make sometimes prints "make[1]: Entering ...", which would confuse us. 
+case `${MAKE-make} -f conftest.make 2>/dev/null` in + *@@@%%%=?*=@@@%%%*) + eval ac_cv_prog_make_${ac_make}_set=yes;; + *) + eval ac_cv_prog_make_${ac_make}_set=no;; esac - - $ac_path_SED_found && break 3 - done - done - done -IFS=$as_save_IFS - if test -z "$ac_cv_path_SED"; then - as_fn_error $? "no acceptable sed could be found in \$PATH" "$LINENO" 5 - fi -else - ac_cv_path_SED=$SED +rm -f conftest.make fi - +if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + SET_MAKE= +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + SET_MAKE="MAKE=${MAKE-make}" fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_SED" >&5 -$as_echo "$ac_cv_path_SED" >&6; } - SED="$ac_cv_path_SED" - rm -f conftest.sed - -test -z "$SED" && SED=sed -Xsed="$SED -e 1s/^X//" - - - - - - - - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5 @@ -5384,6 +5409,294 @@ EGREP="$ac_cv_path_EGREP" +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5 +$as_echo_n "checking for ANSI C header files... " >&6; } +if ${ac_cv_header_stdc+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include +#include + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_header_stdc=yes +else + ac_cv_header_stdc=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +if test $ac_cv_header_stdc = yes; then + # SunOS 4.x string.h does not declare mem*, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "memchr" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "free" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. + if test "$cross_compiling" = yes; then : + : +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#if ((' ' & 0x0FF) == 0x020) +# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') +# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) +#else +# define ISLOWER(c) \ + (('a' <= (c) && (c) <= 'i') \ + || ('j' <= (c) && (c) <= 'r') \ + || ('s' <= (c) && (c) <= 'z')) +# define TOUPPER(c) (ISLOWER(c) ? 
((c) | 0x40) : (c)) +#endif + +#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) +int +main () +{ + int i; + for (i = 0; i < 256; i++) + if (XOR (islower (i), ISLOWER (i)) + || toupper (i) != TOUPPER (i)) + return 2; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + +else + ac_cv_header_stdc=no +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5 +$as_echo "$ac_cv_header_stdc" >&6; } +if test $ac_cv_header_stdc = yes; then + +$as_echo "#define STDC_HEADERS 1" >>confdefs.h + +fi + +case `pwd` in + *\ * | *\ *) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Libtool does not cope well with whitespace in \`pwd\`" >&5 +$as_echo "$as_me: WARNING: Libtool does not cope well with whitespace in \`pwd\`" >&2;} ;; +esac + + + +macro_version='2.4.6' +macro_revision='2.4.6' + + + + + + + + + + + + + +ltmain=$ac_aux_dir/ltmain.sh + +# Backslashify metacharacters that are still active within +# double-quoted strings. +sed_quote_subst='s/\(["`$\\]\)/\\\1/g' + +# Same as above, but do not quote variable references. +double_quote_subst='s/\(["`\\]\)/\\\1/g' + +# Sed substitution to delay expansion of an escaped shell variable in a +# double_quote_subst'ed string. +delay_variable_subst='s/\\\\\\\\\\\$/\\\\\\$/g' + +# Sed substitution to delay expansion of an escaped single quote. +delay_single_quote_subst='s/'\''/'\'\\\\\\\'\''/g' + +# Sed substitution to avoid accidental globbing in evaled expressions +no_glob_subst='s/\*/\\\*/g' + +ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO +ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO$ECHO + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to print strings" >&5 +$as_echo_n "checking how to print strings... " >&6; } +# Test print first, because it will be a builtin if present. 
+if test "X`( print -r -- -n ) 2>/dev/null`" = X-n && \ + test "X`print -r -- $ECHO 2>/dev/null`" = "X$ECHO"; then + ECHO='print -r --' +elif test "X`printf %s $ECHO 2>/dev/null`" = "X$ECHO"; then + ECHO='printf %s\n' +else + # Use this function as a fallback that always works. + func_fallback_echo () + { + eval 'cat <<_LTECHO_EOF +$1 +_LTECHO_EOF' + } + ECHO='func_fallback_echo' +fi + +# func_echo_all arg... +# Invoke $ECHO with all args, space-separated. +func_echo_all () +{ + $ECHO "" +} + +case $ECHO in + printf*) { $as_echo "$as_me:${as_lineno-$LINENO}: result: printf" >&5 +$as_echo "printf" >&6; } ;; + print*) { $as_echo "$as_me:${as_lineno-$LINENO}: result: print -r" >&5 +$as_echo "print -r" >&6; } ;; + *) { $as_echo "$as_me:${as_lineno-$LINENO}: result: cat" >&5 +$as_echo "cat" >&6; } ;; +esac + + + + + + + + + + + + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a sed that does not truncate output" >&5 +$as_echo_n "checking for a sed that does not truncate output... " >&6; } +if ${ac_cv_path_SED+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_script=s/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/ + for ac_i in 1 2 3 4 5 6 7; do + ac_script="$ac_script$as_nl$ac_script" + done + echo "$ac_script" 2>/dev/null | sed 99q >conftest.sed + { ac_script=; unset ac_script;} + if test -z "$SED"; then + ac_path_SED_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_prog in sed gsed; do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_SED="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_SED" || continue +# Check for GNU ac_path_SED and select it if it is found. 
+ # Check for GNU $ac_path_SED +case `"$ac_path_SED" --version 2>&1` in +*GNU*) + ac_cv_path_SED="$ac_path_SED" ac_path_SED_found=:;; +*) + ac_count=0 + $as_echo_n 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + $as_echo '' >> "conftest.nl" + "$ac_path_SED" -f conftest.sed < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_SED_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_SED="$ac_path_SED" + ac_path_SED_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_SED_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_SED"; then + as_fn_error $? "no acceptable sed could be found in \$PATH" "$LINENO" 5 + fi +else + ac_cv_path_SED=$SED +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_SED" >&5 +$as_echo "$ac_cv_path_SED" >&6; } + SED="$ac_cv_path_SED" + rm -f conftest.sed + +test -z "$SED" && SED=sed +Xsed="$SED -e 1s/^X//" + + + + + + + + + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for fgrep" >&5 $as_echo_n "checking for fgrep... " >&6; } if ${ac_cv_path_FGREP+:} false; then : @@ -5781,24 +6094,13 @@ cat conftest.err >&5 (eval echo "\"\$as_me:$LINENO: output\"" >&5) cat conftest.out >&5 - if $GREP 'External.*some_variable' conftest.out > /dev/null; then - lt_cv_nm_interface="MS dumpbin" - fi - rm -f conftest* -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_nm_interface" >&5 -$as_echo "$lt_cv_nm_interface" >&6; } - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ln -s works" >&5 -$as_echo_n "checking whether ln -s works... 
" >&6; } -LN_S=$as_ln_s -if test "$LN_S" = "ln -s"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no, using $LN_S" >&5 -$as_echo "no, using $LN_S" >&6; } + if $GREP 'External.*some_variable' conftest.out > /dev/null; then + lt_cv_nm_interface="MS dumpbin" + fi + rm -f conftest* fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_nm_interface" >&5 +$as_echo "$lt_cv_nm_interface" >&6; } # find the maximum length of command line arguments { $as_echo "$as_me:${as_lineno-$LINENO}: checking the maximum length of command line arguments" >&5 @@ -6191,9 +6493,6 @@ - - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking how to recognize dependent libraries" >&5 $as_echo_n "checking how to recognize dependent libraries... " >&6; } if ${lt_cv_deplibs_check_method+:} false; then : @@ -6536,9 +6835,6 @@ - - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking how to associate runtime and link libraries" >&5 $as_echo_n "checking how to associate runtime and link libraries... " >&6; } if ${lt_cv_sharedlib_from_linklib_cmd+:} false; then : @@ -6577,7 +6873,6 @@ - if test -n "$ac_tool_prefix"; then for ac_prog in ar do @@ -8496,173 +8791,358 @@ esac } +# On IRIX 5.3, sys/types and inttypes.h are conflicting. +for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \ + inttypes.h stdint.h unistd.h +do : + as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` +ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default +" +if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 +_ACEOF + +fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5 -$as_echo_n "checking for ANSI C header files... 
" >&6; } -if ${ac_cv_header_stdc+:} false; then : +done + + +for ac_header in dlfcn.h +do : + ac_fn_c_check_header_compile "$LINENO" "dlfcn.h" "ac_cv_header_dlfcn_h" "$ac_includes_default +" +if test "x$ac_cv_header_dlfcn_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_DLFCN_H 1 +_ACEOF + +fi + +done + + + + +func_stripname_cnf () +{ + case $2 in + .*) func_stripname_result=`$ECHO "$3" | $SED "s%^$1%%; s%\\\\$2\$%%"`;; + *) func_stripname_result=`$ECHO "$3" | $SED "s%^$1%%; s%$2\$%%"`;; + esac +} # func_stripname_cnf + + + + + +# Set options +enable_win32_dll=yes + +case $host in +*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-cegcc*) + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}as", so it can be a program name with args. +set dummy ${ac_tool_prefix}as; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_AS+:} false; then : $as_echo_n "(cached) " >&6 else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -#include -#include -#include + if test -n "$AS"; then + ac_cv_prog_AS="$AS" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_AS="${ac_tool_prefix}as" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS -int -main () -{ +fi +fi +AS=$ac_cv_prog_AS +if test -n "$AS"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AS" >&5 +$as_echo "$AS" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - ac_cv_header_stdc=yes + +fi +if test -z "$ac_cv_prog_AS"; then + ac_ct_AS=$AS + # Extract the first word of "as", so it can be a program name with args. +set dummy as; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_AS+:} false; then : + $as_echo_n "(cached) " >&6 else - ac_cv_header_stdc=no + if test -n "$ac_ct_AS"; then + ac_cv_prog_ac_ct_AS="$ac_ct_AS" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_AS="as" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_AS=$ac_cv_prog_ac_ct_AS +if test -n "$ac_ct_AS"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_AS" >&5 +$as_echo "$ac_ct_AS" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -if test $ac_cv_header_stdc = yes; then - # SunOS 4.x string.h does not declare mem*, contrary to ANSI. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. 
*/ -#include + if test "x$ac_ct_AS" = x; then + AS="false" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + AS=$ac_ct_AS + fi +else + AS="$ac_cv_prog_AS" +fi -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "memchr" >/dev/null 2>&1; then : + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}dlltool", so it can be a program name with args. +set dummy ${ac_tool_prefix}dlltool; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_DLLTOOL+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$DLLTOOL"; then + ac_cv_prog_DLLTOOL="$DLLTOOL" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_DLLTOOL="${ac_tool_prefix}dlltool" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +DLLTOOL=$ac_cv_prog_DLLTOOL +if test -n "$DLLTOOL"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DLLTOOL" >&5 +$as_echo "$DLLTOOL" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_DLLTOOL"; then + ac_ct_DLLTOOL=$DLLTOOL + # Extract the first word of "dlltool", so it can be a program name with args. +set dummy dlltool; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... 
" >&6; } +if ${ac_cv_prog_ac_ct_DLLTOOL+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_DLLTOOL"; then + ac_cv_prog_ac_ct_DLLTOOL="$ac_ct_DLLTOOL" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_DLLTOOL="dlltool" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_DLLTOOL=$ac_cv_prog_ac_ct_DLLTOOL +if test -n "$ac_ct_DLLTOOL"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_DLLTOOL" >&5 +$as_echo "$ac_ct_DLLTOOL" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_DLLTOOL" = x; then + DLLTOOL="false" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + DLLTOOL=$ac_ct_DLLTOOL + fi +else + DLLTOOL="$ac_cv_prog_DLLTOOL" +fi + + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}objdump", so it can be a program name with args. +set dummy ${ac_tool_prefix}objdump; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_OBJDUMP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$OBJDUMP"; then + ac_cv_prog_OBJDUMP="$OBJDUMP" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_OBJDUMP="${ac_tool_prefix}objdump" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS -else - ac_cv_header_stdc=no fi -rm -f conftest* - fi - -if test $ac_cv_header_stdc = yes; then - # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include - -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "free" >/dev/null 2>&1; then : - +OBJDUMP=$ac_cv_prog_OBJDUMP +if test -n "$OBJDUMP"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $OBJDUMP" >&5 +$as_echo "$OBJDUMP" >&6; } else - ac_cv_header_stdc=no + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } fi -rm -f conftest* -fi -if test $ac_cv_header_stdc = yes; then - # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. - if test "$cross_compiling" = yes; then : - : +fi +if test -z "$ac_cv_prog_OBJDUMP"; then + ac_ct_OBJDUMP=$OBJDUMP + # Extract the first word of "objdump", so it can be a program name with args. +set dummy objdump; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_OBJDUMP+:} false; then : + $as_echo_n "(cached) " >&6 else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -#include -#if ((' ' & 0x0FF) == 0x020) -# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') -# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) -#else -# define ISLOWER(c) \ - (('a' <= (c) && (c) <= 'i') \ - || ('j' <= (c) && (c) <= 'r') \ - || ('s' <= (c) && (c) <= 'z')) -# define TOUPPER(c) (ISLOWER(c) ? 
((c) | 0x40) : (c)) -#endif - -#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) -int -main () -{ - int i; - for (i = 0; i < 256; i++) - if (XOR (islower (i), ISLOWER (i)) - || toupper (i) != TOUPPER (i)) - return 2; - return 0; -} -_ACEOF -if ac_fn_c_try_run "$LINENO"; then : - + if test -n "$ac_ct_OBJDUMP"; then + ac_cv_prog_ac_ct_OBJDUMP="$ac_ct_OBJDUMP" # Let the user override the test. else - ac_cv_header_stdc=no -fi -rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ - conftest.$ac_objext conftest.beam conftest.$ac_ext -fi +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_OBJDUMP="objdump" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5 -$as_echo "$ac_cv_header_stdc" >&6; } -if test $ac_cv_header_stdc = yes; then - -$as_echo "#define STDC_HEADERS 1" >>confdefs.h +ac_ct_OBJDUMP=$ac_cv_prog_ac_ct_OBJDUMP +if test -n "$ac_ct_OBJDUMP"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_OBJDUMP" >&5 +$as_echo "$ac_ct_OBJDUMP" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + if test "x$ac_ct_OBJDUMP" = x; then + OBJDUMP="false" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + OBJDUMP=$ac_ct_OBJDUMP + fi +else + OBJDUMP="$ac_cv_prog_OBJDUMP" fi -# On IRIX 5.3, sys/types and inttypes.h are conflicting. 
-for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \ - inttypes.h stdint.h unistd.h -do : - as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` -ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default -" -if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : - cat >>confdefs.h <<_ACEOF -#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 -_ACEOF + ;; +esac -fi +test -z "$AS" && AS=as -done -for ac_header in dlfcn.h -do : - ac_fn_c_check_header_compile "$LINENO" "dlfcn.h" "ac_cv_header_dlfcn_h" "$ac_includes_default -" -if test "x$ac_cv_header_dlfcn_h" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_DLFCN_H 1 -_ACEOF -fi -done +test -z "$DLLTOOL" && DLLTOOL=dlltool -func_stripname_cnf () -{ - case $2 in - .*) func_stripname_result=`$ECHO "$3" | $SED "s%^$1%%; s%\\\\$2\$%%"`;; - *) func_stripname_result=`$ECHO "$3" | $SED "s%^$1%%; s%$2\$%%"`;; - esac -} # func_stripname_cnf +test -z "$OBJDUMP" && OBJDUMP=objdump -# Set options enable_dlopen=no - enable_win32_dll=no - # Check whether --enable-shared was given. 
if test "${enable_shared+set}" = set; then : @@ -16225,219 +16705,64 @@ hardcode_action_CXX=unsupported fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $hardcode_action_CXX" >&5 -$as_echo "$hardcode_action_CXX" >&6; } - -if test relink = "$hardcode_action_CXX" || - test yes = "$inherit_rpath_CXX"; then - # Fast installation is not supported - enable_fast_install=no -elif test yes = "$shlibpath_overrides_runpath" || - test no = "$enable_shared"; then - # Fast installation is not necessary - enable_fast_install=needless -fi - - - - - - - - fi # test -n "$compiler" - - CC=$lt_save_CC - CFLAGS=$lt_save_CFLAGS - LDCXX=$LD - LD=$lt_save_LD - GCC=$lt_save_GCC - with_gnu_ld=$lt_save_with_gnu_ld - lt_cv_path_LDCXX=$lt_cv_path_LD - lt_cv_path_LD=$lt_save_path_LD - lt_cv_prog_gnu_ldcxx=$lt_cv_prog_gnu_ld - lt_cv_prog_gnu_ld=$lt_save_with_gnu_ld -fi # test yes != "$_lt_caught_CXX_error" - -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu - - - - - - - - - - - - - - - - ac_config_commands="$ac_config_commands libtool" - - - - -# Only expand once: - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ln -s works" >&5 -$as_echo_n "checking whether ln -s works... " >&6; } -LN_S=$as_ln_s -if test "$LN_S" = "ln -s"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no, using $LN_S" >&5 -$as_echo "no, using $LN_S" >&6; } -fi - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} sets \$(MAKE)" >&5 -$as_echo_n "checking whether ${MAKE-make} sets \$(MAKE)... 
" >&6; } -set x ${MAKE-make} -ac_make=`$as_echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'` -if eval \${ac_cv_prog_make_${ac_make}_set+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat >conftest.make <<\_ACEOF -SHELL = /bin/sh -all: - @echo '@@@%%%=$(MAKE)=@@@%%%' -_ACEOF -# GNU make sometimes prints "make[1]: Entering ...", which would confuse us. -case `${MAKE-make} -f conftest.make 2>/dev/null` in - *@@@%%%=?*=@@@%%%*) - eval ac_cv_prog_make_${ac_make}_set=yes;; - *) - eval ac_cv_prog_make_${ac_make}_set=no;; -esac -rm -f conftest.make -fi -if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - SET_MAKE= -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - SET_MAKE="MAKE=${MAKE-make}" +$as_echo "$hardcode_action_CXX" >&6; } + +if test relink = "$hardcode_action_CXX" || + test yes = "$inherit_rpath_CXX"; then + # Fast installation is not supported + enable_fast_install=no +elif test yes = "$shlibpath_overrides_runpath" || + test no = "$enable_shared"; then + # Fast installation is not necessary + enable_fast_install=needless fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5 -$as_echo_n "checking for ANSI C header files... " >&6; } -if ${ac_cv_header_stdc+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -#include -#include -#include -int -main () -{ - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - ac_cv_header_stdc=yes -else - ac_cv_header_stdc=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -if test $ac_cv_header_stdc = yes; then - # SunOS 4.x string.h does not declare mem*, contrary to ANSI. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. 
*/ -#include -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "memchr" >/dev/null 2>&1; then : -else - ac_cv_header_stdc=no -fi -rm -f conftest* -fi + fi # test -n "$compiler" + + CC=$lt_save_CC + CFLAGS=$lt_save_CFLAGS + LDCXX=$LD + LD=$lt_save_LD + GCC=$lt_save_GCC + with_gnu_ld=$lt_save_with_gnu_ld + lt_cv_path_LDCXX=$lt_cv_path_LD + lt_cv_path_LD=$lt_save_path_LD + lt_cv_prog_gnu_ldcxx=$lt_cv_prog_gnu_ld + lt_cv_prog_gnu_ld=$lt_save_with_gnu_ld +fi # test yes != "$_lt_caught_CXX_error" + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + + + -if test $ac_cv_header_stdc = yes; then - # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "free" >/dev/null 2>&1; then : -else - ac_cv_header_stdc=no -fi -rm -f conftest* -fi -if test $ac_cv_header_stdc = yes; then - # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. - if test "$cross_compiling" = yes; then : - : -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -#include -#if ((' ' & 0x0FF) == 0x020) -# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') -# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) -#else -# define ISLOWER(c) \ - (('a' <= (c) && (c) <= 'i') \ - || ('j' <= (c) && (c) <= 'r') \ - || ('s' <= (c) && (c) <= 'z')) -# define TOUPPER(c) (ISLOWER(c) ? 
((c) | 0x40) : (c)) -#endif -#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) -int -main () -{ - int i; - for (i = 0; i < 256; i++) - if (XOR (islower (i), ISLOWER (i)) - || toupper (i) != TOUPPER (i)) - return 2; - return 0; -} -_ACEOF -if ac_fn_c_try_run "$LINENO"; then : -else - ac_cv_header_stdc=no -fi -rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ - conftest.$ac_objext conftest.beam conftest.$ac_ext -fi -fi -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5 -$as_echo "$ac_cv_header_stdc" >&6; } -if test $ac_cv_header_stdc = yes; then -$as_echo "#define STDC_HEADERS 1" >>confdefs.h -fi + + ac_config_commands="$ac_config_commands libtool" + + + + +# Only expand once: @@ -16653,12 +16978,25 @@ fi -for ac_func in strndup +for ac_func in strndup strtok_r +do : + as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` +ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" +if eval test \"x\$"$as_ac_var"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1 +_ACEOF + +fi +done + +for ac_func in aligned_alloc posix_memalign _aligned_malloc do : - ac_fn_c_check_func "$LINENO" "strndup" "ac_cv_func_strndup" -if test "x$ac_cv_func_strndup" = xyes; then : + as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` +ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" +if eval test \"x\$"$as_ac_var"\" = x"yes"; then : cat >>confdefs.h <<_ACEOF -#define HAVE_STRNDUP 1 +#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1 _ACEOF fi @@ -17109,8 +17447,8 @@ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for native Win32" >&5 $as_echo_n "checking for native Win32... " >&6; } -case "$host" in - *-*-mingw*) +case "$host_os" in + *cygwin* | *msys* | *mingw* | *pw32 | *cegcc*) native_win32=yes ;; *) @@ -17128,31 +17466,10 @@ fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Cygwin" >&5 -$as_echo_n "checking for Cygwin... 
" >&6; } -case "$host" in - *-*-cygwin*) - cygwin=yes - ;; - *) - cygwin=no - ;; -esac -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cygwin" >&5 -$as_echo "$cygwin" >&6; } - if test "x$cygwin" = "xyes"; then - OS_CYGWIN_TRUE= - OS_CYGWIN_FALSE='#' -else - OS_CYGWIN_TRUE='#' - OS_CYGWIN_FALSE= -fi - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for 64-bit Apple OSX" >&5 $as_echo_n "checking for 64-bit Apple OSX... " >&6; } -case "$host" in - x86_64-*-darwin*) +case "$host_os" in + darwin*) apple_osx=yes ;; *) @@ -17170,26 +17487,27 @@ fi +HOST_OS="$host_os" + # ==================================================================== CFLAGS="${CFLAGS} -O3" CXXFLAGS="${CXXFLAGS} -O3 -Wall" -# The std=c99/c11 flag provides the proper float-pt math decls working, +# The std=c11 flag provides the proper float-pt math decls working, # e.g. fmaxf However, it also undefined _BSD_SOURCE, etc which is # needed to get fileno, strdup, etc. and so it needs to be manually # enabled again. # Setting -D_POSIX_SOURCE messes up compilation on FreeBSD by # hiding strdup, etc. again. -# CFLAGS="${CFLAGS} -std=c99 -D_BSD_SOURCE -D_SVID_SOURCE -D_POSIX_C_SOURCE -D_GNU_SOURCE" -# Final solution: enable std=c11, explitictly turn on BSD and SVID and +# Final solution: enable std=c11, explicitly turn on BSD and SVID and # GNU, but do NOT turn on POSIX. # if test x${native_win32} = xyes; then - # Vikas says that -std=gnu99 is needed on cygwin/mingw - CFLAGS="-std=gnu99 -D_BSD_SOURCE -D_SVID_SOURCE -D_GNU_SOURCE ${CFLAGS}" + SPAWNV_WORKAROUND=-D_spawnv=spawnv # Fix MSYS2 wrapper compilation problem + CFLAGS="-std=c11 -D_BSD_SOURCE -D_SVID_SOURCE -D_GNU_SOURCE -D_ISOC11_SOURCE ${CFLAGS} ${SPAWNV_WORKAROUND}" CXXFLAGS="-std=c++11 ${CXXFLAGS}" # We need the shlwapi for PathRemoveFileSpecA(). LDFLAGS="${LDFLAGS} -lshlwapi" @@ -17224,7 +17542,7 @@ LDFLAGS="-stdlib=libc++ -lc++ ${LDFLAGS}" else # Else standard Linux/*BSD environment. 
- CFLAGS="-std=c11 -D_BSD_SOURCE -D_SVID_SOURCE -D_GNU_SOURCE ${CFLAGS}" + CFLAGS="-std=c11 -D_BSD_SOURCE -D_SVID_SOURCE -D_GNU_SOURCE -D_ISOC11_SOURCE ${CFLAGS}" CXXFLAGS="-std=c++11 ${CXXFLAGS}" fi fi @@ -17793,7 +18111,10 @@ ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu - CPPFLAGS="$CPPFLAGS_SAVE -DUSE_SAT_SOLVER=1" + CPPFLAGS="$CPPFLAGS_SAVE" + +$as_echo "#define USE_SAT_SOLVER 1" >>confdefs.h + fi @@ -17815,133 +18136,6 @@ # ==================================================================== -# Viterbi disabled by default - -# Check whether --enable-viterbi was given. -if test "${enable_viterbi+set}" = set; then : - enableval=$enable_viterbi; -else - enable_viterbi=no - -fi - - -if test "x$enable_viterbi" = "xyes"; then - CPPFLAGS="${CPPFLAGS} -DUSE_VITERBI" - - # We want to check for C++; the easiest way to do this is to - # use c++ to compile stdio.h and bomb if it fails. - ac_ext=cpp -ac_cpp='$CXXCPP $CPPFLAGS' -ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_cxx_compiler_gnu - - ac_fn_cxx_check_header_mongrel "$LINENO" "stdio.h" "ac_cv_header_stdio_h" "$ac_includes_default" -if test "x$ac_cv_header_stdio_h" = xyes; then : - -else - as_fn_error $? "C++ compiler not found; it is needed for the Viterbi parser" "$LINENO" 5 -fi - - - ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu - - - # The viterbi decoder is designed to leak memory like crazy. - # Must use garbage collection in order for it to work. - -pkg_failed=no -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for LIBGC" >&5 -$as_echo_n "checking for LIBGC... 
" >&6; } - -if test -n "$LIBGC_CFLAGS"; then - pkg_cv_LIBGC_CFLAGS="$LIBGC_CFLAGS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"bdw-gc\""; } >&5 - ($PKG_CONFIG --exists --print-errors "bdw-gc") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_LIBGC_CFLAGS=`$PKG_CONFIG --cflags "bdw-gc" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi -if test -n "$LIBGC_LIBS"; then - pkg_cv_LIBGC_LIBS="$LIBGC_LIBS" - elif test -n "$PKG_CONFIG"; then - if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"bdw-gc\""; } >&5 - ($PKG_CONFIG --exists --print-errors "bdw-gc") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - pkg_cv_LIBGC_LIBS=`$PKG_CONFIG --libs "bdw-gc" 2>/dev/null` - test "x$?" 
!= "x0" && pkg_failed=yes -else - pkg_failed=yes -fi - else - pkg_failed=untried -fi - - - -if test $pkg_failed = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - -if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then - _pkg_short_errors_supported=yes -else - _pkg_short_errors_supported=no -fi - if test $_pkg_short_errors_supported = yes; then - LIBGC_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "bdw-gc" 2>&1` - else - LIBGC_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "bdw-gc" 2>&1` - fi - # Put the nasty error message in config.log where it belongs - echo "$LIBGC_PKG_ERRORS" >&5 - - gc=no -elif test $pkg_failed = untried; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - gc=no -else - LIBGC_CFLAGS=$pkg_cv_LIBGC_CFLAGS - LIBGC_LIBS=$pkg_cv_LIBGC_LIBS - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - gc=yes -fi - - - if test "x${gc}" = "xno"; then - as_fn_error $? "Garbage collector not found; it is needed for the Viterbi parser" "$LINENO" 5 - fi -fi - - if test x${enable_viterbi} = xyes; then - WITH_VITERBI_TRUE= - WITH_VITERBI_FALSE='#' -else - WITH_VITERBI_TRUE='#' - WITH_VITERBI_FALSE= -fi - - -# ==================================================================== # Corpus stats disabled by default # Check whether --enable-corpus-stats was given. 
@@ -17954,7 +18148,9 @@ if test "x$enable_corpus_stats" = "xyes" then - CPPFLAGS="${CPPFLAGS} -DUSE_CORPUS=1" + +$as_echo "#define USE_CORPUS 1" >>confdefs.h + fi if test x${enable_corpus_stats} = xyes; then @@ -17972,13 +18168,15 @@ if test "${enable_wordgraph_display+set}" = set; then : enableval=$enable_wordgraph_display; else - enable_wordgraph_display=no + enable_wordgraph_display=yes fi if test "x$enable_wordgraph_display" = "xyes" then - CPPFLAGS="${CPPFLAGS} -DUSE_WORDGRAPH_DISPLAY=1" + +$as_echo "#define USE_WORDGRAPH_DISPLAY 1" >>confdefs.h + fi if test x${enable_wordgraph_display} = xyes; then @@ -18080,7 +18278,9 @@ # Which is insane, of course, except that some Apple OSX systems # appear to be borken... have header files but no libs ... if test "x${SQLiteFound}" = "xyes"; then - CPPFLAGS="${CPPFLAGS} -DHAVE_SQLITE=1" + +$as_echo "#define HAVE_SQLITE 1" >>confdefs.h + fi if test x${SQLiteFound} = xyes; then HAVE_SQLITE_TRUE= @@ -18143,7 +18343,7 @@ # ==================================================================== # 2015-08: The current default for speller is hunspell, since aspell is not -# yet thread safe. +# yet thread safe. Jan 2018 - aspell still not thread-safe... HunSpellDictDir= HunSpellFound=no @@ -18405,6 +18605,7 @@ $as_echo "#define HAVE_ASPELL 1" >>confdefs.h + ASPELL_LIBS="${ASPELL_LIBS} -lpthread" fi @@ -18634,7 +18835,9 @@ if test "x$enable_regex_tokenizer" = "xyes" then - CPPFLAGS="${CPPFLAGS} -DUSE_REGEX_TOKENIZER=1" + +$as_echo "#define USE_REGEX_TOKENIZER 1" >>confdefs.h + { $as_echo "$as_me:${as_lineno-$LINENO}: checking lib pcre" >&5 $as_echo_n "checking lib pcre... " >&6; } @@ -18779,8 +18982,57 @@ fi # ==================================================================== +# Find a regex library that supports regexec(). +# Check in this order: libtre, libc, libregex. 
+ +ac_fn_c_check_header_mongrel "$LINENO" "tre/regex.h" "ac_cv_header_tre_regex_h" "$ac_includes_default" +if test "x$ac_cv_header_tre_regex_h" = xyes; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for regexec in -ltre" >&5 +$as_echo_n "checking for regexec in -ltre... " >&6; } +if ${ac_cv_lib_tre_regexec+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-ltre $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char regexec (); +int +main () +{ +return regexec (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_tre_regexec=yes +else + ac_cv_lib_tre_regexec=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_tre_regexec" >&5 +$as_echo "$ac_cv_lib_tre_regexec" >&6; } +if test "x$ac_cv_lib_tre_regexec" = xyes; then : + REGEX_LIBS=-ltre +fi + +fi -ac_fn_c_check_header_mongrel "$LINENO" "regex.h" "ac_cv_header_regex_h" "$ac_includes_default" + + +if test -z "$REGEX_LIBS"; then + ac_fn_c_check_header_mongrel "$LINENO" "regex.h" "ac_cv_header_regex_h" "$ac_includes_default" if test "x$ac_cv_header_regex_h" = xyes; then : else @@ -18788,8 +19040,7 @@ fi - -for ac_func in regexec + for ac_func in regexec do : ac_fn_c_check_func "$LINENO" "regexec" "ac_cv_func_regexec" if test "x$ac_cv_func_regexec" = xyes; then : @@ -18843,6 +19094,12 @@ fi done +fi + +if test "x$REGEX_LIBS" = "x-ltre"; then + $as_echo "#define HAVE_TRE_TRE_H 1" >>confdefs.h + +fi # ===================================================================== @@ -18870,149 +19127,54 @@ CFLAGS="${CFLAGS} -arch x86_64" fi - { $as_echo 
"$as_me:${as_lineno-$LINENO}: checking for jni.h" >&5 -$as_echo_n "checking for jni.h... " >&6; } - - # Try looking for jni in the standard locations first. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -_ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes, will build java libs" >&5 -$as_echo "yes, will build java libs" >&6; } - JNIfound=yes -else - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cannot find jni.h in standard location." >&5 -$as_echo "$as_me: WARNING: cannot find jni.h in standard location." >&2;} - JNIfound=no -fi -rm -f conftest.err conftest.i conftest.$ac_ext - - # Didn't find it yet. Try looking in user-specified locations. - if test $JNIfound = "no"; then - JNI_GUESS=" \ - -I $JAVA_HOME/include \ - -I $JAVA_HOME/include/freebsd \ - -I $JAVA_HOME/include/linux \ - -I $JAVA_HOME/include/win32 \ - -I $JAVA_HOME/Headers \ - -I $JDK_HOME/include \ - -I $JDK_HOME/include/freebsd \ - -I $JDK_HOME/include/linux \ - -I $JDK_HOME/include/win32 \ - -I $JDK_HOME/Headers" - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for jni.h in user-specified location" >&5 -$as_echo_n "checking for jni.h in user-specified location... " >&6; } - SAVE_CPPFLAGS=${CPPFLAGS} - CPPFLAGS="${CPPFLAGS} ${JNI_GUESS}" - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -_ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes, will build java libs" >&5 -$as_echo "yes, will build java libs" >&6; } - JNIfound=yes -else - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cannot find jni.h in user-specified location." >&5 -$as_echo "$as_me: WARNING: cannot find jni.h in user-specified location." 
>&2;} - CPPFLAGS=${SAVE_CPPFLAGS} - JNIfound=no -fi -rm -f conftest.err conftest.i conftest.$ac_ext - fi + JAVA_SRCDIR=../../bindings/java + if test "$native_win32" = yes; then + # javac needs a Windows path + JAVA_SRCDIR=`cygpath -w "$JAVA_SRCDIR"` + + # The standard location of Windows Java is under "\Program Files". + # However, automake is unable to handle spaces in a filename even + # when quoted (e.g. "-I '$JAVA_HOME/include'"). As a workaround, + # convert $JAVA_HOME to a short DOS path. + JAVA_HOME=`cygpath --dos --mixed "$JAVA_HOME"` - # Still didn't find it. Try looking in some non-standard locations. - if test $JNIfound = "no"; then - JNI_GUESS=" \ - -I/usr/include/classpath/ \ - -I/usr/lib/jvm/default-java/include \ - -I/usr/lib/jvm/default-java/include/freebsd \ - -I/usr/lib/jvm/default-java/include/linux \ - -I/usr/lib/jvm/java-gcj/include \ - -I/usr/lib/jvm/java-gcj/include/freebsd \ - -I/usr/lib/jvm/java-gcj/include/linux \ - -I/usr/local/jdk1.6.0/include/\ - -I/usr/local/jdk1.6.0/include/freebsd \ - -I/usr/local/jdk1.6.0/include/linux \ - -I/usr/lib/jvm/java-6-sun/include/ \ - -I/usr/lib/jvm/java-6-sun/include/freebsd \ - -I/usr/lib/jvm/java-6-sun/include/linux \ - -I/usr/lib/j2sdk1.6-ibm/include \ - -I/usr/lib/j2sdk1.6-ibm/include/freebsd \ - -I/usr/lib/j2sdk1.6-ibm/include/linux \ - -I/usr/lib/jvm/java-7-openjdk-amd64/include \ - -I/usr/lib/jvm/java-7-openjdk-amd64/include/freebsd \ - -I/usr/lib/jvm/java-7-openjdk-amd64/include/linux \ - -I/usr/lib/jvm/java-8-openjdk-amd64/include \ - -I/usr/lib/jvm/java-8-openjdk-amd64/include/freebsd \ - -I/usr/lib/jvm/java-8-openjdk-amd64/include/linux \ - -I/usr/lib/jvm/java-9-openjdk-amd64/include \ - -I/usr/lib/jvm/java-9-openjdk-amd64/include/freebsd \ - -I/usr/lib/jvm/java-9-openjdk-amd64/include/linux \ - -I/c/java/jdk1.6.0/include/ \ - -I/c/java/jdk1.6.0/include/win32/ \ - -I/Developer/SDKs/MacOSX10.5.sdk/System/Library/Frameworks/JavaVM.framework/Headers/ \ - 
-I/Developer/SDKs/MacOSX10.4u.sdk/System/Library/Frameworks/JavaVM.framework/Headers/ \ - -I /System/Library/Frameworks/JavaVM.framework/Versions/Current/Headers" - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for jni.h in non-standard location" >&5 -$as_echo_n "checking for jni.h in non-standard location... " >&6; } - SAVE_CPPFLAGS=${CPPFLAGS} - CPPFLAGS="${CPPFLAGS} ${JNI_GUESS}" - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -_ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes, will build java libs" >&5 -$as_echo "yes, will build java libs" >&6; } - JAVA_CPPFLAGS=${JNI_GUESS} - JNIfound=yes -else - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cannot find jni.h in anywhere." >&5 -$as_echo "$as_me: WARNING: cannot find jni.h in anywhere." >&2;} - JNIfound=no -fi -rm -f conftest.err conftest.i conftest.$ac_ext - CPPFLAGS=${SAVE_CPPFLAGS} - fi + # win32/jni_md.h:34:9: error: ‘__int64’ does not name a type + JAVA_CPPFLAGS=-D__int64=int64_t + fi - # This allows jni-client.c to be built - if test x${JNIfound} = xyes; then - HAVE_JAVA_TRUE= - HAVE_JAVA_FALSE='#' -else - HAVE_JAVA_TRUE='#' - HAVE_JAVA_FALSE= -fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for jni.h" >&5 +$as_echo_n "checking for jni.h... " >&6; } +JNI_INCLUDE_DIRS="" - # For now, we assume that if the ant tool is there, then the javac and - # jar compiler/packager are installed as well. These are needed to build - # the jar file and install it. - # Extract the first word of "ant", so it can be a program name with args. -set dummy ant; ac_word=$2 +if test "x$JAVA_HOME" != x; then + _JTOPDIR="$JAVA_HOME" +else + if test "x$JAVAC" = x; then + JAVAC=javac + fi + # Extract the first word of "$JAVAC", so it can be a program name with args. +set dummy $JAVAC; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... 
" >&6; } -if ${ac_cv_prog_ANTfound+:} false; then : +if ${ac_cv_path__ACJNI_JAVAC+:} false; then : $as_echo_n "(cached) " >&6 else - if test -n "$ANTfound"; then - ac_cv_prog_ANTfound="$ANTfound" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR + case $_ACJNI_JAVAC in + [\\/]* | ?:[\\/]*) + ac_cv_path__ACJNI_JAVAC="$_ACJNI_JAVAC" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_ANTfound="yes" + ac_cv_path__ACJNI_JAVAC="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -19020,181 +19182,160 @@ done IFS=$as_save_IFS - test -z "$ac_cv_prog_ANTfound" && ac_cv_prog_ANTfound="no" -fi + test -z "$ac_cv_path__ACJNI_JAVAC" && ac_cv_path__ACJNI_JAVAC="no" + ;; +esac fi -ANTfound=$ac_cv_prog_ANTfound -if test -n "$ANTfound"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ANTfound" >&5 -$as_echo "$ANTfound" >&6; } +_ACJNI_JAVAC=$ac_cv_path__ACJNI_JAVAC +if test -n "$_ACJNI_JAVAC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $_ACJNI_JAVAC" >&5 +$as_echo "$_ACJNI_JAVAC" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi - if test x${ANTfound} = xyes; then - HAVE_ANT_TRUE= - HAVE_ANT_FALSE='#' -else - HAVE_ANT_TRUE='#' - HAVE_ANT_FALSE= + if test "x$_ACJNI_JAVAC" = xno; then + as_fn_error $? "cannot find JDK; try setting \$JAVAC or \$JAVA_HOME" "$LINENO" 5 + fi + +# find the include directory relative to the javac executable +_cur=""$_ACJNI_JAVAC"" +while ls -ld "$_cur" 2>/dev/null | grep " -> " >/dev/null; do + { $as_echo "$as_me:${as_lineno-$LINENO}: checking symlink for $_cur" >&5 +$as_echo_n "checking symlink for $_cur... 
" >&6; } + _slink=`ls -ld "$_cur" | sed 's/.* -> //'` + case "$_slink" in + /*) _cur="$_slink";; + # 'X' avoids triggering unwanted echo options. + *) _cur=`echo "X$_cur" | sed -e 's/^X//' -e 's:[^/]*$::'`"$_slink";; + esac + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $_cur" >&5 +$as_echo "$_cur" >&6; } +done +_ACJNI_FOLLOWED="$_cur" + + _JTOPDIR=`echo "$_ACJNI_FOLLOWED" | sed -e 's://*:/:g' -e 's:/[^/]*$::'` fi +case "$host_os" in + darwin*) # Apple JDK is at /System location and has headers symlinked elsewhere + case "$_JTOPDIR" in + /System/Library/Frameworks/JavaVM.framework/*) + _JTOPDIR=`echo "$_JTOPDIR" | sed -e 's:/[^/]*$::'` + _JINC="$_JTOPDIR/Headers";; + *) _JINC="$_JTOPDIR/include";; + esac;; + *) _JINC="$_JTOPDIR/include";; +esac +$as_echo "$as_me:${as_lineno-$LINENO}: _JTOPDIR=$_JTOPDIR" >&5 +$as_echo "$as_me:${as_lineno-$LINENO}: _JINC=$_JINC" >&5 +# On Mac OS X 10.6.4, jni.h is a symlink: +# /System/Library/Frameworks/JavaVM.framework/Versions/Current/Headers/jni.h +# -> ../../CurrentJDK/Headers/jni.h. +as_ac_File=`$as_echo "ac_cv_file_$_JINC/jni.h" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $_JINC/jni.h" >&5 +$as_echo_n "checking for $_JINC/jni.h... " >&6; } +if eval \${$as_ac_File+:} false; then : + $as_echo_n "(cached) " >&6 else - if false; then - HAVE_JAVA_TRUE= - HAVE_JAVA_FALSE='#' + test "$cross_compiling" = yes && + as_fn_error $? 
"cannot check for file existence when cross compiling" "$LINENO" 5 +if test -r "$_JINC/jni.h"; then + eval "$as_ac_File=yes" else - HAVE_JAVA_TRUE='#' - HAVE_JAVA_FALSE= + eval "$as_ac_File=no" fi - - if false; then - HAVE_ANT_TRUE= - HAVE_ANT_FALSE='#' +fi +eval ac_res=\$$as_ac_File + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_File"\" = x"yes"; then : + JNI_INCLUDE_DIRS="$JNI_INCLUDE_DIRS $_JINC" else - HAVE_ANT_TRUE='#' - HAVE_ANT_FALSE= + _JTOPDIR=`echo "$_JTOPDIR" | sed -e 's:/[^/]*$::'` + as_ac_File=`$as_echo "ac_cv_file_$_JTOPDIR/include/jni.h" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $_JTOPDIR/include/jni.h" >&5 +$as_echo_n "checking for $_JTOPDIR/include/jni.h... " >&6; } +if eval \${$as_ac_File+:} false; then : + $as_echo_n "(cached) " >&6 +else + test "$cross_compiling" = yes && + as_fn_error $? "cannot check for file existence when cross compiling" "$LINENO" 5 +if test -r "$_JTOPDIR/include/jni.h"; then + eval "$as_ac_File=yes" +else + eval "$as_ac_File=no" +fi +fi +eval ac_res=\$$as_ac_File + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_ac_File"\" = x"yes"; then : + JNI_INCLUDE_DIRS="$JNI_INCLUDE_DIRS $_JTOPDIR/include" +else + as_fn_error $? "cannot find JDK header files" "$LINENO" 5 fi + fi -# =================================================================== -# swig is needed for compiling the perl and python bindings ... -# ... well, actually, no, its not. 'make dist' is currently set up to -# package all of the files generated by swig, so the user does not need -# to actually install it. However, swig is needed to create the package, -# and also needed to build from a github pull. Hmmm. What to do ... 
+# get the likely subdirectories for system specific java includes +case "$host_os" in +bsdi*) _JNI_INC_SUBDIRS="bsdos";; +freebsd*) _JNI_INC_SUBDIRS="freebsd";; +darwin*) _JNI_INC_SUBDIRS="darwin";; +linux*) _JNI_INC_SUBDIRS="linux genunix";; +osf*) _JNI_INC_SUBDIRS="alpha";; +solaris*) _JNI_INC_SUBDIRS="solaris";; +mingw*) _JNI_INC_SUBDIRS="win32";; +cygwin*) _JNI_INC_SUBDIRS="win32";; +*) _JNI_INC_SUBDIRS="genunix";; +esac - # Ubuntu has swig 2.0 as /usr/bin/swig2.0 - for ac_prog in swig swig2.0 +# add any subdirectories that are present +for JINCSUBDIR in $_JNI_INC_SUBDIRS do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_SWIG+:} false; then : - $as_echo_n "(cached) " >&6 -else - case $SWIG in - [\\/]* | ?:[\\/]*) - ac_cv_path_SWIG="$SWIG" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_SWIG="$as_dir/$ac_word$ac_exec_ext" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi + if test -d "$_JTOPDIR/include/$JINCSUBDIR"; then + JNI_INCLUDE_DIRS="$JNI_INCLUDE_DIRS $_JTOPDIR/include/$JINCSUBDIR" + fi done - done -IFS=$as_save_IFS - - ;; -esac -fi -SWIG=$ac_cv_path_SWIG -if test -n "$SWIG"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SWIG" >&5 -$as_echo "$SWIG" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi + if test -n "$JNI_INCLUDE_DIRS"; then + JNIfound=yes + fi - test -n "$SWIG" && break -done + for JNI_INCLUDE_DIR in $JNI_INCLUDE_DIRS + do + JAVA_CPPFLAGS="$JAVA_CPPFLAGS -I$JNI_INCLUDE_DIR" + done - if test -z "$SWIG" ; then - SwigF=no - elif test -n "2.0.0" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking SWIG version" >&5 -$as_echo_n "checking SWIG version... 
" >&6; } - swig_version=`$SWIG -version 2>&1 | grep 'SWIG Version' | sed 's/.*\([0-9][0-9]*\.[0-9][0-9]*\.[0-9][0-9]*\).*/\1/g'` - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $swig_version" >&5 -$as_echo "$swig_version" >&6; } - if test -n "$swig_version" ; then - # Calculate the required version number components - required=2.0.0 - required_major=`echo $required | sed 's/[^0-9].*//'` - if test -z "$required_major" ; then - required_major=0 - fi - required=`echo $required | sed 's/[0-9]*[^0-9]//'` - required_minor=`echo $required | sed 's/[^0-9].*//'` - if test -z "$required_minor" ; then - required_minor=0 - fi - required=`echo $required | sed 's/[0-9]*[^0-9]//'` - required_patch=`echo $required | sed 's/[^0-9].*//'` - if test -z "$required_patch" ; then - required_patch=0 - fi - # Calculate the available version number components - available=$swig_version - available_major=`echo $available | sed 's/[^0-9].*//'` - if test -z "$available_major" ; then - available_major=0 - fi - available=`echo $available | sed 's/[0-9]*[^0-9]//'` - available_minor=`echo $available | sed 's/[^0-9].*//'` - if test -z "$available_minor" ; then - available_minor=0 - fi - available=`echo $available | sed 's/[0-9]*[^0-9]//'` - available_patch=`echo $available | sed 's/[^0-9].*//'` - if test -z "$available_patch" ; then - available_patch=0 - fi - # Convert the version tuple into a single number for easier comparison. - # Using base 100 should be safe since SWIG internally uses BCD values - # to encode its version number. - required_swig_vernum=`expr $required_major \* 10000 \ - \+ $required_minor \* 100 \+ $required_patch` - available_swig_vernum=`expr $available_major \* 10000 \ - \+ $available_minor \* 100 \+ $available_patch` - if test $available_swig_vernum -lt $required_swig_vernum; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: SWIG version >= 2.0.0 is required. You have $swig_version." >&5 -$as_echo "$as_me: WARNING: SWIG version >= 2.0.0 is required. 
You have $swig_version." >&2;} - SWIG='' - SwigF=no - else - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for SWIG library" >&5 -$as_echo_n "checking for SWIG library... " >&6; } - SWIG_LIB=`$SWIG -swiglib` - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SWIG_LIB" >&5 -$as_echo "$SWIG_LIB" >&6; } - SwigF=yes - fi - else - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cannot determine SWIG version" >&5 -$as_echo "$as_me: WARNING: cannot determine SWIG version" >&2;} - SWIG='' - SwigF=no - fi - fi + # This allows jni-client.c to be built + if test x${JNIfound} = xyes; then + HAVE_JAVA_TRUE= + HAVE_JAVA_FALSE='#' +else + HAVE_JAVA_TRUE='#' + HAVE_JAVA_FALSE= +fi -# Extract the first word of "swig", so it can be a program name with args. -set dummy swig; ac_word=$2 + # For now, we assume that if the ant tool is there, then the javac and + # jar compiler/packager are installed as well. These are needed to build + # the jar file and install it. + # Extract the first word of "ant", so it can be a program name with args. +set dummy ant; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_SWIGfound+:} false; then : +if ${ac_cv_prog_ANTfound+:} false; then : $as_echo_n "(cached) " >&6 else - if test -n "$SWIGfound"; then - ac_cv_prog_SWIGfound="$SWIGfound" # Let the user override the test. + if test -n "$ANTfound"; then + ac_cv_prog_ANTfound="$ANTfound" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH @@ -19203,7 +19344,7 @@ test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_SWIGfound="yes" + ac_cv_prog_ANTfound="yes" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -19211,34 +19352,46 @@ done IFS=$as_save_IFS - test -z "$ac_cv_prog_SWIGfound" && ac_cv_prog_SWIGfound="no" + test -z "$ac_cv_prog_ANTfound" && ac_cv_prog_ANTfound="no" fi fi -SWIGfound=$ac_cv_prog_SWIGfound -if test -n "$SWIGfound"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SWIGfound" >&5 -$as_echo "$SWIGfound" >&6; } +ANTfound=$ac_cv_prog_ANTfound +if test -n "$ANTfound"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ANTfound" >&5 +$as_echo "$ANTfound" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi -if test "x$SwigFound" = "xyes"; then - - + if test x${ANTfound} = xyes; then + HAVE_ANT_TRUE= + HAVE_ANT_FALSE='#' +else + HAVE_ANT_TRUE='#' + HAVE_ANT_FALSE= +fi - SWIG="$SWIG -c++" +else + if false; then + HAVE_JAVA_TRUE= + HAVE_JAVA_FALSE='#' +else + HAVE_JAVA_TRUE='#' + HAVE_JAVA_FALSE= fi - if test x${SWIGfound} = xyes; then - HAVE_SWIG_TRUE= - HAVE_SWIG_FALSE='#' + + if false; then + HAVE_ANT_TRUE= + HAVE_ANT_FALSE='#' else - HAVE_SWIG_TRUE='#' - HAVE_SWIG_FALSE= + HAVE_ANT_TRUE='#' + HAVE_ANT_FALSE= fi +fi # =================================================================== @@ -20748,6 +20901,163 @@ fi fi +# =================================================================== +# swig is needed for compiling the Perl and Python bindings ... +# ... well, actually, no, its not. 'make dist' is currently set up to +# package all of the files generated by swig, so the user does not need +# to actually install it. However, swig is needed to create the package, +# and also needed to build from a GitHub pull. + +if ! 
test -s "$srcdir/bindings/python/lg_python_wrap.cc"; then + if test -z "$HAVE_PYTHON2_TRUE"; then : + swig_required=Python2 +fi +fi +if ! test -s "$srcdir/bindings/python3/lg_python_wrap.cc"; then + if test -z "$HAVE_PYTHON3_TRUE"; then : + swig_required="$swig_required Python3" +fi +fi +if ! test -s "$srcdir/bindings/perl/lg_perl_wrap.cc"; then + if test -z "$HAVE_PERL_TRUE"; then : + swig_required="$swig_required Perl" +fi +fi + +if test -n "$swig_required"; then + + # Ubuntu has swig 2.0 as /usr/bin/swig2.0 + for ac_prog in swig swig2.0 +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_SWIG+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $SWIG in + [\\/]* | ?:[\\/]*) + ac_cv_path_SWIG="$SWIG" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_SWIG="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +SWIG=$ac_cv_path_SWIG +if test -n "$SWIG"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SWIG" >&5 +$as_echo "$SWIG" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$SWIG" && break +done + + if test -z "$SWIG" ; then + as_fn_error $? "'swig' is required to create bindings for: $swig_required" "$LINENO" 5 + elif test -n "2.0.0" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking SWIG version" >&5 +$as_echo_n "checking SWIG version... 
" >&6; } + swig_version=`$SWIG -version 2>&1 | grep 'SWIG Version' | sed 's/.*\([0-9][0-9]*\.[0-9][0-9]*\.[0-9][0-9]*\).*/\1/g'` + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $swig_version" >&5 +$as_echo "$swig_version" >&6; } + if test -n "$swig_version" ; then + # Calculate the required version number components + required=2.0.0 + required_major=`echo $required | sed 's/[^0-9].*//'` + if test -z "$required_major" ; then + required_major=0 + fi + required=`echo $required | sed 's/[0-9]*[^0-9]//'` + required_minor=`echo $required | sed 's/[^0-9].*//'` + if test -z "$required_minor" ; then + required_minor=0 + fi + required=`echo $required | sed 's/[0-9]*[^0-9]//'` + required_patch=`echo $required | sed 's/[^0-9].*//'` + if test -z "$required_patch" ; then + required_patch=0 + fi + # Calculate the available version number components + available=$swig_version + available_major=`echo $available | sed 's/[^0-9].*//'` + if test -z "$available_major" ; then + available_major=0 + fi + available=`echo $available | sed 's/[0-9]*[^0-9]//'` + available_minor=`echo $available | sed 's/[^0-9].*//'` + if test -z "$available_minor" ; then + available_minor=0 + fi + available=`echo $available | sed 's/[0-9]*[^0-9]//'` + available_patch=`echo $available | sed 's/[^0-9].*//'` + if test -z "$available_patch" ; then + available_patch=0 + fi + # Convert the version tuple into a single number for easier comparison. + # Using base 100 should be safe since SWIG internally uses BCD values + # to encode its version number. + required_swig_vernum=`expr $required_major \* 10000 \ + \+ $required_minor \* 100 \+ $required_patch` + available_swig_vernum=`expr $available_major \* 10000 \ + \+ $available_minor \* 100 \+ $available_patch` + + if test $available_swig_vernum -lt $required_swig_vernum; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: SWIG version >= 2.0.0 is required. You have $swig_version." >&5 +$as_echo "$as_me: WARNING: SWIG version >= 2.0.0 is required. 
You have $swig_version." >&2;} + SWIG='' + as_fn_error $? "'swig' is required to create bindings for: $swig_required" "$LINENO" 5 + else + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for SWIG library" >&5 +$as_echo_n "checking for SWIG library... " >&6; } + SWIG_LIB=`$SWIG -swiglib` + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SWIG_LIB" >&5 +$as_echo "$SWIG_LIB" >&6; } + + fi + else + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cannot determine SWIG version" >&5 +$as_echo "$as_me: WARNING: cannot determine SWIG version" >&2;} + SWIG='' + as_fn_error $? "'swig' is required to create bindings for: $swig_required" "$LINENO" 5 + fi + fi + + + + + + SWIG="$SWIG -c++" + +fi + if test -n "$SWIG"; then + HAVE_SWIG_TRUE= + HAVE_SWIG_FALSE='#' +else + HAVE_SWIG_TRUE='#' + HAVE_SWIG_FALSE= +fi + + # ==================================================================== # check compiler flags @@ -20759,7 +21069,7 @@ -Wsign-compare -Werror-implicit-function-declaration \ -Wpointer-arith -Wwrite-strings -Wmissing-declarations \ -Wpacked -Wswitch-enum -Wmissing-format-attribute \ --Wstrict-aliasing -Winit-self \ +-Wstrict-aliasing -Winit-self -Wshadow \ -Wno-missing-field-initializers -Wno-unused-parameter \ -Wno-attributes -Wno-long-long -Winline" @@ -20933,7 +21243,47 @@ -ac_config_files="$ac_config_files Makefile link-grammar.pc link-grammar.spec link-grammar/Makefile link-grammar/corpus/Makefile link-grammar/minisat/Makefile link-grammar/sat-solver/Makefile link-grammar/link-features.h bindings/Makefile bindings/java-jni/Makefile bindings/java/Makefile bindings/java/build.xml bindings/ocaml/Makefile bindings/perl/Makefile bindings/python/Makefile bindings/python/__init__.py bindings/python3/Makefile bindings/python3/__init__.py bindings/python-examples/Makefile morphology/Makefile link-parser/Makefile tests/Makefile data/Makefile data/ady/Makefile data/amy/Makefile data/any/Makefile data/ar/Makefile data/ar/words/Makefile data/de/Makefile data/en/Makefile 
data/en/words/Makefile data/fa/Makefile data/fa/words/Makefile data/he/Makefile data/id/Makefile data/kz/Makefile data/lt/Makefile data/ru/Makefile data/ru/words/Makefile data/tr/Makefile data/vn/Makefile data/demo-sql/Makefile man/Makefile viterbi/Makefile" +# Transform confdefs.h into DEFS. +# Protect against shell expansion while executing Makefile rules. +# Protect against Makefile macro expansion. +# +# If the first sed substitution is executed (which looks for macros that +# take arguments), then branch to the quote section. Otherwise, +# look for a macro that doesn't take arguments. +ac_script=' +:mline +/\\$/{ + N + s,\\\n,, + b mline +} +t clear +:clear +s/^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\)/-D\1=\2/g +t quote +s/^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)/-D\1=\2/g +t quote +b any +:quote +s/[ `~#$^&*(){}\\|;'\''"<>?]/\\&/g +s/\[/\\&/g +s/\]/\\&/g +s/\$/$$/g +H +:any +${ + g + s/^\n// + s/\n/ /g + p +} +' +DEFS=`sed -n "$ac_script" confdefs.h` + +LG_DEFS=`$ECHO "$DEFS" | $SED 's/\\\\//g'` + + +ac_config_files="$ac_config_files Makefile link-grammar.pc link-grammar.spec link-grammar/Makefile link-grammar/corpus/Makefile link-grammar/minisat/Makefile link-grammar/sat-solver/Makefile link-grammar/link-features.h bindings/Makefile bindings/java-jni/Makefile bindings/java/Makefile bindings/java/build.xml bindings/ocaml/Makefile bindings/perl/Makefile bindings/python/Makefile bindings/python/__init__.py bindings/python3/Makefile bindings/python3/__init__.py bindings/python-examples/Makefile morphology/Makefile link-parser/Makefile tests/Makefile data/Makefile data/ady/Makefile data/amy/Makefile data/any/Makefile data/ar/Makefile data/ar/words/Makefile data/de/Makefile data/en/Makefile data/en/words/Makefile data/fa/Makefile data/fa/words/Makefile data/he/Makefile data/id/Makefile data/kz/Makefile data/lt/Makefile data/ru/Makefile data/ru/words/Makefile data/tr/Makefile data/vn/Makefile data/demo-sql/Makefile man/Makefile" cat >confcache 
<<\_ACEOF # This file is a shell script that caches the results of configure @@ -21116,10 +21466,6 @@ as_fn_error $? "conditional \"OS_WIN32\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${OS_CYGWIN_TRUE}" && test -z "${OS_CYGWIN_FALSE}"; then - as_fn_error $? "conditional \"OS_CYGWIN\" was never defined. -Usually this means the macro was only invoked conditionally." "$LINENO" 5 -fi if test -z "${OS_X_TRUE}" && test -z "${OS_X_FALSE}"; then as_fn_error $? "conditional \"OS_X\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 @@ -21132,10 +21478,6 @@ as_fn_error $? "conditional \"WITH_SAT_SOLVER\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${WITH_VITERBI_TRUE}" && test -z "${WITH_VITERBI_FALSE}"; then - as_fn_error $? "conditional \"WITH_VITERBI\" was never defined. -Usually this means the macro was only invoked conditionally." "$LINENO" 5 -fi if test -z "${WITH_CORPUS_TRUE}" && test -z "${WITH_CORPUS_FALSE}"; then as_fn_error $? "conditional \"WITH_CORPUS\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 @@ -21208,10 +21550,6 @@ as_fn_error $? "conditional \"HAVE_ANT\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${HAVE_SWIG_TRUE}" && test -z "${HAVE_SWIG_FALSE}"; then - as_fn_error $? "conditional \"HAVE_SWIG\" was never defined. -Usually this means the macro was only invoked conditionally." "$LINENO" 5 -fi if test -z "${HAVE_PERL_TRUE}" && test -z "${HAVE_PERL_FALSE}"; then as_fn_error $? "conditional \"HAVE_PERL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 @@ -21228,6 +21566,10 @@ as_fn_error $? "conditional \"HAVE_PYTHON3\" was never defined. Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 fi +if test -z "${HAVE_SWIG_TRUE}" && test -z "${HAVE_SWIG_FALSE}"; then + as_fn_error $? "conditional \"HAVE_SWIG\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi : "${CONFIG_STATUS=./config.status}" ac_write_fail=0 @@ -21625,7 +21967,7 @@ # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by link-grammar $as_me 5.3.16, which was +This file was extended by link-grammar $as_me 5.5.0, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -21682,7 +22024,7 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -link-grammar config.status 5.3.16 +link-grammar config.status 5.5.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" @@ -21802,6 +22144,9 @@ delay_variable_subst='$delay_variable_subst' macro_version='`$ECHO "$macro_version" | $SED "$delay_single_quote_subst"`' macro_revision='`$ECHO "$macro_revision" | $SED "$delay_single_quote_subst"`' +AS='`$ECHO "$AS" | $SED "$delay_single_quote_subst"`' +DLLTOOL='`$ECHO "$DLLTOOL" | $SED "$delay_single_quote_subst"`' +OBJDUMP='`$ECHO "$OBJDUMP" | $SED "$delay_single_quote_subst"`' enable_shared='`$ECHO "$enable_shared" | $SED "$delay_single_quote_subst"`' enable_static='`$ECHO "$enable_static" | $SED "$delay_single_quote_subst"`' pic_mode='`$ECHO "$pic_mode" | $SED "$delay_single_quote_subst"`' @@ -21834,12 +22179,10 @@ lt_cv_to_tool_file_cmd='`$ECHO "$lt_cv_to_tool_file_cmd" | $SED "$delay_single_quote_subst"`' reload_flag='`$ECHO "$reload_flag" | $SED "$delay_single_quote_subst"`' reload_cmds='`$ECHO "$reload_cmds" | $SED "$delay_single_quote_subst"`' -OBJDUMP='`$ECHO "$OBJDUMP" | $SED "$delay_single_quote_subst"`' deplibs_check_method='`$ECHO "$deplibs_check_method" | $SED 
"$delay_single_quote_subst"`' file_magic_cmd='`$ECHO "$file_magic_cmd" | $SED "$delay_single_quote_subst"`' file_magic_glob='`$ECHO "$file_magic_glob" | $SED "$delay_single_quote_subst"`' want_nocaseglob='`$ECHO "$want_nocaseglob" | $SED "$delay_single_quote_subst"`' -DLLTOOL='`$ECHO "$DLLTOOL" | $SED "$delay_single_quote_subst"`' sharedlib_from_linklib_cmd='`$ECHO "$sharedlib_from_linklib_cmd" | $SED "$delay_single_quote_subst"`' AR='`$ECHO "$AR" | $SED "$delay_single_quote_subst"`' AR_FLAGS='`$ECHO "$AR_FLAGS" | $SED "$delay_single_quote_subst"`' @@ -22003,7 +22346,10 @@ } # Quote evaled strings. -for var in SHELL \ +for var in AS \ +DLLTOOL \ +OBJDUMP \ +SHELL \ ECHO \ PATH_SEPARATOR \ SED \ @@ -22016,12 +22362,10 @@ lt_SP2NL \ lt_NL2SP \ reload_flag \ -OBJDUMP \ deplibs_check_method \ file_magic_cmd \ file_magic_glob \ want_nocaseglob \ -DLLTOOL \ sharedlib_from_linklib_cmd \ AR \ AR_FLAGS \ @@ -22226,7 +22570,6 @@ "data/vn/Makefile") CONFIG_FILES="$CONFIG_FILES data/vn/Makefile" ;; "data/demo-sql/Makefile") CONFIG_FILES="$CONFIG_FILES data/demo-sql/Makefile" ;; "man/Makefile") CONFIG_FILES="$CONFIG_FILES man/Makefile" ;; - "viterbi/Makefile") CONFIG_FILES="$CONFIG_FILES viterbi/Makefile" ;; *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;; esac @@ -22804,6 +23147,15 @@ macro_version=$macro_version macro_revision=$macro_revision +# Assembler program. +AS=$lt_AS + +# DLL creation program. +DLLTOOL=$lt_DLLTOOL + +# Object dumper program. +OBJDUMP=$lt_OBJDUMP + # Whether or not to build shared libraries. build_libtool_libs=$enable_shared @@ -22883,9 +23235,6 @@ # convert \$build files to toolchain format. to_tool_file_cmd=$lt_cv_to_tool_file_cmd -# An object symbol dumper. -OBJDUMP=$lt_OBJDUMP - # Method to check whether dependent libraries are shared objects. deplibs_check_method=$lt_deplibs_check_method @@ -22898,9 +23247,6 @@ # Find potential files using nocaseglob when deplibs_check_method = "file_magic". 
want_nocaseglob=$lt_want_nocaseglob -# DLL creation program. -DLLTOOL=$lt_DLLTOOL - # Command to associate shared and link libraries. sharedlib_from_linklib_cmd=$lt_sharedlib_from_linklib_cmd @@ -23504,6 +23850,7 @@ prefix: ${prefix} C compiler: ${CC} ${CPPFLAGS} ${CFLAGS} C++ compiler: ${CXX} ${CPPFLAGS} ${CXXFLAGS} + Regex library: ${REGEX_LIBS} Error handler per-thread: ${error_handler_per_thread} Editline command-line history: ${edlin} UTF8 editline support: ${wedlin} diff -Nru link-grammar-5.3.16/configure.ac link-grammar-5.5.0/configure.ac --- link-grammar-5.3.16/configure.ac 2017-03-14 05:43:05.000000000 +0000 +++ link-grammar-5.5.0/configure.ac 2018-04-25 17:04:50.000000000 +0000 @@ -1,4 +1,4 @@ -AC_INIT([link-grammar],[5.3.16],[link-grammar@googlegroups.com]) +AC_INIT([link-grammar],[5.5.0],[link-grammar@googlegroups.com]) dnl Set release number dnl This is derived from "Versioning" chapter of info libtool documentation. @@ -7,11 +7,11 @@ LINK_MAJOR_VERSION=5 dnl 4a) 5) Increment when adding interfaces. dnl 6) Set to zero when removing or changing interfaces. -LINK_MINOR_VERSION=3 +LINK_MINOR_VERSION=5 dnl 3) Increment when interfaces not changed at all, dnl only bug fixes or internal changes made. dnl 4b) Set to zero when adding, removing or changing interfaces. 
-LINK_MICRO_VERSION=16 +LINK_MICRO_VERSION=0 dnl dnl Set this too MAJOR_VERSION_PLUS_MINOR_VERSION=`expr $LINK_MAJOR_VERSION + $LINK_MINOR_VERSION` @@ -41,8 +41,9 @@ AC_PROG_CC AC_PROG_CXX AC_PROG_CPP +AM_PROG_LEX AC_PROG_INSTALL -AC_PROG_LIBTOOL +#AC_PROG_LIBTOOL AC_PROG_LN_S AC_PROG_MAKE_SET AC_HEADER_STDC @@ -56,7 +57,8 @@ AC_C_CONST -AC_CHECK_FUNCS(strndup) +AC_CHECK_FUNCS(strndup strtok_r) +AC_CHECK_FUNCS(aligned_alloc posix_memalign _aligned_malloc) AC_FUNC_ALLOCA @@ -73,8 +75,8 @@ # ==================================================================== AC_MSG_CHECKING([for native Win32]) -case "$host" in - *-*-mingw*) +case "$host_os" in + *cygwin* | *msys* | *mingw* | *pw32 | *cegcc*) native_win32=yes ;; *) @@ -84,21 +86,9 @@ AC_MSG_RESULT([$native_win32]) AM_CONDITIONAL(OS_WIN32, test "x$native_win32" = "xyes") -AC_MSG_CHECKING([for Cygwin]) -case "$host" in - *-*-cygwin*) - cygwin=yes - ;; - *) - cygwin=no - ;; -esac -AC_MSG_RESULT([$cygwin]) -AM_CONDITIONAL(OS_CYGWIN, test "x$cygwin" = "xyes") - AC_MSG_CHECKING([for 64-bit Apple OSX]) -case "$host" in - x86_64-*-darwin*) +case "$host_os" in + darwin*) apple_osx=yes ;; *) @@ -108,26 +98,27 @@ AC_MSG_RESULT([$apple_osx]) AM_CONDITIONAL(OS_X, test "x$apple_osx" = "xyes") +HOST_OS="$host_os" +AC_SUBST(HOST_OS) # ==================================================================== CFLAGS="${CFLAGS} -O3" CXXFLAGS="${CXXFLAGS} -O3 -Wall" -# The std=c99/c11 flag provides the proper float-pt math decls working, +# The std=c11 flag provides the proper float-pt math decls working, # e.g. fmaxf However, it also undefined _BSD_SOURCE, etc which is # needed to get fileno, strdup, etc. and so it needs to be manually # enabled again. # Setting -D_POSIX_SOURCE messes up compilation on FreeBSD by # hiding strdup, etc. again. 
-# CFLAGS="${CFLAGS} -std=c99 -D_BSD_SOURCE -D_SVID_SOURCE -D_POSIX_C_SOURCE -D_GNU_SOURCE" -# Final solution: enable std=c11, explitictly turn on BSD and SVID and +# Final solution: enable std=c11, explicitly turn on BSD and SVID and # GNU, but do NOT turn on POSIX. # if test x${native_win32} = xyes; then - # Vikas says that -std=gnu99 is needed on cygwin/mingw - CFLAGS="-std=gnu99 -D_BSD_SOURCE -D_SVID_SOURCE -D_GNU_SOURCE ${CFLAGS}" + SPAWNV_WORKAROUND=-D_spawnv=spawnv # Fix MSYS2 wrapper compilation problem + CFLAGS="-std=c11 -D_BSD_SOURCE -D_SVID_SOURCE -D_GNU_SOURCE -D_ISOC11_SOURCE ${CFLAGS} ${SPAWNV_WORKAROUND}" CXXFLAGS="-std=c++11 ${CXXFLAGS}" # We need the shlwapi for PathRemoveFileSpecA(). LDFLAGS="${LDFLAGS} -lshlwapi" @@ -158,7 +149,7 @@ LDFLAGS="-stdlib=libc++ -lc++ ${LDFLAGS}" else # Else standard Linux/*BSD environment. - CFLAGS="-std=c11 -D_BSD_SOURCE -D_SVID_SOURCE -D_GNU_SOURCE ${CFLAGS}" + CFLAGS="-std=c11 -D_BSD_SOURCE -D_SVID_SOURCE -D_GNU_SOURCE -D_ISOC11_SOURCE ${CFLAGS}" CXXFLAGS="-std=c++11 ${CXXFLAGS}" fi fi @@ -387,7 +378,8 @@ fi AC_LANG([C]) - CPPFLAGS="$CPPFLAGS_SAVE -DUSE_SAT_SOLVER=1" + CPPFLAGS="$CPPFLAGS_SAVE" + AC_DEFINE(USE_SAT_SOLVER, 1, [Define for compilation]) AC_SUBST(ZLIB_CPPFLAGS) fi @@ -395,37 +387,6 @@ AM_CONDITIONAL(WITH_SAT_SOLVER, test "x$enable_sat_solver" = xyes) # ==================================================================== -# Viterbi disabled by default - -AC_ARG_ENABLE( viterbi, - [ --enable-viterbi enable use of the Viterbi parser], - [], - [enable_viterbi=no] -) - -if test "x$enable_viterbi" = "xyes"; then - CPPFLAGS="${CPPFLAGS} -DUSE_VITERBI" - - # We want to check for C++; the easiest way to do this is to - # use c++ to compile stdio.h and bomb if it fails. - AC_LANG([C++]) - AC_CHECK_HEADER([stdio.h],, - [AC_MSG_ERROR([C++ compiler not found; it is needed for the Viterbi parser])]) - AC_LANG([C]) - - # The viterbi decoder is designed to leak memory like crazy. 
- # Must use garbage collection in order for it to work. - PKG_CHECK_MODULES([LIBGC], [bdw-gc], [gc=yes], [gc=no]) - AC_SUBST(LIBGC_LIBS) - AC_SUBST(LIBGC_CFLAGS) - if test "x${gc}" = "xno"; then - AC_MSG_ERROR([Garbage collector not found; it is needed for the Viterbi parser]) - fi -fi - -AM_CONDITIONAL(WITH_VITERBI, test x${enable_viterbi} = xyes) - -# ==================================================================== # Corpus stats disabled by default AC_ARG_ENABLE( corpus-stats, @@ -435,7 +396,7 @@ ) if test "x$enable_corpus_stats" = "xyes" then - CPPFLAGS="${CPPFLAGS} -DUSE_CORPUS=1" + AC_DEFINE(USE_CORPUS, 1, [Define for compilation]) fi AM_CONDITIONAL(WITH_CORPUS, test x${enable_corpus_stats} = xyes) @@ -445,11 +406,11 @@ AC_ARG_ENABLE( wordgraph_display, [ --enable-wordgraph-display enable graphical display of the Wordgraph], [], - [enable_wordgraph_display=no] + [enable_wordgraph_display=yes] ) if test "x$enable_wordgraph_display" = "xyes" then - CPPFLAGS="${CPPFLAGS} -DUSE_WORDGRAPH_DISPLAY=1" + AC_DEFINE(USE_WORDGRAPH_DISPLAY, 1, [Define for compilation]) fi AM_CONDITIONAL(WITH_ANYSPLIT, test x${enable_wordgraph_display} = xyes) @@ -468,7 +429,7 @@ # Which is insane, of course, except that some Apple OSX systems # appear to be borken... have header files but no libs ... if test "x${SQLiteFound}" = "xyes"; then - CPPFLAGS="${CPPFLAGS} -DHAVE_SQLITE=1" + AC_DEFINE(HAVE_SQLITE, 1, [Define for compilation]) fi AM_CONDITIONAL(HAVE_SQLITE, test x${SQLiteFound} = xyes) AC_SUBST(SQLITE3_LIBS) @@ -501,7 +462,7 @@ # ==================================================================== # 2015-08: The current default for speller is hunspell, since aspell is not -# yet thread safe. +# yet thread safe. Jan 2018 - aspell still not thread-safe... 
HunSpellDictDir= HunSpellFound=no @@ -548,6 +509,7 @@ CPPFLAGS=$save_cpp_flags if test "x${ASpellFound}" = "xyes"; then AC_DEFINE(HAVE_ASPELL, 1, [Define for compilation]) + ASPELL_LIBS="${ASPELL_LIBS} -lpthread" AC_SUBST(ASPELL_LIBS) AC_SUBST(ASPELL_CFLAGS) fi @@ -594,23 +556,32 @@ # Tokenizing by RegEx AC_ARG_ENABLE( regex_tokenizer, - [ --enable-regex-tokenizer enable use of regex word splitter], + [ --enable-regex-tokenizer enable use of regex word splitter (experimental)], [], [enable_regex_tokenizer=no] ) if test "x$enable_regex_tokenizer" = "xyes" then - CPPFLAGS="${CPPFLAGS} -DUSE_REGEX_TOKENIZER=1" + AC_DEFINE(USE_REGEX_TOKENIZER, 1, [Define for compilation]) AX_PATH_LIB_PCRE fi # ==================================================================== +# Find a regex library that supports regexec(). +# Check in this order: libtre, libc, libregex. + +AC_CHECK_HEADER([tre/regex.h], [AC_CHECK_LIB(tre, regexec, [REGEX_LIBS=-ltre])]) -AC_CHECK_HEADER([regex.h],,[AC_MSG_ERROR([No regex.h header found])]) +if test -z "$REGEX_LIBS"; then + AC_CHECK_HEADER([regex.h], ,[AC_MSG_ERROR([No regex.h header found])]) + AC_CHECK_FUNCS(regexec, , + [AC_CHECK_LIB(regex, regexec, REGEX_LIBS=-lregex, + [AC_MSG_ERROR([No regex library found])])]) +fi -AC_CHECK_FUNCS(regexec,,[AC_CHECK_LIB(regex,regexec, - [REGEX_LIBS=-lregex], - [AC_MSG_ERROR([No regex library found])])]) +if test "x$REGEX_LIBS" = "x-ltre"; then + AC_DEFINE(HAVE_TRE_TRE_H, 1) +fi AC_SUBST(REGEX_LIBS) # ===================================================================== @@ -638,94 +609,37 @@ CFLAGS="${CFLAGS} -arch x86_64" fi - AC_MSG_CHECKING(for jni.h) - - # Try looking for jni in the standard locations first. - AC_TRY_CPP( - [#include ], - [AC_MSG_RESULT([yes, will build java libs]) - JNIfound=yes], - [AC_MSG_WARN([cannot find jni.h in standard location.]) - JNIfound=no]) - - # Didn't find it yet. Try looking in user-specified locations. 
- if test $JNIfound = "no"; then - JNI_GUESS=" \ - -I $JAVA_HOME/include \ - -I $JAVA_HOME/include/freebsd \ - -I $JAVA_HOME/include/linux \ - -I $JAVA_HOME/include/win32 \ - -I $JAVA_HOME/Headers \ - -I $JDK_HOME/include \ - -I $JDK_HOME/include/freebsd \ - -I $JDK_HOME/include/linux \ - -I $JDK_HOME/include/win32 \ - -I $JDK_HOME/Headers" - - AC_MSG_CHECKING(for jni.h in user-specified location) - SAVE_CPPFLAGS=${CPPFLAGS} - CPPFLAGS="${CPPFLAGS} ${JNI_GUESS}" - AC_TRY_CPP( - [#include ], - [AC_MSG_RESULT([yes, will build java libs]) - JNIfound=yes], - [AC_MSG_WARN([cannot find jni.h in user-specified location.]) - CPPFLAGS=${SAVE_CPPFLAGS} - JNIfound=no]) - fi - - # Still didn't find it. Try looking in some non-standard locations. - if test $JNIfound = "no"; then - JNI_GUESS=" \ - -I/usr/include/classpath/ \ - -I/usr/lib/jvm/default-java/include \ - -I/usr/lib/jvm/default-java/include/freebsd \ - -I/usr/lib/jvm/default-java/include/linux \ - -I/usr/lib/jvm/java-gcj/include \ - -I/usr/lib/jvm/java-gcj/include/freebsd \ - -I/usr/lib/jvm/java-gcj/include/linux \ - -I/usr/local/jdk1.6.0/include/\ - -I/usr/local/jdk1.6.0/include/freebsd \ - -I/usr/local/jdk1.6.0/include/linux \ - -I/usr/lib/jvm/java-6-sun/include/ \ - -I/usr/lib/jvm/java-6-sun/include/freebsd \ - -I/usr/lib/jvm/java-6-sun/include/linux \ - -I/usr/lib/j2sdk1.6-ibm/include \ - -I/usr/lib/j2sdk1.6-ibm/include/freebsd \ - -I/usr/lib/j2sdk1.6-ibm/include/linux \ - -I/usr/lib/jvm/java-7-openjdk-amd64/include \ - -I/usr/lib/jvm/java-7-openjdk-amd64/include/freebsd \ - -I/usr/lib/jvm/java-7-openjdk-amd64/include/linux \ - -I/usr/lib/jvm/java-8-openjdk-amd64/include \ - -I/usr/lib/jvm/java-8-openjdk-amd64/include/freebsd \ - -I/usr/lib/jvm/java-8-openjdk-amd64/include/linux \ - -I/usr/lib/jvm/java-9-openjdk-amd64/include \ - -I/usr/lib/jvm/java-9-openjdk-amd64/include/freebsd \ - -I/usr/lib/jvm/java-9-openjdk-amd64/include/linux \ - -I/c/java/jdk1.6.0/include/ \ - -I/c/java/jdk1.6.0/include/win32/ \ - 
-I/Developer/SDKs/MacOSX10.5.sdk/System/Library/Frameworks/JavaVM.framework/Headers/ \ - -I/Developer/SDKs/MacOSX10.4u.sdk/System/Library/Frameworks/JavaVM.framework/Headers/ \ - -I /System/Library/Frameworks/JavaVM.framework/Versions/Current/Headers" - - AC_MSG_CHECKING(for jni.h in non-standard location) - SAVE_CPPFLAGS=${CPPFLAGS} - CPPFLAGS="${CPPFLAGS} ${JNI_GUESS}" - AC_TRY_CPP( - [#include ], - [AC_MSG_RESULT([yes, will build java libs]) - JAVA_CPPFLAGS=${JNI_GUESS} - JNIfound=yes], - [AC_MSG_WARN([cannot find jni.h in anywhere.]) - JNIfound=no]) - CPPFLAGS=${SAVE_CPPFLAGS} - fi + JAVA_SRCDIR=../../bindings/java + if test "$native_win32" = yes; then + # javac needs a Windows path + JAVA_SRCDIR=`cygpath -w "$JAVA_SRCDIR"` + + # The standard location of Windows Java is under "\Program Files". + # However, automake is unable to handle spaces in a filename even + # when quoted (e.g. "-I '$JAVA_HOME/include'"). As a workaround, + # convert $JAVA_HOME to a short DOS path. + JAVA_HOME=`cygpath --dos --mixed "$JAVA_HOME"` + + # win32/jni_md.h:34:9: error: ‘__int64’ does not name a type + JAVA_CPPFLAGS=-D__int64=int64_t + fi + AC_SUBST(JAVA_SRCDIR) + + AC_MSG_CHECKING(for jni.h) + AX_JNI_INCLUDE_DIR + if test -n "$JNI_INCLUDE_DIRS"; then + JNIfound=yes + fi + + for JNI_INCLUDE_DIR in $JNI_INCLUDE_DIRS + do + JAVA_CPPFLAGS="$JAVA_CPPFLAGS -I$JNI_INCLUDE_DIR" + done + AC_SUBST(JAVA_CPPFLAGS) # This allows jni-client.c to be built AM_CONDITIONAL(HAVE_JAVA, test x${JNIfound} = xyes) - AC_SUBST(JAVA_CPPFLAGS) - # For now, we assume that if the ant tool is there, then the javac and # jar compiler/packager are installed as well. These are needed to build # the jar file and install it. @@ -738,20 +652,6 @@ fi # =================================================================== -# swig is needed for compiling the perl and python bindings ... -# ... well, actually, no, its not. 
'make dist' is currently set up to -# package all of the files generated by swig, so the user does not need -# to actually install it. However, swig is needed to create the package, -# and also needed to build from a github pull. Hmmm. What to do ... - -AX_PKG_SWIG(2.0.0, [SwigF=yes], [SwigF=no]) -AC_CHECK_PROG(SWIGfound, swig, yes, no) -if test "x$SwigFound" = "xyes"; then - AX_SWIG_ENABLE_CXX -fi -AM_CONDITIONAL(HAVE_SWIG, test x${SWIGfound} = xyes) - -# =================================================================== PerlFound=no if test "x$enable_perl_bindings" = "xyes"; then @@ -876,6 +776,29 @@ fi fi +# =================================================================== +# swig is needed for compiling the Perl and Python bindings ... +# ... well, actually, no, its not. 'make dist' is currently set up to +# package all of the files generated by swig, so the user does not need +# to actually install it. However, swig is needed to create the package, +# and also needed to build from a GitHub pull. + +if ! test -s "$srcdir/bindings/python/lg_python_wrap.cc"; then + AM_COND_IF(HAVE_PYTHON2, [swig_required=Python2]) +fi +if ! test -s "$srcdir/bindings/python3/lg_python_wrap.cc"; then + AM_COND_IF(HAVE_PYTHON3, [swig_required="$swig_required Python3"]) +fi +if ! 
test -s "$srcdir/bindings/perl/lg_perl_wrap.cc"; then + AM_COND_IF(HAVE_PERL, [swig_required="$swig_required Perl"]) +fi + +if test -n "$swig_required"; then + AX_PKG_SWIG(2.0.0,, [AC_MSG_ERROR(['swig' is required to create bindings for: $swig_required])]) + AX_SWIG_ENABLE_CXX +fi +AM_CONDITIONAL(HAVE_SWIG, test -n "$SWIG") + # ==================================================================== # check compiler flags @@ -908,7 +831,7 @@ -Wsign-compare -Werror-implicit-function-declaration \ -Wpointer-arith -Wwrite-strings -Wmissing-declarations \ -Wpacked -Wswitch-enum -Wmissing-format-attribute \ --Wstrict-aliasing -Winit-self \ +-Wstrict-aliasing -Winit-self -Wshadow \ -Wno-missing-field-initializers -Wno-unused-parameter \ -Wno-attributes -Wno-long-long -Winline" @@ -969,6 +892,11 @@ AC_SUBST(CPPFLAGS) AC_SUBST(CXXFLAGS) +dnl Save the compilation definitions for an extended version printout +AC_OUTPUT_MAKE_DEFS() +LG_DEFS=`$ECHO "$DEFS" | $SED 's/\\\\//g'` +AC_SUBST(LG_DEFS) + AC_CONFIG_FILES([ Makefile link-grammar.pc @@ -1013,7 +941,6 @@ data/vn/Makefile data/demo-sql/Makefile man/Makefile -viterbi/Makefile ]) AC_OUTPUT @@ -1024,6 +951,7 @@ prefix: ${prefix} C compiler: ${CC} ${CPPFLAGS} ${CFLAGS} C++ compiler: ${CXX} ${CPPFLAGS} ${CXXFLAGS} + Regex library: ${REGEX_LIBS} Error handler per-thread: ${error_handler_per_thread} Editline command-line history: ${edlin} UTF8 editline support: ${wedlin} diff -Nru link-grammar-5.3.16/data/ady/4.0.affix link-grammar-5.5.0/data/ady/4.0.affix --- link-grammar-5.3.16/data/ady/4.0.affix 2017-01-12 05:47:42.000000000 +0000 +++ link-grammar-5.5.0/data/ady/4.0.affix 2017-08-11 14:52:16.000000000 +0000 @@ -1,3 +1,4 @@ +#include /any/affix-punc % Mark the first morpheme by appending STEMSUBSCR. 
.=: STEMSUBSCR+; @@ -9,7 +10,7 @@ % (w: word, t:stem, s: suffix) % Split words into 1 or 2 parts (and no more than two parts) -% Accroding to the 4.0.dict, the first part will act as a stem, +% According to the 4.0.dict, the first part will act as a stem, % and the second part as a suffix carrying syntactic inflection. 2: REGPARTS+; @@ -29,14 +30,12 @@ % Multiple regexes per class are permitted. % % The word parts currently contain at least one character -% (null morphems are not supported). -% -% If needed, add the same regex in more than one definition. +% (null morphemes are not supported). -% Regex to match the prefix (including the whole word) -".*" : REGPRE+; +% Regex to match the prefix +"": REGPRE+; -% Regex to match the suffix (not including the whole word) -".*": REGSUF+; +% Regex to match the suffix +"": REGSUF+; % End of Anysplit parameters diff -Nru link-grammar-5.3.16/data/ady/4.0.constituent-knowledge link-grammar-5.5.0/data/ady/4.0.constituent-knowledge --- link-grammar-5.3.16/data/ady/4.0.constituent-knowledge 2017-01-12 05:47:42.000000000 +0000 +++ link-grammar-5.5.0/data/ady/4.0.constituent-knowledge 2017-06-17 02:03:04.000000000 +0000 @@ -1,5 +1,2 @@ STARTING_LINK_TYPE_TABLE: -ANY* v - - diff -Nru link-grammar-5.3.16/data/ady/4.0.dict link-grammar-5.5.0/data/ady/4.0.dict --- link-grammar-5.3.16/data/ady/4.0.dict 2017-01-27 22:04:07.000000000 +0000 +++ link-grammar-5.5.0/data/ady/4.0.dict 2018-04-27 00:47:47.000000000 +0000 @@ -12,7 +12,7 @@ LEFT-WALL: ANY+; ANY-WORD: {@ANY-} & {@ANY+}; -UNKNOWN-WORD: {@ANY-} & {@ANY+}; +: {@ANY-} & {@ANY+}; ANY-PUNCT: {@ANY-} & {@ANY+}; JUNK: {@ANY-} & {@ANY+}; diff -Nru link-grammar-5.3.16/data/ady/4.0.knowledge link-grammar-5.5.0/data/ady/4.0.knowledge --- link-grammar-5.3.16/data/ady/4.0.knowledge 2017-01-12 05:47:42.000000000 +0000 +++ link-grammar-5.5.0/data/ady/4.0.knowledge 2017-05-17 18:30:06.000000000 +0000 @@ -1,28 +1,5 @@ ; Post-processing knowledge file -; 
---------------------------------------------------------------------------- -; This file contains the knowledge related to post-processing, in the -; form of lists and rules. This file is read by post-process.c at run-time. -; Syntax of file: -; line starting with ";" is a comment -; commas are field delimiters -; any token beginning with the character @ is expanded to the set -; of symbols it defined. e.g. one could write -; FOO: blah1 blah2 blah3 -; thus defining a set FOO containing three strings. Then one could later write -; BAR: blah5 @FOO blah8 -; which defines a set BAR containing 5 strings. -; -; Capitalized tokens are *required*, though if you feel like providing an -; empty list afterwards, that's your right. -; ---------------------------------------------------------------------------- - -; ---------------------------------------------------------------------- -; ---------------------- LINK TYPE TABLE------------------------------- -; ---------------------------------------------------------------------- -; The following table associates a domain type with each possible -; starting link. It contains pairs: the first of each pair is a link -; type, and the second is the domain to which that link type belongs. - STARTING_LINK_TYPE_TABLE: - ANY e + +; This table is empty! diff -Nru link-grammar-5.3.16/data/ady/4.0.regex link-grammar-5.5.0/data/ady/4.0.regex --- link-grammar-5.3.16/data/ady/4.0.regex 2017-01-12 02:59:10.000000000 +0000 +++ link-grammar-5.5.0/data/ady/4.0.regex 2017-08-11 14:52:16.000000000 +0000 @@ -5,21 +5,22 @@ % % %***************************************************************************% -% Want to match apostrophes, for abreviations (I'm I've, etc.) since these -% cannot be auto-split with the current splitter. Also want to accept -% hyphenated words, and words with underbars in them. -ANY-WORD: /^[[:alnum:]_'-]+$/ +% Want to match apostrophes, for abbreviations (I'm I've, etc.) 
since +% these cannot be auto-split with the current splitter. +% Hyphenated words, and words with underbars in them, get split. +ANY-WORD: /^[[:alnum:]']+$/ ANY-PUNCT: /^[[:punct:]]+$/ % Simple two-part random morphology: match any string of one or more % letters as a stem, and the rest as a suffix. % We are currently using .= to denote the end of a stem. -SIMPLE-STEM: /^[[:alnum:]_'-]+.=/ -SIMPLE-SUFF: /[[:alnum:]_'-]+$/ +SIMPLE-STEM: /^[[:alnum:]']+.=/ +SIMPLE-SUFF: /=[[:alnum:]']+$/ % Match anything that doesn't match the above. % Match anything that isn't white-space. -% Well ... actually, reject anything that begins or ends with -% punctuation. We do this, so that tokenize can split off the +% Well ... actually, reject anything that contains +% punctuation. We do this, so that the tokenizer can split off the % the affixes (trailing commas, etc) correctly. -JUNK: /^[^[:punct:]][^[:space:]]+[^[:punct:]]$/ +JUNK: !/[[:punct:]]/ +JUNK: /^/ diff -Nru link-grammar-5.3.16/data/ady/Makefile.in link-grammar-5.5.0/data/ady/Makefile.in --- link-grammar-5.3.16/data/ady/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/data/ady/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = 
@LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/amy/4.0.affix link-grammar-5.5.0/data/amy/4.0.affix --- link-grammar-5.3.16/data/amy/4.0.affix 2017-01-27 22:11:49.000000000 +0000 +++ link-grammar-5.5.0/data/amy/4.0.affix 2017-08-11 14:52:16.000000000 +0000 @@ -1,3 +1,4 @@ +#include /any/affix-punc % Mark the first morpheme by appending STEMSUBSCR. .=: STEMSUBSCR+; @@ -8,7 +9,7 @@ % Program and dictionary sanity check: Allow only these combinations. % (w: word, p:prefix, m:middle, t:stem, s: suffix) % -% See the file api.c circa line 1100 for details +% See the file sane.c circa line 120 for details % (or grep for AFFIXTYPE in the code). "w|ts|pts|pms|pmss|ptss|ps|ts": SANEMORPHISM+; @@ -32,19 +33,16 @@ % Multiple regexes per class are permitted. % % The word parts currently contain at least one character -% (null morphems are not supported). -% -% If needed, add the same regex in more than one definition. +% (null morphemes are not supported). 
-% Regex to match the prefix (including the whole word) -".*" : REGPRE+; +% Regex to match the prefix +"" : REGPRE+; % Regex to match the middle parts (not including the prefix or the suffix) -".*": REGMID+; +"": REGMID+; %".{2,}": REGMID+; -% Regex to match the suffix (not including the whole word) -".*": REGSUF+; -%"[aeiouy]$" "[rst][^=]$": REGSUF+; +% Regex to match the suffix +"": REGSUF+; % End of Anysplit parameters diff -Nru link-grammar-5.3.16/data/amy/4.0.constituent-knowledge link-grammar-5.5.0/data/amy/4.0.constituent-knowledge --- link-grammar-5.3.16/data/amy/4.0.constituent-knowledge 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/amy/4.0.constituent-knowledge 2017-06-17 02:03:31.000000000 +0000 @@ -1,5 +1,2 @@ STARTING_LINK_TYPE_TABLE: -ANY* v - - diff -Nru link-grammar-5.3.16/data/amy/4.0.dict link-grammar-5.5.0/data/amy/4.0.dict --- link-grammar-5.3.16/data/amy/4.0.dict 2017-01-27 22:07:11.000000000 +0000 +++ link-grammar-5.5.0/data/amy/4.0.dict 2018-04-27 00:48:01.000000000 +0000 @@ -12,7 +12,7 @@ LEFT-WALL: ANY+; ANY-WORD: {@ANY-} & {@ANY+}; -UNKNOWN-WORD: {@ANY-} & {@ANY+}; +: {@ANY-} & {@ANY+}; ANY-PUNCT: {@ANY-} & {@ANY+}; JUNK: {@ANY-} & {@ANY+}; diff -Nru link-grammar-5.3.16/data/amy/4.0.knowledge link-grammar-5.5.0/data/amy/4.0.knowledge --- link-grammar-5.3.16/data/amy/4.0.knowledge 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/amy/4.0.knowledge 2017-05-17 18:29:49.000000000 +0000 @@ -1,28 +1,5 @@ ; Post-processing knowledge file -; ---------------------------------------------------------------------------- -; This file contains the knowledge related to post-processing, in the -; form of lists and rules. This file is read by post-process.c at run-time. -; Syntax of file: -; line starting with ";" is a comment -; commas are field delimiters -; any token beginning with the character @ is expanded to the set -; of symbols it defined. e.g. 
one could write -; FOO: blah1 blah2 blah3 -; thus defining a set FOO containing three strings. Then one could later write -; BAR: blah5 @FOO blah8 -; which defines a set BAR containing 5 strings. -; -; Capitalized tokens are *required*, though if you feel like providing an -; empty list afterwards, that's your right. -; ---------------------------------------------------------------------------- - -; ---------------------------------------------------------------------- -; ---------------------- LINK TYPE TABLE------------------------------- -; ---------------------------------------------------------------------- -; The following table associates a domain type with each possible -; starting link. It contains pairs: the first of each pair is a link -; type, and the second is the domain to which that link type belongs. - STARTING_LINK_TYPE_TABLE: - ANY e + +; This table is empty! diff -Nru link-grammar-5.3.16/data/amy/4.0.regex link-grammar-5.5.0/data/amy/4.0.regex --- link-grammar-5.3.16/data/amy/4.0.regex 2017-01-27 21:57:17.000000000 +0000 +++ link-grammar-5.5.0/data/amy/4.0.regex 2017-08-11 14:52:16.000000000 +0000 @@ -5,21 +5,22 @@ % % %***************************************************************************% -% Want to match apostrophes, for abreviations (I'm I've, etc.) since these -% cannot be auto-split with the current splitter. Also want to accept -% hyphenated words, and words with underbars in them. -ANY-WORD: /^[[:alnum:]_'-]+$/ +% Want to match apostrophes, for abbreviations (I'm I've, etc.) since +% these cannot be auto-split with the current splitter. +% Hyphenated words, and words with underbars in them, get split. +ANY-WORD: /^[[:alnum:]']+$/ ANY-PUNCT: /^[[:punct:]]+$/ % Multi-part random morphology: match any string as prefix, stem, or % suffix. 
-MOR-PREF: /^[[:alnum:]_'-]+=$/ -MOR-STEM: /^[[:alnum:]_'-]+.=$/ -MOR-SUFF: /^=[[:alnum:]_'-]+$/ +MOR-PREF: /^[[:alnum:]']+=$/ +MOR-STEM: /^[[:alnum:]']+.=$/ +MOR-SUFF: /^=[[:alnum:]']+$/ % Match anything that doesn't match the above. % Match anything that isn't white-space. -% Well ... actually, reject anything that begins or ends with -% punctuation. We do this, so that tokenize can split off the +% Well ... actually, reject anything that contains +% punctuation. We do this, so that the tokenizer can split off the % the affixes (trailing commas, etc) correctly. -JUNK: /^[^[:punct:]][^[:space:]]+[^[:punct:]]$/ +JUNK: !/[[:punct:]]/ +JUNK: /^/ diff -Nru link-grammar-5.3.16/data/amy/Makefile.in link-grammar-5.5.0/data/amy/Makefile.in --- link-grammar-5.3.16/data/amy/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/data/amy/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ 
+_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/any/4.0.affix link-grammar-5.5.0/data/any/4.0.affix --- link-grammar-5.3.16/data/any/4.0.affix 2017-04-15 21:43:39.000000000 +0000 +++ link-grammar-5.5.0/data/any/4.0.affix 2017-08-11 14:52:16.000000000 +0000 @@ -1,11 +1 @@ - -")" "}" "]" ">" » 〉 ) 〕 》 】 ] 』」 "’’" "’" ''.y '.y -"%" "," "." 。 ":" ";" "?" "!" ‽ ؟ ?! ….y ....y "”" "–" "‐" 、 ~ -¢ ₵ ™ ℠ : RPUNC+; - -"(" "{" "[" "<" « 〈 ( 〔 《 【 [ 『 「 `` „ “ ‘ ''.x '.x ….x ....x -¿ ¡ "$" -£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼ ₸ ₮ ₩ ¥ ៛ 호점 -† †† ‡ § ¶ © ® ℗ № "#": LPUNC+; - -""«»《》【】『』`„“": QUOTES+; +#include /any/affix-punc diff -Nru link-grammar-5.3.16/data/any/4.0.constituent-knowledge link-grammar-5.5.0/data/any/4.0.constituent-knowledge --- link-grammar-5.3.16/data/any/4.0.constituent-knowledge 2017-01-09 04:21:35.000000000 +0000 +++ link-grammar-5.5.0/data/any/4.0.constituent-knowledge 2017-06-17 02:03:13.000000000 +0000 @@ -1,3 +1,2 @@ STARTING_LINK_TYPE_TABLE: -ANY* v diff -Nru link-grammar-5.3.16/data/any/4.0.dict link-grammar-5.5.0/data/any/4.0.dict --- link-grammar-5.3.16/data/any/4.0.dict 2017-04-15 21:43:39.000000000 +0000 +++ link-grammar-5.5.0/data/any/4.0.dict 2018-04-27 00:48:06.000000000 +0000 @@ -7,14 +7,14 @@ % % %***************************************************************************% -% Dictionary version number is 5.3.16 (formatted as V5v3v16+) -: V5v3v16+; +% Dictionary version number is 5.4.3 (formatted as V5v4v3+) +: V5v4v3+; % Left-wall @ANY allows multiple head-words, e.g. head-nouns and also % head-verbs. See the README.md file for more. 
LEFT-WALL: @ANY+; ANY-WORD: {@ANY-} & {@ANY+}; ANY-PUNCT: {@ANY-} & {@ANY+}; -UNKNOWN-WORD: {@ANY-} & {@ANY+}; +: {@ANY-} & {@ANY+}; JUNK: {@ANY-} & {@ANY+}; diff -Nru link-grammar-5.3.16/data/any/4.0.knowledge link-grammar-5.5.0/data/any/4.0.knowledge --- link-grammar-5.3.16/data/any/4.0.knowledge 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/any/4.0.knowledge 2017-05-17 18:26:42.000000000 +0000 @@ -1,28 +1,5 @@ ; Post-processing knowledge file -; ---------------------------------------------------------------------------- -; This file contains the knowledge related to post-processing, in the -; form of lists and rules. This file is read by post-process.c at run-time. -; Syntax of file: -; line starting with ";" is a comment -; commas are field delimiters -; any token beginning with the character @ is expanded to the set -; of symbols it defined. e.g. one could write -; FOO: blah1 blah2 blah3 -; thus defining a set FOO containing three strings. Then one could later write -; BAR: blah5 @FOO blah8 -; which defines a set BAR containing 5 strings. -; -; Capitalized tokens are *required*, though if you feel like providing an -; empty list afterwards, that's your right. -; ---------------------------------------------------------------------------- - -; ---------------------------------------------------------------------- -; ---------------------- LINK TYPE TABLE------------------------------- -; ---------------------------------------------------------------------- -; The following table associates a domain type with each possible -; starting link. It contains pairs: the first of each pair is a link -; type, and the second is the domain to which that link type belongs. - STARTING_LINK_TYPE_TABLE: - ANY e + +; This table is empty! 
diff -Nru link-grammar-5.3.16/data/any/4.0.regex link-grammar-5.5.0/data/any/4.0.regex --- link-grammar-5.3.16/data/any/4.0.regex 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/any/4.0.regex 2017-08-11 14:52:16.000000000 +0000 @@ -5,15 +5,16 @@ % % %***************************************************************************% -% Want to match apostrophes, for abreviations (I'm I've, etc.) since these -% cannot be auto-split with the current splitter. Also want to accept -% hyphenated words, and word with underbars in them. -ANY-WORD: /^[[:alnum:]_'-]+$/ +% Want to match apostrophes, for abbreviations (I'm I've, etc.) since +% these cannot be auto-split with the current splitter. +% Hyphenated words, and words with underbars in them, get split. +ANY-WORD: /^[[:alnum:]']+$/ ANY-PUNCT: /^[[:punct:]]+$/ % Match anything that doesn't match the above. % Match anything that isn't white-space. -% Well ... actually, reject anything that begins or ends with -% punctuation. We do this, so that tokenize can split off the +% Well ... actually, reject anything that contains +% punctuation. We do this, so that the tokenizer can split off the % the affixes (trailing commas, etc) correctly. -JUNK: /^[^[:punct:]][^[:space:]]+[^[:punct:]]$/ +JUNK: !/[[:punct:]]/ +JUNK: /^/ diff -Nru link-grammar-5.3.16/data/any/affix-punc link-grammar-5.5.0/data/any/affix-punc --- link-grammar-5.3.16/data/any/affix-punc 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/data/any/affix-punc 2018-04-25 17:07:19.000000000 +0000 @@ -0,0 +1,12 @@ +")" "}" "]" ">" » 〉 ) 〕 》 】 ] 』」 """ "’’" "’" ''.y '.y +"%" "," "." 。 ":" ";" "?" "!" ‽ ؟ ?! ….y ....y "”" +_ - ‐ ‑ ‒ – — ― ~ ━ ー 、 +¢ ₵ ™ ℠ : RPUNC+; + +"(" "{" "[" "<" « 〈 ( 〔 《 【 [ 『 「 """ `` „ “ ‘ ''.x '.x ….x ....x +¿ ¡ "$" +_ - ‐ ‑ ‒ – — ― ━ ー ~ +£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼ ₸ ₮ ₩ ¥ ៛ 호점 +† †† ‡ § ¶ © ® ℗ № "#": LPUNC+; + +-- ‒ – — ― - _ "(" ")" "[" "]" ... 
… "," ";" ":": MPUNC+; diff -Nru link-grammar-5.3.16/data/any/Makefile.am link-grammar-5.5.0/data/any/Makefile.am --- link-grammar-5.3.16/data/any/Makefile.am 2017-04-15 21:43:39.000000000 +0000 +++ link-grammar-5.5.0/data/any/Makefile.am 2018-01-04 06:38:34.000000000 +0000 @@ -1,5 +1,6 @@ DICTS= \ + affix-punc \ 4.0.affix \ 4.0.constituent-knowledge \ 4.0.dict \ diff -Nru link-grammar-5.3.16/data/any/Makefile.in link-grammar-5.5.0/data/any/Makefile.in --- link-grammar-5.3.16/data/any/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/data/any/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -333,6 +338,7 @@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ DICTS = \ + affix-punc \ 4.0.affix \ 4.0.constituent-knowledge \ 4.0.dict \ diff -Nru 
link-grammar-5.3.16/data/any/README.md link-grammar-5.5.0/data/any/README.md --- link-grammar-5.3.16/data/any/README.md 2017-04-15 21:43:39.000000000 +0000 +++ link-grammar-5.5.0/data/any/README.md 2017-08-13 22:11:23.000000000 +0000 @@ -1,5 +1,5 @@ -Defintion of the "ANY" language. +Definition of the "ANY" language. -------------------------------- The dictionary define here will parse "any" language, exploring all combinatoric possibilities. It is used for certain machine-learning @@ -9,7 +9,7 @@ Some important design notes to be kept in mind. * The code implicitly assumes that white-space is a word-separator. - This works great for just about all modern langauges, but can + This works great for just about all modern languages, but can present difficulties for ancient and academic texts. In particular, Chinese texts are NOT written with white-space @@ -18,9 +18,9 @@ * Punctuation: the `4.0.affix` file defines a set of leading and trailing punctuation that is automatically stripped from the - beginings and endings of words. The list of punctuation in manually + beginnings and endings of words. The list of punctuation in manually assembled, and is more-or-less complete and appropriate for most - modern lanuages. Again, this assumption can be problematic. + modern languages. Again, this assumption can be problematic. 
In a truly purist approach to language-learning, the learning process should be able to discern punctuation on it's own, without needing diff -Nru link-grammar-5.3.16/data/ar/4.0.dict link-grammar-5.5.0/data/ar/4.0.dict --- link-grammar-5.3.16/data/ar/4.0.dict 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/ar/4.0.dict 2018-04-27 00:48:40.000000000 +0000 @@ -71,7 +71,7 @@ ); -/ar/words/words.n /ar/words/words.noun /ar/words/words.Ndip /ar/words/words.N0 /ar/words/words.years UNKNOWN-WORD.n : ( or ) & & ; +/ar/words/words.n /ar/words/words.noun /ar/words/words.Ndip /ar/words/words.N0 /ar/words/words.years .n : ( or ) & & ; /ar/words/words.Ndu : {NSUFFPLd+} & ( or ) & & ; /ar/words/words.Nap : NSUFFFEMsg+ & ( or ) & & ; /ar/words/words.NAt : NSUFFFEMpl+ & ( or ) & & ; @@ -81,7 +81,7 @@ /ar/words/words.Nall : (NSUFFFEM+ or NSUFFPL+) & ( or ) & & ; /ar/words/words.N_slash_ap : {NSUFFPLd+ or NSUFFFEMsg+ or NSUFFFEMpl+} & ( or ) & & ; -/ar/words/words.Nprop /ar/words/words.noun_prop UNKNOWN-WORD.Nprop: +/ar/words/words.Nprop /ar/words/words.noun_prop .Nprop: (({G+} & & & ) or (G- & {G+})); % Proper nouns % Quantifiers @@ -163,7 +163,7 @@ ; %% Perfect Verbs -/ar/words/words.PV /ar/words/words.PV_V /ar/words/words.PV_C /ar/words/words.PV_Atn /ar/words/words.PV_0 UNKNOWN-WORD.PV : +/ar/words/words.PV /ar/words/words.PV_V /ar/words/words.PV_C /ar/words/words.PV_Atn /ar/words/words.PV_0 .PV : {PVSUFFSUBJ+} & {NEGp-} & {C- or CCVr-} & ((VSUFFDO+ & {S+ or CCSUBJ+}) or ({S+ or CCSUBJ+} & [] & {[O+ or CCOBJ+]})) & & {SUB+ or CCVl+}; %% Intransitive Perfect Verbs @@ -185,7 +185,7 @@ PVSUFFSUBJ-; %% Imperfect Verbs -/ar/words/words.IV /ar/words/words.IV_yu /ar/words/words.IV_0 UNKNOWN-WORD.IV : +/ar/words/words.IV /ar/words/words.IV_yu /ar/words/words.IV_0 .IV : IVPRE- & {NEGi- or AUX-} & {C- or CCVr-} & {IVSUFFSUBJ+} & (({VSUFFDO+} & {S+ or CCSUBJ+}) or ({S+ or CCSUBJ+} & [] )) & {[O+ or CCOBJ+]} & & {SUB+ or CCVl+} ; %% Intransitive Imperfect Verbs diff -Nru 
link-grammar-5.3.16/data/ar/Makefile.in link-grammar-5.5.0/data/ar/Makefile.in --- link-grammar-5.3.16/data/ar/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/data/ar/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -217,6 +217,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -245,6 +246,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -253,13 +255,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -326,12 +331,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/ar/words/Makefile.in link-grammar-5.5.0/data/ar/words/Makefile.in --- link-grammar-5.3.16/data/ar/words/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/data/ar/words/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = 
@HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/command-help-en.txt link-grammar-5.5.0/data/command-help-en.txt --- link-grammar-5.3.16/data/command-help-en.txt 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/data/command-help-en.txt 2018-04-26 02:38:14.000000000 +0000 @@ -0,0 +1,261 @@ +% link-parser internal command documentation. +% +% The internal help system first displays the hard-coded one-line description +% of each variable (or command) and its current and default values, and then +% the matching text entry from this file. + +[graphics] +The displayed words are marked as follows: + [word] Null-linked word + word[!] 
word classified by a regex + word[!REGEX_NAME] word classified by REGEX_NAME (turn on by !morphology=1) + word[~] word generated by a spell guess (unknown original word) + word[&] word run-on separated by a spell guess + word[?].POS word is unknown (POS is found by the parser) + word.POS word found in the dictionary as word.POS + word.#CORRECTION word is probably a typo - got linked as CORRECTION + +For dictionaries that support morphology (turn on by !morphology=1): + word= A prefix morpheme + =word A suffix morpheme + word.= A stem + +For more details see: + https://www.abisource.com/projects/link-grammar/dict/ + +[constituents] +Its value may be: + 0 Disabled + 1 Treebank-style constituent tree + 2 Flat, bracketed tree [A like [B this B] A] + 3 Flat, treebank-style tree (A like (B this)) + +[spell] +If zero, no spell and run-on corrections of unknown words are performed. +Else, use up to this many spell-guesses per unknown word. In that case, +the number of run-on corrections (word split) of unknown words is not +limited. + +[width] +When writing to a terminal, this value is set from its width. + +[verbosity] +This is the level of description printed about the internal program working. +Values 1-4 are appropriate for use by the program user. +Higher values intended for LG dictionary writers and library developers. + +Some useful values: + 0 No prompt, minimal library messages + 1 Normal verbosity + 2 Show times of the parsing steps + 3 Display data file search and locale setup + 5-9 Tokenizer and parser debugging + 10-19 Dictionary debugging + 101 Print all the dictionary connectors, along with their length limit + +[morphology] +When False, the Graphics display (see "!help graphics") morphemes +one token. When True, the morphemes are shown as separate tokens. +Also, when a word matches a RegEx, the matching dictionary entry +is shown. + +[limit] +The maximum number of linkages that are considered in post-processing. 
If +more than this number of linkages found by the parser, then a random +sample of linkages is chosen. + +[cost-max] +Determines the maximum disjunct cost used during parsing, where the cost +of a disjunct is equal to the maximum cost of all of its connectors. + +[bad] +When True, display also linkages that fail post-processing, along with the +post-processing error that was found. + +This mode is useful for people editing the dictionary or post-processing. +The invalid linkages will be outputted after the valid ones. + +The parser will only output the linkages it finds at whatever stage it had +gotten to when it found a valid linkage. For example, if it had gotten to +null-link stage 2 before finding its first valid linkage, it will also output +invalid linkages found at null-link stage 2. There is no way of seeing invalid +linkages found at earlier stages. + +[short] +Determines how long the links are allowed to be. The intended use of this +is to speed up parsing by not considering very long links for most +connectors, since they are rarely used in a correct parse. An entry for +UNLIMITED-CONNECTORS in the dictionary will specify which connectors +(according to the connector matching rules) are exempt from this length +limit. + +[timeout] +Determines the approximate maximum time (in seconds) that parsing is +allowed to take. After this time has expired, the parsing process is +artificially forced to complete quickly by pretending that no further +solutions can be constructed. The actual parsing time might be slightly +longer. + +For the SAT parser (see "!help use-sat") this option has no effect (yet). + +[memory] +The value of this variable has no effect for now. + +[null] +When False, only linkages with no null links are considered. +When True, if the parser tries to find linkages with the minimal +possible number of null links. + +[panic] +Use "panic-mode" if a parse cannot be found within the time set +by !timeout. 
In that mode, various parse options are adjusted so +a less accurate parse can be found quickly. + +[use-sat] +Note: This parser cannot find yet linkages with null-links. +Also, the !timeout setup doesn't affect it. +It is supposed to be much faster than the standard parser on very long +sentences (that have a full parse - without null links). + +[walls] +The setup of this variable affects the display of the Graphics-mode +(see "!help graphics"). +When True, the RIGHT-WALL and LEFT_WALL are always displayed. +When False, they are not displayed if their links are not considered +"interesting" (by a hard-coded criterion in the LG library). + +[islands-ok] +This option determines whether or not "islands" of links are allowed. +For example, the following linkage has an island: + +linkparser> this sentence is false this sentence is true +No complete linkages found. +Found 16 linkages (8 had no P.P. violations) at null count 1 + Linkage 1, cost vector = (UNUSED=0 DIS= 0.00 LEN=11) + + +----------->WV---------->+ + +------->Wd-------+ | + | +--Dsu*c-+--Ss*s-+-Paf-+ +--Dsu*c-+--Ss*s-+--Pa-+ + | | | | | | | | | +LEFT-WALL this.d sentence.n is.v false.a this.d sentence.n is.v true.a + +[postscript] +For actual use, this postscript output needs a header that is printed when +!ps-header=True is set. +Note: The postscript output currently malfunctions for sentences longer +than a page width. + +[ps-header] +When True, print also an appropriate postscript header when +!postscript=True is set. + +%[cluster] +%This modes is not active for now (the source code may need fixes). + +%[senses] +%This modes is not active for now (the source code may need fixes). + +%[cost-model] +%The only allowed value is 1 for now (the source code may need fixes). + +[links] +When True, display complete link data, including post-processing domains. +This mode is set to True when the standard input is not a terminal. 
+ +[disjuncts] +When True, display the disjuncts that used for each word, including their +cost. + +[batch] +When True, the program process sentences in batch-mode, in which null-links +are not allowed (as if !null=False got issued). + +It is normally used with: +link-parser [dictionary name] [arguments] < input-file + +The user indicates in the input file which sentences should be rejected, and +which should be accepted; the parser then notifies the user when its judgments +differ. To say that the parser "rejects" a sentence means that it does not find +any linkages without null-links. + +To prepare a file for use in batch-mode, put "!batch" at the top of the file +in order to toggle it to True. +Then, for each sentence that should be rejected, precede the sentence with "*" +in the file. If it should be accepted, precede it with nothing. After +processing a batch file, the parser will then print the number of errors in the +file: i.e., the number of sentences on which its judgments differ with the +judgments indicated by the symbols. + +Our GitHub repository contains several large batch-files; For English, the +three most important ones are "corpus-basic.batch", "corpus-fixes.batch" and +"corpus-fix-long.batch". +See: https://github.com/opencog/link-grammar/tree/master/data/en + +For more details see BATCH-MODE in: +https://www.abisource.com/projects/link-grammar/dict/introduction.html + +[echo] +This mode is set to True when the standard input is not a terminal. + +[rand] +Random numbers are used for sampling linkages if needed +(see "!help limit"). + +[debug] +This variable is for LG library development. +Its purpose is to limit debug output, which may have a big volume +otherwise, to certain functions or files. For example: + +$ link-parser -verbosity=6 -debug=flatten_wordgraph,print.c + +will only show messages from the `flatten_wordgraph()` function or the +print.c file. + +For more details see debug/README.md in the LG library source code +directory. 
+ +[test] +This variable is used to enable features that are for debug or have yet +no other variable to control them. +For example, this setting will show all the linkages without a need to +press RETURN after each one: + +!test=auto-next-linkages + +For more details, see debug/README.md and link-grammar/README.md +in our GitHub repository https://github.com/opencog/link-grammar . + +[file] +This command gets as an argument a file name to run in batch-mode. +See "!help batch". + +[variables] +Variables can be set as follows: + + ! Toggle the specified Boolean variable. + != Assign that value to that variable. + +[wordgraph] +This variable controls displaying the word-graph of the sentence. +The word-graph is a representation of the relations between the sentence +tokens, as set by the library tokenizer before the parsing step. + +Its value may be: + 0 Disabled + 1 Default display + 2 Display parent tokens as subgraphs + 3 Use esoteric display flags as set by !test=wg:FLAGS + +% FLAGS documentation: +% These flags are defined in wordgraph.h. +% Below, unsplit-word means a token before getting split. +% (*) marks the flags that are enabled for the default display. +% +% c Compact display +% d (*) Display debug labels +% h Display hex node numbers (for "dot" commands debug) +% l (*) Add a legend +% p Display back-pointing word-graph links +% s Display unsplit-words as subgraphs +% u (*) Display unsplit-word links +% x Display using X11 even on Windows (if supported) diff -Nru link-grammar-5.3.16/data/de/4.0.dict link-grammar-5.5.0/data/de/4.0.dict --- link-grammar-5.3.16/data/de/4.0.dict 2017-01-27 22:01:05.000000000 +0000 +++ link-grammar-5.5.0/data/de/4.0.dict 2018-04-27 00:48:47.000000000 +0000 @@ -521,5 +521,5 @@ % With the following line in the dictionary, the parser will simply % skip over (null-link) unknown words. If you remove it, the parser % will output an error for any unknown words. 
-UNKNOWN-WORD: NO+; +: NO+; diff -Nru link-grammar-5.3.16/data/de/4.0.regex link-grammar-5.5.0/data/de/4.0.regex --- link-grammar-5.3.16/data/de/4.0.regex 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/de/4.0.regex 2018-04-27 00:50:43.000000000 +0000 @@ -120,4 +120,4 @@ % tokenizer explicitly mangles based on these punctution marks. % % Look for at least four in a row. -UNKNOWN-WORD: /^[.,-]{4}[.,-]*$/ +: /^[.,-]{4}[.,-]*$/ diff -Nru link-grammar-5.3.16/data/de/Makefile.in link-grammar-5.5.0/data/de/Makefile.in --- link-grammar-5.3.16/data/de/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/data/de/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/demo-sql/4.0.constituent-knowledge 
link-grammar-5.5.0/data/demo-sql/4.0.constituent-knowledge --- link-grammar-5.3.16/data/demo-sql/4.0.constituent-knowledge 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/demo-sql/4.0.constituent-knowledge 2017-05-30 04:56:05.000000000 +0000 @@ -1,5 +1 @@ - -STARTING_LINK_TYPE_TABLE: -ANY* v - - +; This file intentionally empty. diff -Nru link-grammar-5.3.16/data/demo-sql/4.0.knowledge link-grammar-5.5.0/data/demo-sql/4.0.knowledge --- link-grammar-5.3.16/data/demo-sql/4.0.knowledge 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/demo-sql/4.0.knowledge 2017-05-30 04:56:05.000000000 +0000 @@ -1,28 +1,2 @@ ; Post-processing knowledge file - -; ---------------------------------------------------------------------------- -; This file contains the knowledge related to post-processing, in the -; form of lists and rules. This file is read by post-process.c at run-time. -; Syntax of file: -; line starting with ";" is a comment -; commas are field delimiters -; any token beginning with the character @ is expanded to the set -; of symbols it defined. e.g. one could write -; FOO: blah1 blah2 blah3 -; thus defining a set FOO containing three strings. Then one could later write -; BAR: blah5 @FOO blah8 -; which defines a set BAR containing 5 strings. -; -; Capitalized tokens are *required*, though if you feel like providing an -; empty list afterwards, that's your right. -; ---------------------------------------------------------------------------- - -; ---------------------------------------------------------------------- -; ---------------------- LINK TYPE TABLE------------------------------- -; ---------------------------------------------------------------------- -; The following table associates a domain type with each possible -; starting link. It contains pairs: the first of each pair is a link -; type, and the second is the domain to which that link type belongs. - -STARTING_LINK_TYPE_TABLE: - ANY e +; This file intentionally empty. 
diff -Nru link-grammar-5.3.16/data/demo-sql/Makefile.in link-grammar-5.5.0/data/demo-sql/Makefile.in --- link-grammar-5.3.16/data/demo-sql/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/data/demo-sql/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -127,6 +127,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -155,6 +156,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -163,13 +165,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -236,12 +241,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/en/4.0.affix link-grammar-5.5.0/data/en/4.0.affix --- link-grammar-5.3.16/data/en/4.0.affix 2017-01-05 17:04:05.000000000 +0000 +++ link-grammar-5.5.0/data/en/4.0.affix 2018-04-25 17:04:45.000000000 +0000 @@ -3,18 +3,12 @@ % i.e. spaces are inserted between the affix and the word itself. % % Some of the funky UTF-8 parenthesis are used in Asian texts. 
-% In order to allow single straight quote ' and double straight quote '' -% to be stripped off from both the left and the right, they are -% distinguished by the suffix .x and .y (as as Mr.x Mrs.x or Jr.y Sr.y) -% % 。is an end-of-sentence marker used in Japanese texts. % Punctuation appearing on the right-side of words. -% Note: the ellipsis ....y must appear *before* the dot ".", else the -% splitting won't work right. -")" "}" "]" ">" "".y" » 〉 ) 〕 》 】 ] 』 」 "’’" "’" ” ''.y '.y `.y -"%" "," ....y "." 。   .y ‧ ":" ";" "?" "!" ‽ ؟ ? ! -….y ━.y –.y ー.y ‐.y 、.y +")" "}" "]" ">" """ » 〉 ) 〕 》 】 ] 』 」 "’’" "’" ” '' ' ` +"%" "," ... "." 。 ‧ ":" ";" "?" "!" ‽ ؟ ? ! +_ ‐ ‑ ‒ – — ― … ━ – ー ‐ 、 ~ ¢ ₵ ™ ℠ : RPUNC+; @@ -24,26 +18,35 @@ % Paragraph marks % Assorted bullets and dingbats % Dashes of various sorts -% "real" English prefixes: Y' w/ -% Y'gotta Y'gonna -% coffe w/milk -"(" "{" "[" "<" "".x" « 〈 ( 〔 《 【 [ -『 「 、.x `.x `` „ ‘ “ ''.x '.x ….x ....x +"(" "{" "[" "<" """ « 〈 ( 〔 《 【 [ +『 「 、 ` `` „ ‘ “ '' ' … ... ¿ ¡ "$" US$ USD C$ £ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼ ₸ ₮ ₩ ¥ ៛ 호점 † †† ‡ § ¶ © ® ℗ № "#" -* • ⁂ ❧ ☞ ◊ ※ ○ 。.x ゜ ✿ ☆ * ◕ ● ∇ □ ◇ @ ◎ -–.x ━.x ー.x -- - ‧.x -y' w/ +* • ⁂ ❧ ☞ ◊ ※ ○ 。 ゜ ✿ ☆ * ◕ ● ∇ □ ◇ @ ◎ +_ ‐ ‑ ‒ – — ― ~ – ━ ー -- - ‧ +w/ : LPUNC+; +% Split words that contain the following toekns in the middle of them. +% We don't want comma's in this list; it tends to mess up numbers. e.g. +% "The enzyme has a weight of 125,000 to 130,000" +% We don't want colon's in this list, it tends to mess up time +% expressions: "The train arriaves at 13:42" +% Some kind of fancier technique is needed for tokenizing those. +% +% TODO: this list should be expanded with other "typical"(?) junk +% that is commonly (?) in broken texts. +-- ‒ – — ― "(" ")" "[" "]" ... 
";": MPUNC+; + % Suffixes 's 're 've 'd 'll 'm ’s ’re ’ve ’d ’ll ’m: SUF+; % Prefixes +% "real" English prefix: y' w/ % Y'gotta Y'gonna -% coffe w/milk -% y' w/: PRE+; +% coffee w/milk +y' w/: PRE+; % The below is a quoted list, used during tokenization. Do NOT put % spaces in between the various quotation marks!! @@ -51,7 +54,7 @@ % The below is a quoted list, used during tokenization. Do NOT put % spaces in between the various symbols!! -"()¿¡†‡§¶©®℗№#*•⁂❧☞◊※○。゜✿☆*◕●∇□◇@◎–━ー---‧": BULLETS+; +"()¿¡†‡§¶©®℗№#*•⁂❧☞◊※○。゜✿☆*◕●∇□◇@◎–━ー—---‧": BULLETS+; /en/words/units.1: UNITS+; /en/words/units.1.dot: UNITS+; @@ -63,3 +66,5 @@ % units.6 contains just a single, sole slash in it. This allows units % such as mL/s to be split at the slash. /en/words/units.6: UNITS+; +% +/en/words/units.a: UNITS+; diff -Nru link-grammar-5.3.16/data/en/4.0.constituent-knowledge link-grammar-5.5.0/data/en/4.0.constituent-knowledge --- link-grammar-5.3.16/data/en/4.0.constituent-knowledge 2016-09-20 01:52:01.000000000 +0000 +++ link-grammar-5.5.0/data/en/4.0.constituent-knowledge 2018-03-12 00:03:31.000000000 +0000 @@ -1,12 +1,15 @@ -DOMAIN_STARTER_LINKS: S##* S##i SF SX Ce* Cet RS O OX Js Jp J* MVp Mp R* Rn Pa TH Pg Pv I PP Cr -Cs MX#* Wc Wd Wi Wa -TI BIt MVs TO Mv Mg* MVa OF ON IN QI Ma CP* CPi CPx MVt S##w MX#r Pp MVi MVg Mgp MVx SI MX#p Cc -S##t S##h S##b S##q L MX#a MG JG MX#x U S##d JT MVh Mr B#w B#d MVb COq Mj OD CX S##g PF Zc MX#d Bc -K NIax +DOMAIN_STARTER_LINKS: S##* S##b S##d S##g S##h S##i S##q S##s S##t S##w +SF SX Ce* Cet RS O OX Js Jp J* R* Rn Pa Pg Pp Pv TH I PP Cc Cr Cs +MX#* MX#a MX#d MX#p MX#r MX#x +MVa MVb MVg MVh MVi MVp MVs MVt MVx +Wc Wd Wi Wa +Ma Mg* Mgp Mj Mp Mr Mv +TI BIt TO OF ON IN QI CP* CPi CPx SI +L MG JG U JT B#w B#d COq OD CX PF Zc Bc K NIax -DOMAIN_CONTAINS_LINKS: +DOMAIN_CONTAINS_LINKS: -URFL_ONLY_DOMAIN_STARTER_LINKS: +URFL_ONLY_DOMAIN_STARTER_LINKS: LEFT_DOMAIN_STARTER_LINKS: A EA E CO#s CO* COd* YS YP Yt Yd GN DTie CO#n COp D##n ND @@ -21,15 +24,17 @@ ; 
constituent phrase-structure grammar is hardly straight-forward, as ; a quick browse of constituents.c will reveal. ; -STARTING_LINK_TYPE_TABLE: +STARTING_LINK_TYPE_TABLE: S##* v -S##i v -S##t v -S##h v S##b v -S##q v S##d z S##g v +S##h v +S##i v +S##q v +S##s v +S##t v +S##w z SF v SX v Wd s @@ -42,8 +47,8 @@ RS z O n OX n -Js n Jp n +Js n J* n MVp p Mp p @@ -85,7 +90,6 @@ CPx q MVt p MX#r b -S##w z Pp p MVg z Mgp z @@ -123,20 +127,20 @@ NIax h ; ---------------------------------------------------------------------- -; These links are not put in the word/link graph. They also cannot be the -; starter links for a domain. +; These links are not put in the word/link graph. They also cannot be the +; starter links for a domain. IGNORE_THESE_LINKS: Xca HA ; ---------------------------------------------------------------------- -; these links are not traced further if they point back before the root word +; these links are not traced further if they point back before the root word ; The WV, CV and IV links point to root verbs, and cause trouble if traced ; backwards. For example: "Every farmer has a donkey." The period at the ; end of the sentence triggers the trace-back, and blows away the subject. 
-RESTRICTED_LINKS: - B#* D##w B#w B#d AFh MVt Xx HL SFsic AFd Bc CX EAh +RESTRICTED_LINKS: + B#* D##w B#w B#d AFh MVt Xx HL SFsic AFd Bc CX EAh H HA PFc B#j Wd PF Z BW CV IV WV diff -Nru link-grammar-5.3.16/data/en/4.0.dict link-grammar-5.5.0/data/en/4.0.dict --- link-grammar-5.3.16/data/en/4.0.dict 2017-02-12 09:48:28.000000000 +0000 +++ link-grammar-5.5.0/data/en/4.0.dict 2018-04-27 00:38:44.000000000 +0000 @@ -10,8 +10,8 @@ % % %***************************************************************************% -% Dictionary version number is 5.3.15 (formatted as V5v3v15+) -: V5v3v15+; +% Dictionary version number is 5.5.0 (formatted as V5v5v0+) +: V5v5v0+; : EN4us+; % _ORGANIZATION OF THE DICTIONARY_ @@ -117,10 +117,28 @@ % noun-main-s -- singular % XXX FIXME: is often used with and sub % has a R+ & B+ on it. The problem here is that R+ & B+ should not -% be used with the J- here. This needs to be refactored to prevent -% this, or at least, cost it in some way. +% be used with the J- and sometimes not with the O-. So, for example, +% "I still remember the room I kissed him" - room is the object, and +% the "I" should not be relative. +% This needs to be refactored to prevent this, at least for J. How to +% handle the R+ & B+ on inappropriate O- is less clear. +% +% (Js- & {Mf+}): Allows constructions involving "of" to link locally, +% e.g. "Got it from the Abbey of Stratford Langthorne" +% links "of" to "Abbey" instead of "it". +% +% Ss*s+: blocks links to Ss*t- which accepts singular subject, but +% requires plural object. See "are.v" for examples. +% : - (Ss+ & ) or SIs- or Js- or Os- + (Ss*s+ & ) or SIs- or (Js- & {Mf+}) or Os- + or + or ; + +% noun-main-e: used for proper names (named entities) +% Os*e- is used to allow certain adjectival uses. +: + (Ss*s+ & ) or SIs- or (Js- & {Mf+}) or Os*e- or or ; @@ -195,7 +213,7 @@ % [@AN-].1: add a tiny cost so that A- is preferred to AN- when there % is a choice. 
The is because some nouns are also listed as adjectives, % and we want to use the adjective version A- link in such cases. -% [@AN- & @A-] has cost so that G links are prefered. +% [@AN- & @A-] has cost so that G links are preferred. % {[@AN-].1} & {@A- & {[[@AN-]]}}; : (@A- & {[[@AN-]]}) @@ -254,9 +272,6 @@ : TOo+ & IV+; : TOt+ & B+; -% Connects verb to coordinating conjunction. -: VC+; - : Ce+ & CV+; : Cet+ & CV+; : Cs+ & CV+; @@ -269,16 +284,19 @@ % Fronted prepositional and participle phrases, used with % subject-object inversion. Wp connect to preps only. -: [Wp- & PF+]; +% Why is there a cost? How big should the cost be? +% PFd+: prevent links to PFt- +: [dWp- & (dPFb+ or dPFd+)]0.1; % The use of COa here needs to be carefully re-examined; it is used much too freely. % COa+ is used to block links to COd- -% Xc+ & Ic+: connect to imperatives (infinitve verbs): "Anyhow, don't" +% Xc+ & Ic+: connect to imperatives (infinitive verbs): "Anyhow, don't" % Wc- & Xc+ & Qd+: subject-object inversion: "anyhow, am I right?" +% This gets a fairly stiff cost if the comma is missing. : {[[Wa-]]} & ((Xc+ & Ic+) or - (Wc- & (Xc+ or [()]) & Qd+) or + (Wc- & (Xc+ or [()]1.2) & Qd+) or ({Xd-} & (Xc+ or [[()]]) & [COa+])); % Just pure singular entities, no mass nouns @@ -290,10 +308,10 @@ % % We do NOT tag these with , a this messes up first-word % processing in tokenize.c. So for example, we do *not* want "There" -% in "There they are" tagged as an entity, just because its capitalized. +% in "There they are" tagged as an entity, just because it's capitalized. % We really do want to force the lower-case usage, because the lower case -% is in the dict, and its the right word to use. (The only entities that -% should be tagged as such are those that are in the dicts, in thier +% is in the dict, and it's the right word to use. (The only entities that +% should be tagged as such are those that are in the dicts, in their % capitalized form, e.g. 
"Sue.f" female given name as opposed to "sue.v" % verb in the sentence "Sue went to the store.") % @@ -307,7 +325,7 @@ % % MX+ & : country names: "...went to Paris, France" % -INITIALS : + : ({NM+} & ({G-} & {[MG+]} & (({DG- or [[GN-]] or [[@A- & @AN-]] or [[{@A-} & {D-}]] or ({@A-} & Jd- & Dmc-)} & (( & (JG- or )) @@ -320,17 +338,17 @@ or ; % As above, but with a tiny extra cost, so that a dictionary word is -% prefered to the regex match (i.e. for a common noun starting a +% preferred to the regex match (i.e. for a common noun starting a % sentence). However, the other regex matches (e.g. MC-NOUN-WORDS) % should have a cost that is even higher (so that we take the % capitalized version before we take any other matches.) -CAPITALIZED-WORDS: []0.05; +: []0.06; % Capitalized words that seem to be plural (by ending with an s, etc) % -- But not all words that end with an 's' are plural: % e.g. Cornwallis ... and some of these can take a singular determiner: % "a Starbucks" -PL-CAPITALIZED-WORDS: +: ({NM+} & {G-} & {[MG+]} & (({DG- or [[GN-]] or [[{@A-} & ({Dmc-} or {Ds-})]] or ({@A-} & Jd- & Dmc-) } & ([ & (JG- or )] @@ -352,7 +370,7 @@ % "Tom" is a given name, but can also be a proper name, so e.g. % "The late Mr. Tom will be missed." which needs A-, D- links % Wa-: A single exclamation: "Tom! Hey, Tom! Oh, hello John!" -% is trikcy when used with [[...]] connectors. +% is tricky when used with [[...]] connectors. 
% Careful for bad parses of % "This is the dog and cat Pat and I chased and ate" % "actress Whoopi Goldberg and singer Michael Jackson attended the ceremony" @@ -362,7 +380,7 @@ : {G-} & {[MG+]} & (({DG- or [GN-]2.1 or [[{@A-} & {D-}]]} & - (({@MX+} & {NMr+} & (JG- or or )) + (({@MX+} & {NMr+} & (JG- or or )) or YS+ or YP+)) or AN+ @@ -422,7 +440,7 @@ or Us- or ({D*u-} & Wa-)); -GREEK-LETTER-AND-NUMBER pH.i x.n: ; + pH.i x.n: ; % Same as pattern used in words.n.4 -- mass nouns or countable nouns : ; @@ -445,24 +463,9 @@ or Up- or ({Dmc-} & Wa-)); -%for YEAR-DATE year numbers -: - NMd- - or ({EN-} & (NIfn+ or NItn-)) - or NN+ - or AN+ - or Wa- - or ((Xd- & TY- & Xc+) or TY-) - or ({EN- or NIc-} - & (ND+ - or OD- - or ({{@L+} & DD-} - & ([[Dmcn+]] - or (( or TA-) & (JT- or IN- or [[]])))))); - % Number abbreviations: no.x No.x % pp. paragraph, page art article -% RR roural route +% RR rural route No.x No..x no.x no..x Nos.x Nos..x nos.x nos..x Nr.x Nr..x Nrs.x Nrs..x nr.x nr..x nrs.x nrs..x Num.x Num..x num.x num..x pp.x pp..x @@ -494,7 +497,8 @@ Blvd.y Blvd..y Pkwy.y Pkwy..y Hwy.y Hwy..y AG.y Assn.y Assn..y Corp.y Corp..y Co.y Co..y Inc.y Inc..y PLC.y -Pty.y Pty..y Ltd.y Ltd..y LTD.y Bldg.y Bldg..y and_Co GmBH.y: +Pty.y Pty..y Ltd.y Ltd..y LTD.y Bldg.y Bldg..y and_Co GmBH.y +et_al et_al. : ({[X-]} & G-) & {[[Xi+]]} & {[MG+]} & (({DG- or [[GN-]] or [[{@A-} & {D-}]]} & (({@MX+} & (JG- or )) or @@ -506,8 +510,12 @@ % Titles, e.g. Joe Blow, Esq. or Dr. Smarty Pants, Ph.D. -% Gack. See absurdely large collection at: +% Gack. See absurdly large collection at: % http://en.wikipedia.org/wiki/List_of_post-nominal_letters +% The generic case of strings of all-uppercase are handled +% by the regexes ALL-UPPER and INITIALS. The below are mixed-case +% or other very common degrees. 
+.y .y Jr.y Jr..y Sr.y Sr..y Esq.y Esq..y AB.y A.B..y AIA.y A.I.A..y BA.y B.A..y BFA.y B.F.A..y BS.y B.S..y BSc.y B.Sc..y @@ -532,8 +540,9 @@ G+); % The generic category for strings containing a hyphen -PART-NUMBER.n -HYPHENATED-WORDS.n: +.n +.n +.n: [[({@AN-} & {@A-} & (({NM+ or D-} & (( & ( or )) @@ -557,7 +566,7 @@ % % ({NMa+} & AN+): He takes vitamin D supplements. % -% XXX TODO fixme: there are many gerund-like nouns in here (e.g. "reading") +% XXX TODO FIXME: there are many gerund-like nouns in here (e.g. "reading") % which screw things up when linking to "be" (e.g. "I have to be reading now") % by appearing as objects (O-) connector when really the verb form (Pg-) % is what should be happening. So rip these words out... (similar remarks for @@ -623,7 +632,7 @@ /en/words/words.n.2.s : or ; -PL-GREEK-LETTER-AND-NUMBER: ; +: ; % plural nouns not ending in "s" % almost exactly identical to except that there is @@ -679,6 +688,7 @@ % If PH is possible, then it is preferred. See PH below for explanation. : [PHc-]-0.1 or (); : [PHv-]-0.1 or (); +: {PH-}; : @@ -692,9 +702,9 @@ % nouns that are mass only % absolutism.n absorption.n abstinence.n abundance.n academia.n -/en/words/words.n.3-vowel: +/en/words/words.n.3-vowel /en/words/words.lang.3-vowel: or ; -/en/words/words.n.3-const: +/en/words/words.n.3-const /en/words/words.lang.3-const: or ; % Gonna treat these as mass nouns, not sure if this is correct. @@ -711,7 +721,7 @@ % allow these to take AN+ links (we want to have red.a get used instead). % But we do need these as nouns, so as to parse 'she prefers red'. % However, assign a cost, so that 'her shoes are red' gets red.a (with -% the Pa link) perfered over red.n (with the O link). +% the Pa link) preferred over red.n (with the O link). % % Doesn't seem to need a noun-and-x to make this work ... % In other respects, these are kind-of-like mass nouns... 
@@ -739,7 +749,7 @@ or Wa-; % SINGULAR ENTITIES FOR ENTITY EXTRACTION -% This must appear after other categories so it doesnt interfere with those. +% This must appear after other categories so it doesn't interfere with those. /en/words/entities.national.sing: or ; @@ -832,6 +842,7 @@ or Us- or ({Ds-} & Wa-)); +% ----------------------------------------------------------------- % determiner constructions, with a dangling of: "a number of", "a lot of" % "I have a number of cookies" % "a pride of lions" "a litter of kittens" all take determiners @@ -864,33 +875,48 @@ or ({Ds**c-} & ) or Us-)); -% This gets a cost, so that the {Jd-} link for measures.1 is prefered. +% This gets a cost, so that the {Jd-} link for measures.1 is preferred. kind_of: [] or EA+ or EE+ or Wa-; -% This gets a cost, so that the {Jd-} link for measures.1 is prefered. +% This gets a cost, so that the {Jd-} link for measures.1 is preferred. type_of sort_of breed_of species_of: [] or [Us-] or [Wa-]; -% This gets a cost, so that the {Jd-} link for measures.2 is prefered. +% This gets a cost, so that the {Jd-} link for measures.2 is preferred. kinds_of types_of sorts_of breeds_of species_of: [{{@AN-} & @A-} & U+ & (({Dmc-} & & ( or )) or ({Dmc-} & ) or Up-)]; -percent.u: +% A+: "It has twice the percent value" +percent.u parts.u: ( & ((ND- & {DD-} & & ( or B*x+)) or + or (ND- & {DD-} & ) or - U-)) or - (ND- & (OD- or AN+ or YS+)); + U-)) + or (ND- & (OD- or AN+ or YS+)) + or ({E- or EA-} & A+); + +% Weight and voume in chemistry +% "An example would be a 5% weight per volume NaCl solution." +weight.u volume.u: + AN- & Mp+ & AN+; + +% "It is a a 5%(w/v) NaCl solution." +% "We measure the concentration in v/v %" +v/v vol/vol volume/volume w/v weight/vol weight/volume: + ({{Xd- & Xc+} & AN-} & AN+) + or EA+; +% ----------------------------------------------------------------- % This set of disjuncts should probably be split up and refined. 
% "shame.n", "crux.n" are here because they need the Ss*t connector % to pick up "that" in "The crux of it is that we must act first." @@ -962,7 +988,7 @@ proof.n doubt.n suspicion.n hope.n knowledge.n relief.n disclosure.n fear.n principle.n concern.n philosophy.n risk.n threat.n conviction.n theory.n speculation.n news.n belief.n contention.n thought.n myth.n -discovery.n rumor.n probability.n fact.n feeling.n comment.n +discovery.n rumor.n probability.n fact.n feeling.n comment.n process.n perception.n sense.n realization.n view.n consensus.n notification.n rule.n danger.n warning.n suggestion.n: ( & ( @@ -1089,7 +1115,7 @@ ; % Regex-based guessing of unknown words, ending in -ity -acy -ance -NOUN-TO-WORDS.n: +.n: ; % Nouns formerly classified as mass nouns (words.n.3) but can take "to" @@ -1105,9 +1131,9 @@ refusals.n pledges.n urges.n missions.n rights.n desires.n needs.n ambitions.n capacities.n mandates.n promises.n abilities.n options.n commitments.n intents.n opportunities.n plans.n plots.n -responsibilities.n chances.n campaigns.n offers.n pressures.n obligations orders.n -temptations.n vows.n permits.n impetuses.n proclivities.n propensities.n moves.n -votes.n bills.n incentives.n: +responsibilities.n chances.n campaigns.n offers.n pressures.n +obligations orders.n temptations.n vows.n permits.n impetuses.n +proclivities.n propensities.n moves.n votes.n bills.n incentives.n: ( & (({{Jd-} & Dmc-} & {@M+} & {( or (R+ & Bp+)) & {[[@M+]]}} & {@MXp+} & ( or @@ -1267,10 +1293,12 @@ % ==================================================================== %PRONOUNS +% MXs+: "he, the shop owner, ..." she he: - {[[R+ & Bs+]]} & ((Ss+ & ) or SIs- or SJls+); + {[[R+ & Bs+]]} & (({MXs+} & Ss+ & ) or SIs- or SJls+); % The E- is for "It's either us or them" ... not ideal, but OK +% See also me.p below. 
me him: J- or Ox- or ({[[E-]]} & SJl+) or SJr-; @@ -1317,11 +1345,13 @@ % yisser yousser ye'r: Irish English second-person possessive -- % https://en.wikipedia.org/wiki/Irish_English -its my.p your their.p our thy yisser.p yousser ye'r: +its my.p your their.p our thy.p yisser.p yousser ye'r: DP+ or ({AL-} & {@L+} & (D+ or DD+)); +% Possessive version of me % Cost on D, DD: avoids use as determiner on "Make me coffee" +%suppress: DUP-BASE (for me.p) me.p: DP+ or [{AL-} & {@L+} & (D+ or DD+)]; @@ -1356,35 +1386,39 @@ % (Osm- & {@M+}): "is this it?" "Do you have it ready?" % "leave it alone" "make it rain" % (the m prevents links as indirect object) +% MX+ & Ss+: "it -- fried rice -- is very popular" it: [J-] or (Osm- & {@M+}) or (Jd- & Dmu- & Os-) or (Jd- & Dmu- & {Wd-} & S+) - or ((Ss+ or SFsi+) & ) + or ({MX+} & (Ss+ or SFsi+) & ) or SIs- or SFIsi- or OXi- or Vp- or SJls+ or SJrs-; % O*c: "we will go faster than they" +% MXp+: "they, the twins, did it" they: - ({[[R+ & Bp+]]} & ((Sp+ & ) or SIp-)) or + ({[[R+ & Bp+]]} & (({MXp+} & Sp+ & ) or SIp-)) or Ox- or SJlp+ or SJrp-; % DD+: "... how we two should work together" +% MXp+: "We, the undersigned, ..." we: - (Sp+ & ) + ({MXp+} & Sp+ & ) or Ox- or DD+ or SIp- or SJlp+ or SJrp-; -% XXX why is this marked plural (Sp, SIp) ?? a cheap hack to make I've work? +% XXX why is this marked plural (Sp, SIp) ?? a cheap hack to make "I've" work? % We use here to prevent lower-case magnling by % the tokenizer. % SJrp-: allows only "...and I", disallows "I and ..." +% MXs+: "I, Joseph, rang the bell" I.p: - or ((Sp*i+ or SX+) & ) + or ((({MXs+} & Sp*i+) or SX+) & ) or SIp*i- or SJr- or SJl+ @@ -1454,7 +1488,7 @@ or DTn+ or Wa-; -% [[]] costs so that ditranstive verbs don't suffer: +% [[]] costs so that ditransitive verbs don't suffer: % "I taught these mice to jump", taught is ditransitive, we don't want % "these" to be the object. 
See also "those" % (Jd- & Dmu- & Op-): "I gave him a number of these" @@ -1467,7 +1501,7 @@ or or Wa-; -% [[]] costs so that ditranstive verbs don't suffer, +% [[]] costs so that ditransitive verbs don't suffer, % and get the D+ link instead of the O- link. % See also "these" those: @@ -1512,14 +1546,16 @@ % A naked costs more than one with other links, % so that ditransitive verbs don't get spurious links to all.a -% XXX can this be tighetend up?? -% costs no mater what, so that Ofd+ is prefered. +% XXX can this be tightened up?? +% costs no mater what, so that Ofd+ is preferred. % [E+]0.5: all modifying a verb probably is not right. +% Wa-: "All the people!" as a response to a question. all.a: ({EN-} & ( [E+]0.5 or Dm+ or NIc+ + or (ALx+ & (Jp+ or Ju+) & Wa-) or ( (@M+ or (ALx+ & (Jp+ or Ju+)) or (R+ & B+) or EL+) & []) @@ -1586,7 +1622,7 @@ : PHc+ or [()]0.2; : PHv+ or [()]0.2; % -% XXX why doesn't this clash with a.eq ?? +% See also a.eq below. a: ({(AA- & HA-) or ALa- or [[Lf+]]} & (Ds**c+ or ( & Ds**x+))) or NN+ or NSa+ or NIm+; @@ -1617,7 +1653,7 @@ % [[ & ]] or or Wa-; -% NNumeric modifier: "a couple of thousand dollars" +% Numeric modifier: "a couple of thousand dollars" a_couple_of: NN+ or ND+ or NIn+; @@ -1676,7 +1712,7 @@ or MVa-; % "a part.n" should cover most cases. Perhaps [[OF+ & ]] should be -% reomved?? Anyway, its costed to give OFd+ priority. Likewise, should probably +% removed?? Anyway, it's costed to give OFd+ priority. Likewise, should probably % retire as well, right? part.i: (OFd+ & Dm+) @@ -1699,7 +1735,7 @@ or [[]] or Wa-; -% costly so that OFd+ is prefered. +% costly so that OFd+ is preferred. rest.i: [[DD- & ]]; @@ -1860,7 +1896,8 @@ % For number, cost is added to the roles to prioritize % postmodifier and numeric determiner roles. 
- +% [[A+]]: "the five seat washed out" +% two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty twenty-one twenty-two twenty-three twenty-four @@ -1882,6 +1919,7 @@ oh-one oh-two oh-three oh-four oh-five oh-six oh-seven oh-eight oh-nine o-one o-two o-three o-four o-five o-six o-seven o-eight o-nine zero-one zero-two zero-three zero-four zero-five zero-six zero-seven zero-eight zero-nine +four-score several: NA+ or NMw- or @@ -1892,7 +1930,8 @@ ({{@L+} & DD-} & (Dmcn+ or ( & []))))) or (NA- & { & }) or (NA- & Xd- & TY- & Xc+) - or Wa-; + or Wa- + or [[A+]]; oh.zero: (NA- & NA+); zero.n: (NA- & NA+) or NN+ or Ds+ or ( & ) or Wa-; @@ -1905,7 +1944,7 @@ % as effectively Dmcn and ND are the "same thing" more or less. % % ({ND+} & NIfn+) or (NItn- & {ND+}): "between 7:30AM and 9:30AM" -NUMBERS FRACTION: + : NMn- or ({EN-} & (({ND+} & NIfn+) or (NItn- & {ND+}))) or NN+ @@ -1921,17 +1960,17 @@ % HMS-TIME consists of HH:MM:SS(AM|PM) type expressions % and should probably have a narrower set of parse patterns than numbers in % general. e.g. should not have EQ links XXX todo -- fix this. -HMS-TIME: NUMBERS & {TZ+}; +: & {TZ+}; % Allowing postposed roman numerals only for now. % e.g "Pope Pious XII" -ROMAN-NUMERAL-WORDS.rn: +.rn: NMr- or ((Wd- or NMr-) & NIa+); % nouns that look like roman numerals. Limited requirements to avoid % excessive ambiguity. -ROMAN-NUMERAL-WORDS.n: {@MX+} & (); +.n: {@MX+} & (); % NMa-: Allow post-posed letter modifiers: e.g. "Vitamin A" % Wd- & NIa+: Allow numbered, bulleted lists: "B: Press button firmly" @@ -1942,30 +1981,61 @@ A.id B.id C.id D.id E.id F.id G.id H.id J.id K.id L.id M.id N.id O.id P.id Q.id R.id S.id T.id U.id V.id W.id X.id Y.id Z.id: - or NMa- - or ((Wd- or NMa-) & NIa+) + or [NMa-]0.5 + or [(Wd- or NMa-) & NIa+]0.5 or Ju-; % Avoid having I.id interfere with pronoun I. I.id: [[ or NMa- or (Wd- & NIa+)]]; +% Variables: "suppose that X or Y is responsible." 
+J.n K.n L.n M.n N.n P.n Q.n R.n S.n T.n V.n W.n X.n Y.n Z.n: + + or + or ; + +% Given-name initials: +A. B. C. D. E. F. G. H. I. J. K. L. M. N. +O. P. Q. R. S. T. U. V. W. X. Y. Z. : + + or ({G-} & G+); + % Days of month % Note, however, this over-rides generic numbers in this range % and so is a superset of the generic numbers disjuncts % The following should match NUMBERS with the addition of "or TM-". 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31: -NUMBERS or TM- or [[G+]]; + or TM- or [[G+]]; % Ordinals - day-of-month expressions. -% Used only in espressions such as "December 2nd" +% Used only in expressions such as "December 2nd" % Must use regex here as well, to avoid conflict with other regexes first.ti second.ti third.ti fourth.ti fifth.ti sixth.ti seventh.ti eighth.ti ninth.ti tenth.ti eleventh.ti twelfth.ti thirteenth.ti fourteenth.ti fifteenth.ti sixteenth.ti seventeenth.ti eighteenth.ti nineteenth.ti twentieth.ti twenty-first.ti twenty-second.ti twenty-third.ti twenty-fourth.ti twenty-fifth.ti twenty-sixth.ti twenty-seventh.ti twenty-eighth.ti twenty-ninth.ti thirtieth.ti -thirty-first.ti DAY-ORDINALS.ti: TM-; +thirty-first.ti .ti: TM-; + +% For YEAR-DATE year numbers +% AN+ is given a cost, because attaches incorrectly to +% measurements of various kinds, where the number is not actually a +% date, and ND is the preferred linkage. +% This needs cleanup, I think ...!? +: + NMd- + or ({EN-} & (NIfn+ or NItn-)) + or NN+ + or [AN+] + or Wa- + or ((Xd- & TY- & Xc+) or TY-) + or ({EN- or NIc-} + & (ND+ + or OD- + or ({{@L+} & DD-} + & ([[Dmcn+]] + or (( or TA-) & (JT- or IN- or [[]])))))); % Years w/o apostrophe: e.g. 47 Ford Fairlane or 57 Chevy 01 02 03 04 05 06 07 08 09: or [[G+]]; @@ -1977,11 +2047,11 @@ 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99: - NUMBERS or or [[G+]]; + or or [[G+]]; % the DECADE-DATE regex matches 1950s 1950's 1950’s etc. 
% A+: It's an old 50's love song -DECADE-DATE + '00s '10s '20s '30s '40s '50s '60s '70s '80s '90s ‘00s ‘10s ‘20s ‘30s ‘40s ‘50s ‘60s ‘70s ‘80s ‘90s 00's 10's 20's 30's 40's 50's 60's 70's 80's 90's: @@ -1990,7 +2060,7 @@ % year numbers % 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 -YEAR-DATE: NUMBERS or or [[G+]]; +: or or [[G+]]; % Years: e.g. '47 Ford Fairlane or '57 Chevy '00 '01 '02 '03 '04 '05 '06 '07 '08 '09 @@ -2082,18 +2152,20 @@ & (ND+ or NIn+ or OD- or ({DD-} & ([[Ds+]] or Dmcn+ or ( & ))))); -and_a_half: (NW- or NSn-) & -(NNy+ or ({EN- or NIc-} & (ND+ or NIn+ or ({DD-} & -(Dmcn+ or ( & )))))); +and_a_half: + (NW- or NSn- or ND-) & + (NNy+ or ({EN- or NIc-} & (ND+ or NIn+ or ({DD-} & + (Dmcn+ or ( & )))))); quarter.i: -NS- & {EN-} & (NF+ or ( & )); + NS- & {EN-} & (NF+ or ( & )); thirds.m fourths.m quarters.m fifths.m sixths.m sevenths.m eighths.m ninths.m tenths.m: -NW- & {EN-} & (NF+ or ( & )); + NW- & {EN-} & (NF+ or ( & )); -first.a: L- or Pa- or E+ or MVa- or ({Xc+ & {Xd-}} & CO+) or A+ or [Jp-] or -TT+ or ((DD- or [[NSa-]]) & & {} & ); +first.a: + L- or Pa- or E+ or MVa- or ({Xc+ & {Xd-}} & CO+) or A+ or [Jp-] or + TT+ or ((DD- or [[NSa-]]) & & {} & ); last.a dead_last dead_fucking_last DFL: L- @@ -2106,8 +2178,9 @@ or A+ or [Jp-]; -second.a: L- or Pa- or MVa- or ({Xc+ & {Xd-}} & CO+) or -(DD- & & {} & ) or NR+ or A+; +second.a: + L- or Pa- or MVa- or ({Xc+ & {Xd-}} & CO+) or + (DD- & & {} & ) or NR+ or A+; % This uses the L link for superlatives, but leads to strange parses: % "We celebrated their eleventh anniversary" parses differently @@ -2163,7 +2236,7 @@ % prefer G+ over A+ in general, as these are typically parts of names. 
% ({Ds-} & AJla+): "he is in either the X or the Y battalion" 1º.a 2º.a 3º.a 4º.a 5º.a 6º.a 7º.a 8º.a 9º.a -DAY-ORDINALS.a ORDINALS.a: +.a .a: Pa- or MVa- or ({Xc+ & {Xd-}} & CO+) or @@ -2213,7 +2286,7 @@ ninety-first.ord ninety-second.ord ninety-third.ord ninety-fourth.ord ninety-fifth.ord ninety-sixth.ord ninety-seventh.ord ninety-eighth.ord ninety-ninth.ord -DAY-ORDINALS.ord ORDINALS.ord : +.ord .ord : (Wd- & {M+} & Ss*o+); % TODO: un-parenthesized cases, e.g. @@ -2221,6 +2294,7 @@ % - the strength was in the order of gerE > cotD > yfhP P2 > yfhP P1 % also remember "-->" +%suppress: DUP-BASE (for a.eq) A.eq B.eq C.eq D.eq E.eq F.eq G.eq H.eq I.eq J.eq K.eq L.eq M.eq N.eq O.eq P.eq Q.eq R.eq S.eq T.eq U.eq V.eq W.eq X.eq Y.eq Z.eq a.eq b.eq c.eq d.eq e.eq f.eq g.eq h.eq i.eq j.eq k.eq l.eq m.eq @@ -2232,7 +2306,7 @@ or_so: ND- & {{@L+} & DD-} & (Dmcn+ or ( & )); % Allows parsing of "dollars per day" or "mL/sec" but is somewhat -% inconsistent with the equation persing otherwise described below. +% inconsistent with the equation parsing otherwise described below. % XXX overall, eqn parsing could be strengthened. per "/.per": Us+ & Mp-; @@ -2251,7 +2325,7 @@ % : links verb to wall or to controlling phrase. % : links verbs to singular subjects % : links verbs to plural subjects -% : links to infinitve +% : links to infinitive % : to plural subjects or infinitives % : to singular or plural subject % : to past-participles @@ -2265,18 +2339,20 @@ % IV connects infinitives to the head-verb % VC connects the head-word to a subsequent coordinating conjunction. % -% There are some other such connectors that don't quite fit this patten: -% AF, Z, and in many cases B (for example TOt+ & B+) for this reason, we -% have to have a costly null [[()]] below, although we would really really +% There are some other such connectors that don't quite fit this pattern: +% AF, Z, and in many cases B (for example TOt+ & B+). 
For this reason, we +% have to have a costly null [()] below, although we would really really % like to get rid of it. But that would take a lot of Z and B and AF link % fiddling about, so we have to live with this for now. % % Also: CP-, Eq+ and COq+ all connect to verbs, and are so disjoined % with % -: ((dWV- or dCV- or dIV-) & {VC+}) or [[()]]; +: ((dWV- or dCV- or dIV-) & {VC+}) or [()]; % : (dWV- or dCV- or dIV-) & {VC+}; +: {@MV+} & {VC+}; + % When we are done, remove the option costly NULL below. : hWV+ or [[()]]; % : hWV+; @@ -2285,7 +2361,7 @@ % Pv- & no wall: "John felt vindicated" % The problem here is that for passives (i.e. to-be), The Pv should get the wall % but in the other cases it should not. We could/should tighten this up by using -% Pvp+ on to-be, using Pvv for the others, and demaninds the wall only for Pvp. +% Pvp+ on to-be, using Pvv for the others, and demand the wall only for Pvp. % XXX FIXME, the above needs fixing. % % : PP- & WV-: "I have seen it". @@ -2294,16 +2370,16 @@ % XXX FIXME: for certain transitive verbs, we really want verb-ico to be % in the form (I- & B- & ) for example: "that I did not know". 
% -: {@E-} & ((Ss- & ) or (RS- & Bs-)); -: {@E-} & ((Sp- & ) or (RS- & Bp-)); -: {@E-} & ((S- & ) or (RS- & B-)); +: {@E-} & ((Ss- & {hPFt-} & ) or (RS- & Bs-)); +: {@E-} & ((Sp- & {hPFt-} & ) or (RS- & Bp-)); +: {@E-} & ((S- & {hPFt-} & ) or (RS- & B-)); : {@E-} & PP- & {}; : {@E-} & (Pg- or Mg-); % Pv- & OFj+: "knowledge was gained of the activities" -: {@E-} & ((Pv- & {} & {OFj+}) or Mv-) & {@MV+}; -: {@E-} & ((Pv- & {} & {K+}) or Mv-) & {@MV+}; -: {@E-} & ((Pv- & {}) or Mv-); +: {@E-} & ((Pv- & {hPFt-} & {} & {OFj+}) or Mv-) & ; +: {@E-} & ((Pv- & {hPFt-} & {} & {K+}) or Mv-) & ; +: {@E-} & ((Pv- & {hPFt-} & {}) or Mv-); : or ; % used only in "as " constructions, which behave @@ -2312,19 +2388,23 @@ % % Sa*v-: "He did it as expected" % Sa*v- & MXsr-: "the movie, as filmed, is too long" -% The cost on MXsr+ is to give MVs preference for +% The cost on MXsr+ is to give MVs preference for % "She sang well, as planned" -% {MV+}: "the man, as acquiescing as he was, set a bad precedent." -% I want to use MVz+ for above, but it doesn't work... +% {MVz+}: "the man, as acquiescing as he was, set a bad precedent." % Pv- & CV- & MXsr-: "The accused, as shall be proven, is innocent" +% S- & CV- & MXsr-: "The accused, as I suspected, is innocent" +% I*x- & CV- & MXsr-: "The accused, as I will show, is innocent" +% The x on I*x blocks I*v, so that Pv- is used. : - (Sa*v- or (Pv- & CV-)) & {Xd-} & {[MXsr-]0.1 & {MV+}} & {Xc+}; + ((Sa*v- or EAy- or ({E-} & Pv- & CV-)) & {Xd-} & {[MXsr-]0.1 & {MVz+}} & {Xc+}) + or ({E-} & S- & CV- & {Xd-} & MXsr- & {Xc+}) + or ({E-} & I*x- & CV- & {Xd-} & MXsr- & {Xc+}); % Cost: "He was driven home" should NOT use driven.v as adjective! % From what I can tell, can be used anywhere that % can be... except forr said.v-d : - ({@E-} & [A+]0.5) + (({@E-} or {@EA-} or {@AN-}) & [A+ or Pa-]0.5) or ; % Wi- & {NM+}: imperative numbered lists: "Step 5. Do this." @@ -2332,15 +2412,16 @@ % allows a really weird subject-object inversion to proceed: % e.g. 
"In the corner lay it" with it as object. That's just % wrong... but this requires lots of places to fix. -: {@E-} & I- & ; -: {@E-} & ((I- & {} & {@E-}) or ({[CO-]} & Wi- & {NM+})); +% VJrpi-: "I aim to do something and also to help." +: {@E-} & I- & ( or VJrpi-); +: {@E-} & ((I- & { or VJrpi-} & {@E-}) or ({[CO-]} & Wi- & {NM+})); : or ; -: {@E-} & PF- & {} & SI+; -: {@E-} & PF- & {} & SIp+; +: {@E-} & hPF- & {} & hSI+; +: {@E-} & hPF- & {} & hSIp+; % is meant to be a generic replacement in the naked B- in -% many transitive verb constructions. For quetions, we need to force +% many transitive verb constructions. For questions, we need to force % a verb-wall connector; this is what the (B*w- or B*m-) & % part does. For the other B- forms, we don't need the wall. To force % the wall, we just list all the others. @@ -2405,7 +2486,7 @@ % Iq-: "The big question is did he do it?" % Xd- & Iq-: "The big question is, did he do it?" : Rw- or ({{Xd-} & Iq-} & (Qd- or ((Qw- or Qe-) & ))) or [()]; -% Just like above, but no aux, shuld always be anded with I+. +% Just like above, but no aux, should always be anded with I+. % The idea here is that the verb on the other end of the I+ will % connect to the wall. : Rw- or ({{Xd-} & Iq-} & (Qd- or Qw- or Qe-)) or [()]; @@ -2419,7 +2500,7 @@ : {@E-} & (((S- or SFsi-) & ) or (RS- & B-)); : {@E-} & PPf- & ; : {@E-} & (Pgf- or Mg-); -: {@E-} & ((Pvf- & ) or Mv-) & {@MV+}; +: {@E-} & ((Pvf- & ) or Mv-) & ; : {@E-} & ((Pvf- & ) or Mv-); % These are the verb-form expressions for special verbs that can take @@ -2427,7 +2508,7 @@ % These are used almost exclusively with auxiliary verbs. % This is why they don't have & in them: we don't want the % auxiliary attaching to the wall, we want only the main verb doing this. -% The Ss- or Sp- prevent attachements to Sa- for "as.e" phrases. +% The Ss- or Sp- prevent attachments to Sa- for "as.e" phrases. 
: {@E-} & (Sp- or SFp- or If- or (RS- & Bp-) or Wi-); : {@E-} & (Ss- or SFs- or (RS- & Bs-)); : {@E-} & (Ss- or SFs- or SFu- or (RS- & Bs-)); @@ -2449,12 +2530,16 @@ % Almost identical to the above, except that the verb attaches to the % wall. We cannot use verb-s for this, since the SFsi prevents the parse % of sentences like "there appears to be a problem". +% If- blocks the Ix+ on would, be : {@E-} & (((Sp- or SFp- or If-) & ) or (RS- & Bp-) or Wi-); : {@E-} & (((Ss- or SFs-) & ) or (RS- & Bs-)); : {@E-} & (((Ss- or SFs- or SFu-) & ) or (RS- & Bs-)); : {@E-} & (((S- or SF- or PPf-) & ) or (RS- & B-)); : {@E-} & (((S- or SF-) & ) or (RS- & B-)); +% Almost like above, but without the RS- & B- +: {@E-} & (Ss- or SFs- or SFu-) & ; + % conjoined verbs. % VJr == left % VJl == right @@ -2469,10 +2554,10 @@ % The conjunction should take an object if both verbs are transitive, % e.g. "I saw and greeted Sue", which should parse as % "I (saw and greeted) Sue". -% VJ**i == intranstive +% VJ**i == intransitive % VJ**t == transitive % -% s == singluar, pl == plural, sp == singular or plural +% s == singular, pl == plural, sp == singular or plural % g == gerund : {@E-} & VJrs-; : {@E-} & VJls+; @@ -2522,7 +2607,7 @@ % for example: "We neither ate nor drank for three days" -% present tense, but allows transitive connectinos to 'and' +% present tense, but allows transitive connections to 'and' % past tense macro, intransitive variation @@ -2587,10 +2672,10 @@ % O+ & : "did" is not an auxiliary, and so needs the wall. : (( - or (O+ & ) + or (O+ & {@MV+} & ) or [[@MV+ & O*n+]] or Vd+ - or ({N+} & (CX- or [[()]]))) & {@MV+}) + or ({N+} & (CX- or [[()]]))) & ) or ({N+} & I*d+); % I used verb-and-sp-i but maybe verb-and-pl is better? @@ -2599,23 +2684,25 @@ % Naked I*d-: "How do you do?", "what is there to do?" % I*t- & O+: forces use of object in to-do expressions. % "Are you really going to do it to them?" 
+% { or VJrpi-}: "I aim to help and also to do something" do.v: ({@E-} & (Sp- or SFp- or (RS- & Bp-) or ({Ic-} & Wi-)) & ) or ( & ([] or ())) or ( & ) or ((SIp+ or SFIp+) & (( & I*d+) or CQ-)) - or ({@E-} & I*t- & O+ & {@MV+} & IV-) - or ({@E-} & I- & (( or O+ or [[@MV+ & O*n+]] or CX-) & {@MV+}) & {}) + or ({@E-} & I*t- & O+ & IV- & ) + or ({@E-} & I- & (( or O+ or [[@MV+ & O*n+]] or CX-) & ) & { or VJrpi-}) or ({@E-} & I- & CV-) or ({@E-} & I*d- & {}); +% Ss- & : "so it does!" % Ss- & & @MV+: "he does as he pleases." does.v: (( & ()) or ( & ([] or ())) or (() & )) - or ({@E-} & Ss- & & @MV+) + or ({@E-} & Ss- & & ) or ((SIs+ or SFIs+) & (( & I*d+) or CQ-)); % Ss- & & @MV+: "he did as he pleased." @@ -2623,7 +2710,7 @@ did.v-d: ( & ) or ( & ) - or ({@E-} & Ss- & & @MV+) + or ({@E-} & Ss- & & ) or ( & ) or ( & ) or ((SI+ or SFI+) & (( & I*d+) or CQ-)); % @@ -2632,7 +2719,7 @@ % Os+ & @MV+: "I've done that chore many times" : - or (O+ & {@MV+}) + or (O+ & ) or [[@MV+ & O*n+]] or Vd+; @@ -2643,7 +2730,7 @@ ( & ([] or ())) or (() & )) or - or ( & ) + or ( & {}) or (S- & ); % Pa- & Pg+: "I am done working", "I am through being mad" @@ -2652,7 +2739,7 @@ or (AJra- & {@MV+}) or ({@MV+} & AJla+); -doing.v: & (O+ or or [[@MV+ & O*n+]] or Vd+) & {@MV+}; +doing.v: & (O+ or or [[@MV+ & O*n+]] or Vd+) & ; doing.g: ((O+ or or [[@MV+ & O*n+]] or Vd+) & {@MV+} & ) or ; better.i fine.i ok.i okay.i OK.i poorly.i well.i: {EE-} & Vd-; @@ -2666,8 +2753,8 @@ or ({@E-} & {Ic-} & Wi- & {I*d+}); doesn't doesn’t: - (( & (SIs+ or SFIs+) & I*d+) or - ) & (I*d+ or or [[()]]); + (( & (SIs+ or SFIs+) & I*d+) or ) + & (I*d+ or or [[()]]); didn't.v-d didn’t.v-d: (( & (SI+ or SFI+)) or ) @@ -2688,7 +2775,7 @@ : ({@MV+} & ( or [TO+]) & ) or ((B- or (O+ & )) & {@MV+} & {[I*j+ or Pv+]}) - or (([[@MV+ & O*n+]] or CX- or [[()]]) & {@MV+}) + or (([[@MV+ & O*n+]] or CX- or [[()]]) & ) or ({N+} & PP+); have.v: @@ -2719,30 +2806,35 @@ ( & ( or ((O+ or ) & {@MV+} & {[I*j+ or Pv+]}) or - (([[@MV+ & O*n+]] or 
CX-) & {@MV+}))) or + (([[@MV+ & O*n+]] or CX-) & ))) or [[(SI*j+ or SFI**j+) & PP+ & ((Xd- & VCq- & Xc+) or VCq- or ({{Xd-} & Xc+} & COp+))]]; %we'd they'd I'd he'd she'd you'd: (PP+ or ({Vw+} & I+)) & ; -’d 'd: S- & (PP+ or I+); + +% S- & I+: "I'd love to" +% RS- & Bs- & PP+: "He looked at the girl who'd been knitting" +’d 'd: + (S- & (PP+ or I+)) + or (RS- & Bs- & PP+); having.v: & ; having.g: ( & ) or ; % PP is disjoined with because when PP is used, has/have/had -% is an auxiliarry verb, an should not get a wall connection! +% is an auxiliary verb, an should not get a wall connection! hasn't hasn’t: (( & (SIs+ or SFIs+)) or ()) - & (PP+ or ((([[O+]] & {@MV+}) or [[()]]) & )); + & (PP+ or ((([[O+]] & ) or [[()]]) & )); haven't haven’t: (( & (SIp+ or SFIp+)) or ({@E-} & (Sp- or SFp- or (RS- & Bp-)))) - & (PP+ or ((([[O+]] & {@MV+}) or [[()]]) & )); + & (PP+ or ((([[O+]] & ) or [[()]]) & )); hadn't.v-d hadn’t.v-d: (( & (SI+ or SFI+)) or ({@E-} & (S- or SFs- or SFp- or (RS- & B-)))) - & (PP+ or ((([[O+]] & {@MV+}) or [[()]]) & )); + & (PP+ or ((([[O+]] & ) or [[()]]) & )); % Give [K+] a cost so as to prefer Pp+ in general %: @@ -2788,7 +2880,7 @@ or ( & OF+) or (Osi+ & R+ & Bs+ & ) or (Opi+ & R+ & Bp+ & ) - or ([[()]] & )) & {@MV+}) + or ([[()]] & )) & ) or ( & Pp+ & {THi+ or @MV+}) or THb+ or @@ -2804,7 +2896,7 @@ ({@EBm+} & ((([B**t-] or [K+] or BI+ or OF+ or PFb- or (Osi+ & R+ & Bs+) or (Opi+ & R+ & Bp+) or - [[()]]) & {@MV+}) or + [[()]]) & ) or (Pp+ & {THi+ or @MV+}) or THb+ or or @@ -2814,23 +2906,21 @@ % O*m+ allows "If only there were more!" % THb+ allows "It is your fault that you're a failure." -% The cost on @MV+ causes attachements to the object to be prefered -% over attachments to the copula; for example, prepositions should -% almost surely attach via Mp+ link to the object, as opposed to -% using an MVp+ link to the copula. Example: +% The @MV+ allows modifiers (e.g. prep modifiers) to attach both to +% the object and to the copula. 
Example: % "There is neither pine nor apple in the pineapple." % "There is no ham in the hamburger, and neither pine nor apple in the pineapple." : - {@EBm+} & (O*t+ or O*m+) & {[@MV+]} & {THb+}; + {@EBm+} & (O*t+ or O*m+) & {@MV+} & {THb+}; : - {@EBm+} & (Opt+ or Omm+) & {[@MV+]} & {THb+}; + {@EBm+} & (Opt+ or Omm+) & {@MV+} & {THb+}; : - {@EBm+} & (Ost+ or Opt+ or Omm+) & {[@MV+]} & {THb+}; + {@EBm+} & (Ost+ or Opt+ or Omm+) & {@MV+} & {THb+}; : - {@EBm+} & Out+ & {[@MV+]} & {THb+}; + {@EBm+} & Out+ & {@MV+} & {THb+}; : or ( & ); : or ( & ); @@ -2856,20 +2946,20 @@ ( & ) or ( & ) or ( & ) - or ( & (SIs*x+ or SIs*b+ or SFIs+) & {}) + or ({} & (SIs*x+ or SIs*b+ or SFIs+) & {}) or (Sa*a- & Pv+) or (Ss*w- & & Pp+ & TO+ & IV+) or (EQ*r- & S- & & EQ*r+); % Similar to above, but no S-O inversion, and no equation. -% Also, a cost, so that possesive 's is preferred. +% Also, a cost, so that possessive 's is preferred. % Also, this can be a contraction for "has": "he has" -> "he's" % & PP+: "He's gone to Boston" (viz "He has gone to Boston") % But also, some contractions are prohibited: % *That's just the kind of person he's -- so: Ss- & PF- not allowed % SIs+ & PFb-: "Where's the ball?" 's.v ’s.v: - [( & + [( & (({@EBm+} & ((( (O*t+ & ) or [K+ & ] @@ -2902,7 +2992,7 @@ are.v: ({@E-} & (Spx- or SFp- or (RS- & Bp-)) & ) or ({@E-} & Ss*t- & & ) - or ( & (SIpx+ or SFIp+) & {}); + or ({} & (SIpx+ or SFIp+) & {}); %we're they're I'm you're: %(({@EBm+} & (((O*t+ or K+ or BI+ or OF+ or Pp+) & {@MV+}) or or Pa+)) or @@ -2910,7 +3000,7 @@ % they're we're ’re 're: - Spx- & (({@EBm+} & ((((O*t+ & ) or K+ or BI+ or OF+ or Pp+) & {@MV+}) or or (Pa+ & ) )) or + Spx- & (({@EBm+} & ((((O*t+ & ) or K+ or BI+ or OF+ or Pp+) & ) or or (Pa+ & ) )) or ({N+} & (Pg+ or Pv+))); % yisser: "you're" Irish English @@ -2935,7 +3025,7 @@ or ( & ) or [[(SI*j+ or SFI**j+) & & ((Xd- & VCq- & Xc+) or VCq- or ({{Xd-} & Xc+} & COp+))]]; -% Ss*w-: allows Wh subjets: "Who am I?" 
+% Ss*w-: allows Wh subjects: "Who am I?" am.v: ({@E-} & SX- & ) or ( & SXI+ & {}) @@ -2945,7 +3035,7 @@ % I'm == I am ’m 'm: - SX- & (({@EBm+} & (((O*t+ or K+ or BI+ or OF+ or Pp+) & {@MV+}) or or Pa+)) or + SX- & (({@EBm+} & (((O*t+ or K+ or BI+ or OF+ or Pp+) & ) or or Pa+)) or ({N+} & (Pg+ or Pv+))) & ; % S*x- used for passive participles: "this action be taken". @@ -2971,7 +3061,7 @@ being.v: ((({@EBm+} & (((O*t+ or [B**t-] or Pp+ or K+ or OF+ or BI+ or or THb+) - & {@MV+}) + & ) or Pa+)) or AF- or Pv+) & ) @@ -3008,7 +3098,7 @@ % Sa*a- & Ix+: "..., as shall be proven" will.v can.v may.v must.v could.v might.v shall.v shalt.v: ((SI+ or SFI+) & (( & I+) or CQ-)) - or ({N+} & & (I+ or (CX- & {@MV+}) or or [[()]])) + or ({N+} & & (I+ or (CX- & ) or or [[()]])) or (Sa*a- & Ix+) or ( & {N+} & {@E-} & I+) or ({N+} & {@E-} & I+ & ); @@ -3023,14 +3113,14 @@ % : "You know you should." should.v: ((SI+ or SFI+) & (( & I+) or CQ-)) or - ({N+} & & (I+ or (CX- & {@MV+}) or or [[()]])) or + ({N+} & & (I+ or (CX- & ) or or [[()]])) or ( & I+) or (I+ & ) or [[(SI*j+ or SFI**j+) & I+ & ((Xd- & VCq- & Xc+) or VCq- or ({{Xd-} & Xc+} & COp+))]]; % : "I sure wish he would." would.v: ((SI+ or SFI+) & (( & {Vw+} & I+) or CQ-)) or - ({N+} & & (({RT+} & I+) or (CX- & {@MV+}) or or [[()]])) or + ({N+} & & (({RT+} & I+) or (CX- & ) or or [[()]])) or ( & I+) or (I+ & ); % TO+: "I ought to." @@ -3109,8 +3199,8 @@ % the verb form: 1=plural-infinitive, 2=singular, 3=past("ed"), % 4=progressive("-ing"), 5=gerund("-ing".) -% abbreviations for ditransitive and optionally ditranstive verbs -% ditranstive verbs take a direct and indirect object +% abbreviations for ditransitive and optionally ditransitive verbs +% ditransitive verbs take a direct and indirect object % e.g. "I gave her a rose" % B- & O+ & O*n+: "What are the chances you'll give her a kiss?" 
% O+ & @MV+ & O*n+: "I gave him for his birthday a very expensive present" @@ -3129,7 +3219,7 @@ % ----------------------------------------------------------- % common intransitive verbs -: {@MV+}; +: ; % XXX Hmmm. There is a fair number of verbs in here that are "weakly" % transitive, i.e. are transitive in various rare usages: @@ -3146,15 +3236,23 @@ % also could be sorted out. % % accrue.v ache.v acquiesce.v ad-lib.v adhere.v adjoin.v alight.v -/en/words/words.v.1.1: +% +% : "here rest the remains of St. Stephen" +% XXX Some but not all of these verbs allow inversion; basically, +% anything that is not locative or directive won't allow inversion. +% +/en/words/words.v.1.1: + (( & ({[[O+]]} & )) or ( & (({[[O+]]} & ) or ())) or - (({[[O+]]} & ) & )); + (({[[O+]]} & ) & )) + or ; % accounts.v accrues.v aches.v acquiesces.v ad-libs.v adheres.v % : Locative subj-obj inversion "far out in the sea lives a fish" % XXX Some but not all of these verbs allow inversion; basically, % anything that is not locative or directive won't allow inversion. +% /en/words/words.v.1.2: (( & ()) or @@ -3163,11 +3261,15 @@ or ; % accounted.v accrued.v ached.v acquiesced.v ad-libbed.v adhered.v +% Pa+: "He ad-libbed, uninterrupted." 
/en/words/words.v.1.3: - (( & ({[[O+]]} & )) or - ( & ([{[[O+]]} & ] or ())) or - (({[[O+]]} & ) & )) + (( & (({[[O+]]} & ) + or ({Xc+} & Pa+))) or + ( & ([({[[O+]]} & ) + or ({Xc+} & Pa+)] or ())) or + ((({[[O+]]} & ) + or ({Xc+} & Pa+)) & )) or ; % : "It was rusted closed" @@ -3192,7 +3294,7 @@ % -------------------------------------------------------------- % intransitive verbs that can take particles like "up" and "out" -: {K+} & {@MV+}; +: {K+} & ; % barge.v booze.v bottom.v bow.v branch.v breeze.v brim.v bulge.v cave.v /en/words/words.v.5.1: @@ -3225,6 +3327,7 @@ (( & ()) or ( & ([] or ())) or (() & )); + /en/words/words.v.5.4: ( & ) or or @@ -3232,9 +3335,9 @@ % -------------------------------------------------------------- -% irregular -- coming is in words.v.5.4 ... +% irregular : - ({(K+ & {Pa+}) or Pv+ or [[Pg+]] or } & {@MV+}) + ({(K+ & {Pa+}) or Pv+ or [[Pg+]] or } & ) or ({@MV+} & Pa+); come.v: @@ -3256,10 +3359,21 @@ ( & ([] or ())) or (() & )) or ; +% : "he got what he had coming" +coming.v: + ( & ) + or + (( & ()) or + ( & ([]0.2 or ())) or + (() & ) or + ) + or + or ; + % -------------------------------------------------------------- % optionally transitive verbs % abdicate.v abide.v abort.v accelerate.v acclimate.v acclimatize.v -: {O+ or or [[@MV+ & O*n+]]} & {@MV+}; +: {O+ or or [[@MV+ & O*n+]]} & ; /en/words/words.v.2.1: (( & ()) or @@ -3275,15 +3389,18 @@ % : only about half of the words in words.v.2.3 should get % this, the other half clearly should not. I'm too lazy to sort it out, % right now. +% has as a part of it. 
+% Pa+: "he paced, worried" +% & Pa+: "she was posed reclining" /en/words/words.v.2.3: - (( & ()) or - ( & ([]0.2 or ())) or - (() & ) or + (( & ( or ({Xc+} & Pa+))) or + ( & ([ or ({Xc+} & Pa+)]0.2 or ())) or + (( or ({Xc+} & Pa+)) & ) or ) - or + or ( & {{Xc+} & Pa+}) + or ( & O+ & Xc+) or - or or ; % Pa+: "The vase landed, unbroken" @@ -3384,7 +3501,6 @@ ) or ( & {{@MV+} & Pa+}) or - or or ; judging.v: & ; @@ -3393,7 +3509,7 @@ % -------------------------------------------------------------- -: {K+ or OD+} & {@MV+}; +: {K+ or OD+} & ; rise.v fall.v: (( & ()) or ( & (() or ())) or @@ -3444,10 +3560,10 @@ % [A+]0.5: He was xxx'ed there should have xxx as verb not adjective. % : - ((K+ & {[[@MV+]]} & (O*n+ or Pa+ or Pv+)) + ((K+ & {[[@MV+]]} & (O*n+ or ({Xc+} & (Pa+ or Pv+)))) or ({O+ or } & {K+}) or [[@MV+ & O*n+]] - ) & {@MV+}; + ) & ; /en/words/words.v.6.1: @@ -3466,7 +3582,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or or ({K+} & ); @@ -3570,7 +3686,7 @@ or ((O+ or ) & ({@MV+} & Pa**j+)) or ({@E-} & & O+ & {Pa**j+ or K+}) or [[@MV+ & O*n+]] - ) & {@MV+}; + ) & ; catch.v drive.v strike.v: @@ -3658,7 +3774,7 @@ or or [[@MV+ & O*n+]] or ({@E-} & & O+) - ) & {@MV+}; + ) & ; /en/words/words.v.4.1 : (( & ()) or @@ -3681,15 +3797,15 @@ ); % is too broad for most of these, but is OK for many. 
+% is part of /en/words/words.v.4.3: (( & ()) or ( & ([]0.2 or ())) or (() & ) or ) - or - or (({@E-} or {@EA-} or {@AN-}) & [A+]0.5) - or + or ( & {{Xc+} & Pa+}) + or or ; % !?XXX many of the new additions fail some common sentences, such as: @@ -3700,9 +3816,8 @@ ( & ([]0.2 or ())) or (() & ) or ) - or - or (({@E-} or {@EA-} or {@AN-}) & [A+]0.5) - or + or ( & {{Xc+} & Pa+}) + or or ; /en/words/words.v.4.4 @@ -3721,7 +3836,7 @@ (() & ) or ); -hit.v-d misread.v-d shed.v-d rid.v-d overcome.v-d offset.v-d +hit.v-d misread.v-d shed.v-d rid.v-d overcome.v-d overrun.v-d upset.v-d undercut.v-d: (( & ()) or @@ -3751,7 +3866,7 @@ or or [[@MV+ & O*n+]] or ({@E-} & & O+) - ) & {@MV+}; + ) & ; frank.v : (( & ()) or @@ -3785,7 +3900,7 @@ (() & ) or ) or - or (({@E-} or {@EA-} or {@AN-}) & A+) + or or ; corralling.v crowning.v decanting.v inking.v intoning.v roping.v: & ; @@ -3810,7 +3925,7 @@ (() & ) or ); -% A+: "she gave him some slcied bread" +% A+: "she gave him some sliced bread" baked.v-d sliced.v-d: (( & ()) or @@ -3818,7 +3933,7 @@ (() & ) or ) or - or (({@E-} or {@EA-} or {@AN-}) & A+) + or or ; dictated.v-d kissed.v-d: @@ -3838,7 +3953,7 @@ % A+: "She gave him some buttered bread" : - or ((O+ & K+) & {@MV+}) + or ((O+ & K+) & ) or ; butter.v: (( & ()) or @@ -3854,7 +3969,7 @@ ( & ([] or ())) or (() & )) or - or (({@E-} or {@EA-} or {@AN-}) & A+); + or ; buttering.v: & ; buttering.g: @@ -3868,12 +3983,14 @@ % % B- & O+ & {K+}: % what are the chances she will TRACK him down to the farm? 
-% +% Pa+: "he cut out after fifth period" +% K+ & Pa+: "it washed up, unbroken" : ((K+ & {[[@MV+]]} & O*n+) or ((O+ or ) & {K+}) or ({@E-} & & O+ & {K+}) - or [[@MV+ & O*n+]]) & {@MV+}; + or ({K+} & {Xc+} & Pa+) + or [[@MV+ & O*n+]]) & ; /en/words/words.v.8.1: (( & ()) or @@ -3900,34 +4017,35 @@ (( & ()) or ( & ([] or ())) or (() & )) or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or or - or ({K+} & ); % : only about half of the words in words.v.8.3 should get % this, the other half clearly should not. I'm too lazy to sort it out, % right now. +% Pa+: "it washed up unbroken" /en/words/words.v.8.3: (( & ()) or ( & ([]0.2 or ())) or (() & ) or ) or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or or - or ({K+} & ); -cut.v: +cut.v-d: (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ) or - ( & ) or - ( & {K+} & {@MV+}) or - or ({K+} & ); + ) + or ( & ) + or ( & {K+} & ) + or ( & O+ & Xc+) + or + or ({K+} & ); /en/words/words.v.8.4: & ; % : has O+ in it; so must not have S+ in @@ -3937,7 +4055,7 @@ ; % -------------------------------------------------------------- -: (((O+ or ) & {OD+}) or [[@MV+ & O*n+]]) & {@MV+}; +: (((O+ or ) & {OD+}) or [[@MV+ & O*n+]]) & ; raise.v lower.v up.v: (( & ()) or ( & (() or ())) or @@ -4037,6 +4155,15 @@ (() & )); happening.v occuring.v occurring.v: ( & ) or ; +% : a subset of , used for urges/desires +% Allows "he does as he pleases" type constructions, using the +% CP link in a paraphrasing-like way. +% O+: "... as it pleases him" +% Pv-: "... as it was hoped" (XXX why not PP-, here ???) 
+: + {E-} & (S- or PP- or Pv-) & & {Xd-} & [dCPu-]-0.05 & + ({O+ & } or {@MV+ & Xc+}); + % ditransitve : ({@MV+} & {TH+ or or RSe+ or }) or @@ -4048,11 +4175,11 @@ wishes.v: (( & ()) or ( & ([] or ())) or - (() & )); + (() & )) or ; wished.v-d: (( & ()) or ( & ([] or ())) or - (() & )); + (() & )) or ; wishing.v: & ; wishing.g: ( & ) or ; @@ -4063,18 +4190,21 @@ ({@MV+} & {TH+ or or RSe+ or }) or [[O+ & {@MV+}]]; -hope.v agree.v pretend.v swear.v pray.v vow.v vote.v: +hope.v agree.v pretend.v swear.v pray.v vow.v vote.v: + (( & ()) or ( & (() or ())) or (() & )); -hopes.v agrees.v pretends.v swears.v prays.v vows.v votes.v: +hopes.v agrees.v pretends.v swears.v prays.v vows.v votes.v: + (( & ()) or ( & ([] or ())) or - (() & )); -pretended.v-d prayed.v-d: + (() & )) or ; +pretended.v-d prayed.v-d: + (( & ()) or ( & ([] or ())) or - (() & )); + (() & )) or ; % The ( & THi+) allows "it is hoped that ..." to parse. % Naked Pv+: "..., as was hoped." @@ -4085,6 +4215,7 @@ (() & ) or ) or ( & {THi+}) + or or ; % Naked Pv+: "..., as was agreed." @@ -4095,7 +4226,9 @@ (() & ) or ) or ( & {TH+}) + or or ; + swore.v-d: (( & ()) or ( & ([]0.2 or ())) or @@ -4104,44 +4237,55 @@ sworn.v: (( & ()) or ( & ([] or ())) or - (() & )) or ; + (() & )) or or ; + hoping.v agreeing.v pretending.v swearing.v praying.v vowing.v voting.v: ( & ) or ; +: {@E-} & S- & hPFt- & ; + % XXX Why is there a cost on Pv+ ?? "John appeared vindicated" % N+: "It appears not" +% : "so it seems", "so it appears" +% Ix- & PF- & : "so it would seem" : - {@MV+} & {(Pa+ & ) or or THi+ or AF- or N+ or [Pv+]}; + {@MV+} & {(Pa+ & ) + or + or THi+ + or AF- + or N+ + or [{Xc+} &Pv+]}; appear.v: (( & ()) or ( & ([] or ())) or - (() & )); + (() & )) or (Ix- & PF- & ); appears.v: (( & ()) or ( & ([] or ())) or - (() & )); + (() & )) or ; appeared.v-d: (( & ()) or ( & ([] or ())) or - (() & )); + (() & )) or ; appearing.v: ( & ) or ; % XXX Why is there a cost on Pv+ ?? 
"John seemed vindicated" % N+: "It seems not" +% : "so seems it" : {@MV+} & ((Pa+ & ) or or LI+ or THi+ or AF- or N+ or [Pv+]); seem.v: (( & ()) or ( & ([] or ())) or - (() & )); + (() & )) or (Ix- & PF- & ); seems.v: (( & ()) or ( & ([] or ())) or - (() & )); + (() & )) or or ; seemed.v-d: (( & ()) or ( & ([] or ())) or - (() & )); + (() & )) or or ; seeming.v: ( & ) or ; : {@MV+} & { or QI+}; @@ -4165,14 +4309,16 @@ (( & ()) or ( & (() or ())) or - (() & )); + (() & )) or ; + asserts.v contends.v remarks.v retorts.v intimates.v exclaims.v conjectures.v alleges.v surmises.v opines.v insinuates.v supposes.v: (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ); + ) or ; + retorted.v intimated.v exclaimed.v conjectured.v surmised.v-d opined.v-d insinuated.v-d: (( & ()) or @@ -4184,7 +4330,9 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & THi+); + ( & THi+) or + ; + alleged.v-d: (( & ()) or @@ -4193,6 +4341,7 @@ ) or ( & THi+) or ; + supposed.v-d: (( & ()) or @@ -4200,8 +4349,7 @@ (() & ) or ) or ( & { or THi+ or Z-}) - or - or ; + or ; asserting.v contending.v remarking.v retorting.v intimating.v exclaiming.v conjecturing.v alleging.v surmising.v opining.v insinuating.v @@ -4268,7 +4416,7 @@ ( & ) or ; -: {@MV+} & { or TH+ or RSe+ or (OF+ & {@MV+}) or BW-}; +: {@MV+} & { or TH+ or RSe+ or (OF+ & ) or BW-}; dream.v complain.v: (( & ()) or ( & (() or ())) or @@ -4290,7 +4438,7 @@ % O+ & O*n: "She will think it an act of kindness." % O+ & Pa**j: "She will think it true." : - ({@MV+} & { or TH+ or RSe+ or Z- or (OF+ & {@MV+}) or BW-}) + ({@MV+} & { or TH+ or RSe+ or Z- or (OF+ & ) or BW-}) or (O+ & {@MV+} & {O*n+ or Pa**j+}); think.v: @@ -4387,7 +4535,7 @@ % go.w: {E-} & (Wi- or S-) & I+; % B-: "which way did it go?" -: {K+ or [[Pa+]] or [Pg+] or I*g+ or } & {@MV+}; +: {K+ or [[{Xc+} & Pa+]] or [Pg+] or I*g+ or } & ; go.v: (( & ()) or ( & (() or ())) or @@ -4419,7 +4567,7 @@ % XXX TODO maybe need VJ and-able links for going etc. ??? 
% : "there is going to be a meeting" going.v goin'.v: - (( or ({K+ or [[Pa+]]} & {@MV+})) & ) or + (( or ({K+ or [[{Xc+} & Pa+]]} & )) & ) or or ; @@ -4432,7 +4580,7 @@ or ({@E-} & Sp*i- & WV- & I*t+); % transitive: "stay the prisoner's execution" -: {({@MV+} & (Pa+ or AF-)) or ({K+} & {@MV+}) or (O+ & {@MV+})}; +: {({@MV+} & (Pa+ or AF-)) or ({K+} & ) or (O+ & )}; stay.v: (( & ()) or ( & (() or ())) or @@ -4449,7 +4597,7 @@ ) or ; staying.v: ( & ) or ; -: {({@MV+} & Pa+) or ({O+ or } & {K+} & {@MV+})}; +: {({@MV+} & Pa+) or ({O+ or } & {K+} & )}; stand.v sit.v: (( & ()) or ( & (() or ())) or @@ -4467,7 +4615,7 @@ standing.v sitting.v: & ; standing.g sitting.g: ( & ) or or ; -: ({@MV+} & {LI+ or Pa+ or AF-}) or {O+ & K+ & {@MV+}}; +: ({@MV+} & {LI+ or Pa+ or AF-}) or {O+ & K+ & }; sound.v: (( & ()) or ( & (() or ())) or @@ -4485,7 +4633,7 @@ sounding.v: ( & ) or ; % K: "He is acting up" -: {({@MV+} & (LI+ or Pa+)) or ({K+ or AF-} & {@MV+})}; +: {({@MV+} & (LI+ or Pa+)) or ({K+ or AF-} & )}; act.v: (( & ()) or ( & (() or ())) or @@ -4498,7 +4646,7 @@ acted.v-d: (( & ()) or ( & ([] or ())) or - (() & )) or ( & {K+} & {@MV+}); + (() & )) or ( & {K+} & ); acting.v: ( & ) or ; % Pa: The team reigns undefeated @@ -4525,9 +4673,9 @@ % K+ connects to particles. % [Pa+]0.1: prefer MVa to Pa whenever possible: "She look right" % [K+]0.2: prefer Pa+ to K+ whenever possible: "She looked up" -: {({@MV+} & (LI+ or [Pa+]0.1)) - or ({[K+]0.2 or AF-} & {@MV+}) - or ((O+ & K+) & {@MV+})}; +: {({@MV+} & (LI+ or [{Xc+} & Pa+]0.1)) + or ({[K+]0.2 or AF-} & ) + or ((O+ & K+) & )}; look.v: (( & ()) or ( & (() or ())) or @@ -4549,7 +4697,7 @@ % MV+: "who did you wave to?" 
: {@MV+} - or ((O+ & K+) & {@MV+}) + or ((O+ & K+) & ) or ; wave.v: (( & ()) or @@ -4612,7 +4760,7 @@ (() & )); consisting.v: ( & ) or ; -: {K+ or OF+} & {@MV+}; +: {K+ or OF+} & ; die.v: (( & ()) or ( & (() or ())) or @@ -4627,7 +4775,7 @@ (() & )); dying.v: ( & ) or ; -: {({[[@MV+]]} & OT+) or BT-} & {@MV+}; +: {({[[@MV+]]} & OT+) or BT-} & ; last.v wait.v: (( & ()) or ( & (() or ())) or @@ -4646,21 +4794,24 @@ % TRANSITIVE COMPLEX VERBS (Those that take O+) : or ({@MV+} & ); + attempt.v undertake.v deserve.v manage.v plot.v prefer.v neglect.v -afford.v commit.v profess.v desire.v: +afford.v commit.v profess.v desire.v please.v: (( & ()) or ( & (() or ())) or (() & )); -attempts.v undertakes.v deserves.v manages.v plots.v prefers.v -neglects.v affords.v commits.v professes.v desires.v: + +attempts.v undertakes.v manages.v plots.v prefers.v +neglects.v affords.v commits.v professes.v: (( & ()) or ( & ([]0.2 or ())) or (() & ) or ); -attempted.v deserved.v managed.v plotted.v preferred.v neglected.v afforded.v committed.v -professed.v-d desired.v-d: + +attempted.v managed.v plotted.v preferred.v neglected.v +afforded.v committed.v professed.v-d: (( & ()) or ( & ([]0.2 or ())) or @@ -4669,6 +4820,7 @@ or or ; + undertook.v-d: (( & ()) or ( & ([]0.2 or ())) or @@ -4678,20 +4830,50 @@ (( & ()) or ( & ([] or ())) or (() & )) or ; -attempting.g undertaking.g deserving.g -plotting.g preferring.g neglecting.g affording.g committing.g professing.g -desiring.g: -( & ) or ; -managing.g: ( & ) or or ; -attempting.v undertaking.v deserving.v managing.v -plotting.v preferring.v neglecting.v affording.v committing.v professing.v -desiring.v: - & ; + +attempting.g undertaking.g deserving.g preferring.g +neglecting.g affording.g committing.g desiring.g +pleasing.g: + ( & ); + +% : "The constant plotting annoyed Jim" +plotting.g professing.g: + ( & ) or ; + +managing.g: + ( & ) or or ; + +attempting.v undertaking.v deserving.v managing.v plotting.v +preferring.v neglecting.v 
affording.v committing.v professing.v +desiring.v pleasing.v: + & ; + +% : "he gets as he deserves." +deserves.v desires.v pleases.v: + + (( & ()) or + ( & ([]0.2 or ())) or + (() & ) or + ) or ; + +deserved.v-d desired.v-d pleased.v-d: + + (( & ()) or + ( & ([]0.2 or ())) or + (() & ) or + ) or + or + or + or + ; % like but with particle -: ({@MV+} & ) or -((((O+ or ) & {K+}) or (K+ & {[[@MV+]]} & O*n+) or -([[@MV+ & O*n+]])) & {@MV+}); +: + ({@MV+} & ) + or ((((O+ or ) & {K+}) + or (K+ & {[[@MV+]]} & O*n+) + or ([[@MV+ & O*n+]])) & ); + seek.v: (( & ()) or ( & (() or ())) or @@ -4710,12 +4892,16 @@ seeking.v: & ; % Naked @MV+: "She volunteered as a nurse." -: {} or ({@MV+} & ) or @MV+; -decline.v fail.v hasten.v volunteer.v aim.v: +: {} or ({@MV+} & ) or ; + +decline.v fail.v hasten.v volunteer.v aim.v: + (( & ()) or ( & (() or ())) or (() & )); -declines.v fails.v hastens.v volunteers.v aims.v: + +declines.v fails.v hastens.v volunteers.v aims.v: + (( & ()) or ( & ([]0.2 or ())) or (() & ) or @@ -4751,7 +4937,7 @@ ({@MV+} & ) or ({({O+ or } & {K+}) or (K+ & {[[@MV+]]} & O*n+) or - [[@MV+ & O*n+]]} & {@MV+}); + [[@MV+ & O*n+]]} & ); fight.v: (( & ()) or ( & (() or ())) or @@ -4765,7 +4951,7 @@ (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ) or ( & {K+} & {@MV+}) or + ) or ( & {K+} & ) or ({K+} & ); fighting.g: ( & ) or ; fighting.v: & ; @@ -4784,7 +4970,7 @@ (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ); + ) or ; threatened.v-d meant.v-d arranged.v-d pledged.v-d: @@ -4792,9 +4978,9 @@ ( & ([]0.2 or ())) or (() & ) or ) or + or or or - or ; meaning.g arranging.g threatening.g pledging.g: @@ -4888,7 +5074,7 @@ : {} or - ({@MV+} & (TH+ or or or RSe+ or Zs- or QI+ or (OF+ & {@MV+}))); + ({@MV+} & (TH+ or or or RSe+ or Zs- or QI+ or (OF+ & ))); learn.v: (( & ()) or ( & (() or ())) or @@ -4925,7 +5111,6 @@ ) or ( & {THi+ or TSi+ or Z-}) or - or or ; proposing.g: ( & ) or ; proposing.v: & ; @@ -5073,7 +5258,7 @@ : ((({O+ or } & {K+}) or (K+ & {[[@MV+]]} & O*n+) or - 
[[@MV+ & O*n+]]) & {@MV+}) or + [[@MV+ & O*n+]]) & ) or ({@MV+} & ( or Pg+)); start.v stop.v try.v: @@ -5091,7 +5276,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or ({K+} & ); starting.g stopping.g trying.g @@ -5132,7 +5317,6 @@ ) or ( & {THi+}) or - or or ; recognizing.g disputing.g accepting.g calculating.g deducing.g recording.g envisioning.g recounting.g signifying.g clarifying.g disclosing.g @@ -5145,7 +5329,8 @@ undisputed.v: ( & {THi+}); -: {} or ({@MV+} & TH+); +% MVp+: "it repeated for ..." "She provided for ..." +: {} or ({@MV+} & TH+) or MVp+; repeat.v reflect.v provide.v counter.v signal.v: (( & ()) or ( & (() or ())) or @@ -5162,9 +5347,8 @@ (() & ) or ) or or - or or - ; + ; provided.v-d: (( & ()) or @@ -5213,7 +5397,6 @@ ) or ( & {THi+}) or - or or ; sensing.v doubting.v reckoning.v reaffirming.v stipulating.v @@ -5286,23 +5469,25 @@ ) or ( & {THi+}) or - or or ; imagining.g: ( & ) or ; imagining.v: & ; -% Pa**j link: The doctor declared him insane. +% Pa**j+: "The doctor declared him insane." 
+% MVa+: "he will suspect soon", "he suspects already" : - or - ({@MV+} & ( or TH+ or RSe+ or Pg+ or Z-)) or - ((O+ or ) & ({@MV+} & Pa**j+)); + + or ({@MV+} & ( or TH+ or RSe+ or Pg+ or Z-)) + or ((O+ or ) & ({@MV+} & Pa**j+)) + or MVa+; declare.v fear.v conclude.v suspect.v concede.v presume.v foresee.v emphasize.v maintain.v acknowledge.v note.v confirm.v stress.v assume.v: (( & ()) or ( & (() or ())) or - (() & )); + (() & )) or ; + declares.v fears.v concludes.v suspects.v concedes.v presumes.v foresees.v emphasizes.v maintains.v acknowledges.v notes.v confirms.v stresses.v assumes.v: @@ -5310,7 +5495,8 @@ (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ); + ) or ; + declared.v feared.v concluded.v suspected.v conceded.v presumed.v emphasized.v maintained.v acknowledged.v noted.v confirmed.v-d stressed.v-d assumed.v-d: @@ -5321,13 +5507,13 @@ ) or ( & {THi+ or ({@MV+} & Pa+) }) or - or or ; + foresaw.v-d: (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ); + ) or ; foreseen.v: (( & ()) or @@ -5374,7 +5560,7 @@ ({@MV+} & ( or TH+ or RSe+)) or ((({O+ or } & {K+}) or (K+ & {[[@MV+]]} & O*n+) or - [[@MV+ & O*n+]]) & {@MV+}); + [[@MV+ & O*n+]]) & ); rule.v add.v: (( & ()) or ( & (() or ())) or @@ -5390,7 +5576,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & {({@MV+} & (THi+ or )) or ({K+} & {@MV+})}) or + ( & {({@MV+} & (THi+ or )) or ({K+} & )}) or ({K+} & ) or ; ruling.g adding.g: ( & ) or ; @@ -5399,7 +5585,7 @@ % with particle : ({@MV+} & (TH+ or Zs- or )) or - ((((O+ or ) & {K+}) or (K+ & {[[@MV+]]} & O*n+)) & {@MV+}) or + ((((O+ or ) & {K+}) or (K+ & {[[@MV+]]} & O*n+)) & ) or ([[@MV+ & O*n+]]); figure.v: (( & ()) or @@ -5416,7 +5602,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or ({K+} & ); figuring.g: ( & ) or ; figuring.v: & ; @@ -5429,7 +5615,7 @@ % I- & B- & : "What did John say you should do?" 
predict.v realize.v discover.v determine.v announce.v say.v mention.v admit.v -recall.v reveal.v state.v observe.v indicate.v +recall.v reveal.v divulge.v state.v observe.v indicate.v analyse.v analyze.v assess.v establish.v evaluate.v examine.v question.v test.v hypothesize.v hypothesise.v document.v envisage.v: @@ -5439,21 +5625,23 @@ or (I- & & ); predicts.v realizes.v discovers.v determines.v announces.v says.v -mentions.v admits.v recalls.v reveals.v states.v observes.v indicates.v +mentions.v admits.v recalls.v reveals.v divulges.v states.v observes.v +indicates.v analyses.v analyzes.v assesses.v establishes.v evaluates.v examines.v -questions.v tests.v hypothesizes.v hypothesises.v envisages.v documents.v: +questions.v tests.v hypothesizes.v hypothesises.v envisages.v +documents.v: (( & ()) or ( & ([]0.2 or ())) or (() & ) or ); -predicted.v realized.v discovered.v determined.v announced.v mentioned.v -admitted.v recalled.v revealed.v stated.v observed.v indicated.v -analysed.v analyzed.v assessed.v established.v evaluated.v examined.v -questioned.v tested.v -hypothesized.v-d hypothesised.v-d well-established.v-d documented.v-d -envisaged.v-d: +predicted.v-d realized.v-d discovered.v-d determined.v-d announced.v-d +mentioned.v-d admitted.v-d recalled.v-d revealed.v-d divulged.v-d +stated.v-d observed.v-d indicated.v-d analysed.v-d analyzed.v-d +assessed.v-d established.v-d evaluated.v-d examined.v-d questioned.v-d +tested.v-d hypothesized.v-d hypothesised.v-d well-established.v-d +envisaged.v-d documented.v-d: (( & ()) or ( & ([]0.2 or ())) or @@ -5461,7 +5649,6 @@ ) or ( & {THi+}) or - or or ; % the second line is almost, not quite, @@ -5477,13 +5664,13 @@ predicting.g realizing.g discovering.g determining.g announcing.g saying.g mentioning.g admitting.g recalling.g revealing.g -stating.g observing.g indicating.g +divulging.g stating.g observing.g indicating.g analysing.g analyzing.g assessing.g establishing.g evaluating.g examining.g questioning.g 
testing.g hypothesizing.g hypothesising.g documenting.g envisaging.g: ( & ) or ; predicting.v realizing.v discovering.v determining.v announcing.v saying.v mentioning.v admitting.v recalling.v revealing.v -stating.v observing.v indicating.v +divulging.v stating.v observing.v indicating.v analysing.v analyzing.v assessing.v establishing.v evaluating.v examining.v questioning.v testing.v hypothesizing.v hypothesising.v documenting.v envisaging.v: @@ -5498,20 +5685,22 @@ (( & ()) or ( & (() or ())) or (() & )); -guesses.v estimates.v understands.v notices.v explains.v demonstrates.v: +guesses.v estimates.v understands.v notices.v explains.v esplains.v +demonstrates.v: (( & ()) or ( & ([]0.2 or ())) or (() & ) or ); -guessed.v-d understood.v-d noticed.v-d explained.v-d demonstrated.v-d: +guessed.v-d understood.v-d noticed.v-d explained.v-d esplained.v-d +demonstrated.v-d: (( & ()) or ( & ([]0.2 or ())) or (() & ) or ) or ( & {THi+}) - or + or or ; estimated.v-d: @@ -5520,41 +5709,42 @@ (() & ) or ) or ( & {THi+}) or - or or - ; + ; guessing.g estimating.g understanding.g noticing.g explaining.g demonstrating.g: ( & ) or ; guessing.v estimating.v understanding.v noticing.v explaining.v demonstrating.v: & ; % (QI+ & {MV+}): "I did not know why until recently" +% MVa+: "He knows already" % MVb+: "He should know better" : {} - or ({@MV+} & (((OF+ or QI+)& {@MV+}) or or TH+ or RSe+ or Zs-)) - or ({Xc+} & MVb+); + or ({@MV+} & (((OF+ or QI+) & ) or or TH+ or RSe+ or Zs-)) + or ({Xc+} & (MVa+ or MVb+)); know.v: - (( & ()) or - ( & (() or ())) or - (() & )); + (( & ( & {dCPu-})) or + ( & (( & {dCPu-}) or ())) or + (( & {dCPu-}) & )) or ; knows.v: (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ); + ) or ; knew.v-d: (( & ()) or ( & ([]0.2 or ())) or (() & ) or ); -known.v: +% dCPu-: "... 
as is well known" +known.v well-known.v: (( & ()) or ( & ([] or ())) or (() & )) or - ( & {THi+ or or QIi+}) or + ( & {THi+ or or QIi+} & {dCPu-}) or or ; knowing.g: ( & ) or ; @@ -5579,14 +5769,13 @@ ) or ( & {THi+ or TSi+}) or or - or ; requesting.g: ( & ) or ; requesting.v: & ; % XXX why is there a cost on Pv ?? : or - ({@MV+} & (Pa+ or TH+ or or RSe+ or AF- or Vf+ or (LI+ or {@MV+}) or [Pv+])); + ({@MV+} & (Pa+ or TH+ or or RSe+ or AF- or Vf+ or (LI+ or ) or [Pv+])); feel.v: (( & ()) or ( & (() or ())) or @@ -5641,7 +5830,6 @@ ) or or or - or ; studying.g: ( & ) or ; studying.v: & ; @@ -5665,7 +5853,6 @@ ) or or - or or ; discussing.g: ( & ) or ; discussing.v: @@ -5695,7 +5882,6 @@ ) or or or - or ; opposing.g enjoying.g advocating.g contemplating.g @@ -5707,7 +5893,8 @@ favoring.v: & ; -: {} or ({@MV+} & Pg+); +% MVp+: "he finished at last" +: {} or ({@MV+} & Pg+) or (MVp+ & {VC+}); finish.v practice.v resist.v: (( & ()) or ( & (() or ())) or @@ -5721,26 +5908,32 @@ % : "I want it finished" finished.v-d practiced.v-d resisted.v-d quitted.v-d: - (( & ()) or - ( & ([]0.2 or ())) or - (() & ) or - ) or or or ; + (( & ( or ({Xc+} & Pa+))) or + ( & ([ or ({Xc+} & Pa+)]0.2 or ())) or + (( or ({Xc+} & Pa+)) & ) or + ) + or + or + or ; quit.v-d: (( & ()) or ( & (() or ())) or (() & )) or - (( & ()) or - ( & ([]0.2 or ())) or - (() & ) or - ) or - or - or - ; -finishing.g practicing.g resisting.g -quitting.g: ( & ) or ; -finishing.v practicing.v resisting.v quitting.v: & ; + (( & ( or ({Xc+} & Pa+))) or + ( & ([ or ({Xc+} & Pa+)]0.2 or ())) or + (( or ({Xc+} & Pa+)) & ) or + ) + or + or + or ; + +finishing.g practicing.g resisting.g quitting.g: + ( & ) or ; + +finishing.v practicing.v resisting.v quitting.v: + & ; % Pv-: "I want it over with" over_with: ; @@ -5752,7 +5945,7 @@ or (K+ & {[[@MV+]]} & O*n+) or ({O+ or } & {K+}) or ((O+ or ) & Pa**j+) - or [[@MV+ & O*n+]]) & {@MV+}) + or [[@MV+ & O*n+]]) & ) or ({@MV+} & (Pa+ or AF-)); turn.v: @@ -5770,14 +5963,14 @@ ( & 
([]0.2 or ())) or (() & ) or ) - or ( & {K+} & {@MV+}) + or ( & {K+} & ) or ({K+} & ); turning.v: & ; turning.g: ( & ) or ; % plus TI : - ((O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]] or Pv+) & {@MV+}) + ((O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]] or Pv+) & ) or ({@MV+} & (AF- or Pa+)); become.v: (( & ()) or @@ -5804,7 +5997,7 @@ or ({O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]] - or [Pv+]} & {@MV+}); + or [Pv+]} & ); remain.v: (( & ()) or ( & (() or ())) or @@ -5827,7 +6020,7 @@ ({@MV+} & (AF- or Pa+)) or ((({O+ or } & {K+}) or (K+ & {[[@MV+]]} & O*n+) - or [[@MV+ & O*n+]]) & {@MV+}); + or [[@MV+ & O*n+]]) & ); grow.v: (( & ()) or @@ -5848,14 +6041,14 @@ (( & ()) or ( & ([] or ())) or (() & )) or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or or ({K+} & ); growing.g: ( & ) or or ; growing.v: & ; % plus OF -: {O+ or or [[@MV+ & O*n+]] or ({@MV+} & OF+)} & {@MV+}; +: {O+ or or [[@MV+ & O*n+]] or ({@MV+} & OF+)} & ; approve.v: (( & ()) or ( & (() or ())) or @@ -5875,7 +6068,7 @@ approving.v: & ; % plus OF -: (O+ or or [[@MV+ & O*n+]] or ({@MV+} & OF+)) & {@MV+}; +: (O+ or or [[@MV+ & O*n+]] or ({@MV+} & OF+)) & ; dispose.v conceive.v: (( & ()) or ( & (() or ())) or @@ -5899,7 +6092,7 @@ ((K+ & {[[@MV+]]} & O*n+) or ({O+ or } & {K+}) or [[@MV+ & O*n+]] - or OF+) & {@MV+}; + or OF+) & ; speak.v: (( & ()) or ( & (() or ())) or @@ -5919,7 +6112,7 @@ (( & ()) or ( & ([] or ())) or (() & )) or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or ({K+} & ) or ; speaking.v: & ; @@ -5928,7 +6121,7 @@ % @MV+: "The coffee tastes (the same) as it did last year." 
(do not want O for "the same") : - or ({@MV+} & ((LI+ & {@MV+}) or AF- or Pa+ or OF+)) + or ({@MV+} & ((LI+ & ) or AF- or Pa+ or OF+)) or @MV+; taste.v: (( & ()) or @@ -5949,7 +6142,7 @@ : {} - or ({@MV+} & ((LI+ & {@MV+}) or AF- or Pa+ or OF+)) + or ({@MV+} & ((LI+ & ) or AF- or Pa+ or OF+)) or @MV+; reek.v smell.v: (( & ()) or @@ -5968,9 +6161,9 @@ reeking.g smelling.g: ( & ) or ; reeking.v smelling.v: & ; -% plus partcle and Vt +% plus particle and Vt : - (((K+ & {[[@MV+]]} & O*n+) or ((O+ or ) & {K+ or Vt+}) or [[@MV+ & O*n+]]) & {@MV+}) or + (((K+ & {[[@MV+]]} & O*n+) or ((O+ or ) & {K+ or Vt+}) or [[@MV+ & O*n+]]) & ) or ({O+} & (OT+ or BT-) & {@MV+} & { or }) or (OXii+ & Vtg+ & {@MV+} & TH+) or @MV+; @@ -5992,7 +6185,7 @@ ); taken.v: ( & ) - or ( & {K+} & {@MV+}) + or ( & {K+} & ) or or ({K+} & ) or (Pvf- & & Vtg+ & THi+); @@ -6007,10 +6200,11 @@ : ((K+ & {[[@MV+]]} & O*n+) or ((O+ or ) & (K+ or Pp+ or WR-)) or - (Vp+ & (Zs- or MVa+))) & {@MV+}; + (Vp+ & (Zs- or MVa+))) & ; % I- & WR- & & O+: "where did you put it?" +% bare : "The problem, put simply, is money." 
put.v-d: (( & ()) or @@ -6019,8 +6213,8 @@ ) or ( & ) or ({@E-} & I- & WR- & & O+) or - ( & (K+ or Pp+ or WR-) & {@MV+}) or - ((K+ or Pp+) & ); + ( & (K+ or Pp+ or WR-) & ) or + ({K+ or Pp+} & ); puts.v: (( & ()) or ( & ([]0.2 or ())) or @@ -6033,7 +6227,7 @@ : (( or (K+ & O*n+) or - ( & {O+})) & {@MV+} & {}) or + ( & {O+})) & & {}) or ([[@MV+ & O*n+]]); cost.v-d: @@ -6050,7 +6244,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & (({K+} & {@MV+}) or Pa+ or Pg+)) or + ( & (({K+} & ) or Pa+ or Pg+)) or ({K+ or Pa+ or Pg+} & ); costs.v: (( & ()) or @@ -6062,10 +6256,10 @@ % ditransitive : - ( & {@MV+}) or + ( & ) or (K+ & {[[@MV+]]} & O*n+) or ( & O+) or - ((O+ or ) & (({@MV+} & (Pa+ or AF- or Pg+)) or ({K+} & {@MV+}))) or + ((O+ or ) & (({@MV+} & (Pa+ or AF- or Pg+)) or ({K+} & ))) or ([[@MV+ & O*n+]]) or ({@MV+} & (TH+ or or RSe+)); @@ -6083,20 +6277,20 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & (({K+ or AF-} & {@MV+}) or Pa+ or Pg+)) or + ( & (({K+ or AF-} & ) or Pa+ or Pg+)) or ({K+ or Pa+ or Pg+} & ); finding.v: & ; finding.g: ( & ) or ; -% ditranstive +% ditransitive : - ((O+ or ) & (({K+} & {@MV+}) or ({@MV+} & (Pa+ or AF- or Pv+)))) + ((O+ or ) & (({K+} & ) or ({@MV+} & (Pa+ or AF- or Pv+)))) or (( or (K+ & {[[@MV+]]} & O*n+) or K+ or ( & O+) - ) & {@MV+}) - or ({@MV+} & (Pa+ or AF- or Pp+ or )); + ) & ) + or ({@MV+} & (Pa+ or AF- or Pp+ or or VC+)); get.v: (( & ()) or @@ -6125,16 +6319,18 @@ (( & ()) or ( & ([] or ())) or (() & )) or - ( & {K+ or Pp+} & {@MV+}) or + ( & {K+ or Pp+} & ) or ({K+ or Pp+} & ); getting.v gettin'.v gettin.v: & & ; getting.g: ( & ) or ; % Pa+: "He left unarmed" : - ((O+ or ) & (({K+} & {@MV+}) or ({@MV+} & {Pa+ or AF- or Pv+ or Pg+}))) - or ({@MV+} & (Pa+ or Pv+)) - or ({(K+ & {[[@MV+]]} & O*n+) or ([[@MV+ & O*n+]])} & {@MV+}); + ((O+ or ) & + (({K+} & ) + or ({@MV+} & {Pa+ or AF- or Pv+ or Pg+}))) + or ({@MV+} & {Xc+} & (Pa+ or Pv+)) + or ({(K+ & {[[@MV+]]} & O*n+) or ([[@MV+ & O*n+]])} & ); leave.v: (( & ()) or @@ -6151,14 
+6347,14 @@ ( & ([]0.2 or ())) or (() & ) or ) - or ( & (({K+ or AF-} & {@MV+}) or Pv+ or Pa+ or Pg+)) + or ( & (({K+ or AF-} & ) or Pv+ or Pa+ or Pg+)) or ({K+ or ({@MV+} & (Pv+ or Pa+ or Pg+))} & ); leaving.v leavin'.v: & ; leaving.g leavin'.g: ( & ) or ; : - ((O+ or (K+ & {[[@MV+]]} & O*n+) or [[@MV+ & O*n+]] or Vk+) & {@MV+}) - or ({O+ or } & (([K+] & {@MV+}) or ({@MV+} & (Pa+ or AF- or Pg+ or Pv+)))); + ((O+ or (K+ & {[[@MV+]]} & O*n+) or [[@MV+ & O*n+]] or Vk+) & ) + or ({O+ or } & (([K+] & ) or ({@MV+} & (Pa+ or AF- or Pg+ or Pv+)))); keep.v: (( & ()) or @@ -6174,7 +6370,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or -( & (({K+ or AF-} & {@MV+}) or Pa+ or Pg+ or Pv+)) or +( & (({K+ or AF-} & ) or Pa+ or Pg+ or Pv+)) or ({K+ or ({@MV+} & (Pa+ or Pg+ or Pv+))} & ); keeping.v: & ; keeping.g: ( & ) or ; @@ -6183,8 +6379,8 @@ track.i: Vk- & {OF+}; : - ((K+ & {[[@MV+]]} & O*n+) or ({O+ or } & {K+ or Vs+}) or - [[@MV+ & O*n+]]) & {@MV+}; + ((K+ & {[[@MV+]]} & O*n+) or ({O+ or } & {K+ or Vs+}) + or [[@MV+ & O*n+]]) & ; set.v-d: @@ -6193,7 +6389,7 @@ (() & ) or ) or ( & ) or - ( & {K+ or Vs+} & {@MV+}) or + ( & {K+ or Vs+} & ) or or ({K+ or Vs+} & ); sets.v: @@ -6206,8 +6402,12 @@ free.i straight.i loose.i: Vs- & {MV+}; +% TH+: "The court held that ..." 
: - ((K+ & {[[@MV+]]} & O*n+) or ({O+ or } & {K+ or Vh+}) or [[@MV+ & O*n+]]) & {@MV+}; + ((K+ & {[[@MV+]]} & O*n+) + or TH+ + or ({O+ or } & {K+ or Vh+}) + or [[@MV+ & O*n+]]) & ; hold.v: (( & ()) or @@ -6222,7 +6422,7 @@ (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ) or ( & {K+ or Vh+} & {@MV+}) or + ) or ( & {K+ or Vh+} & ) or or ({K+ or Vh+} & ); holding.v: & ; holding.g: ( & ) or ; @@ -6231,7 +6431,7 @@ : ({@MV+} & ( or TH+ or RSe+ or Z- or )) - or ((O+ or or OX+) & {@MV+} & {}) + or ((O+ or or OX+) & & {}) or ([[@MV+ & O*n+]]); expect.v claim.v: @@ -6251,7 +6451,6 @@ ) or ( & { or THi+ or Z-}) or - or or ({@MV+} & {} & ); expecting.g claiming.g: ( & ) or ; expecting.v claiming.v: & ; @@ -6276,19 +6475,18 @@ ) or ( & { or Z- or @MV+}) or - or - or ({@MV+} & {} & ); + or ( & {} & ); intending.g: ( & ) or ; intending.v: & ; % O+ & TO+: "I dare you to!" % TO+ & Xc+: "try it if you dare to!" -% I+: auxilliary: "no one dared say a word" +% I+: auxiliary: "no one dared say a word" % N+ & TO: "I dare not to say the truth" : - ({N+} & {@MV+} & { or (TO+ & Xc+)}) or + ({N+} & & { or (TO+ & Xc+)}) or ({N+} & I+) or - ((O+ or ) & {@MV+} & { or (TO+ & Xc+)}); + ((O+ or ) & & { or (TO+ & Xc+)}); % SI+ & & I+: "How dare you disobey orders" % & N+ & I+: "He dare not lie to me!" (singular subject) @@ -6318,7 +6516,7 @@ % [TO+]: allows null-infinitive: "Yes, I'd love to." : ({@MV+} & ( or [TO+] or Pg+)) - or ((O+ or or OX+) & {@MV+} & {}) + or ((O+ or or OX+) & & {}) or ([[@MV+ & O*n+]]); like.v: @@ -6338,10 +6536,10 @@ liking.g: ( & ) or ; liking.v: & ; -% ditranstive +% ditransitive : (( or - ( & {O+})) & {@MV+}) or + ( & {O+})) & ) or ({@MV+} & ) or ([[@MV+ & O*n+]]); @@ -6359,7 +6557,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or ( & {O+ or -or [[@MV+ & O*n+]]} & {@MV+}) or ({O+ or [[@MV+ & O*n+]]} or ); +or [[@MV+ & O*n+]]} & ) or ({O+ or [[@MV+ & O*n+]]} or ); offering.g: ( & ) or ; offering.v: & ; @@ -6367,8 +6565,8 @@ % unlike vc-offer, "to" is optional. 
: (( or - ( & {O+})) & {@MV+}) or - ({@MV+} & {}) or + ( & {O+})) & ) or + ( & {}) or ([[@MV+ & O*n+]]); refuse.v: @@ -6385,7 +6583,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or ( & {O+ or or -[[@MV+ & O*n+]]} & {@MV+}) or ({O+ or [[@MV+ & O*n+]]} & ); +[[@MV+ & O*n+]]} & ) or ({O+ or [[@MV+ & O*n+]]} & ); refusing.g: ( & ) or ; refusing.v: & ; @@ -6393,10 +6591,10 @@ % TO+ & Xc+: allows null-infinitive: "Because I want to." % intransitive: "Try it if you want" : - ({@MV+} & ({} or (TO+ & Xc+))) or - ((O+ or or OX+) & {@MV+} & { or Pv+ or Pa**j+}) or + ( & ({} or (TO+ & Xc+))) or + ((O+ or or OX+) & & { or Pv+ or Pa**j+}) or ([[@MV+ & O*n+]]) or - [[CX- & {@MV+}]]; + [[CX- & ]]; want.v need.v: (( & ()) or @@ -6407,7 +6605,7 @@ (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ); + ) or ; wanted.v-d needed.v-d: (( & ()) or @@ -6416,14 +6614,14 @@ ) or or - or + or or ; wanting.g needing.g: ( & ) or ; wanting.v needing.v: & ; : - ({@MV+} & {}) or - ((O+ or ) & {@MV+} & {}) or + ( & {}) or + ((O+ or ) & & {}) or ([[@MV+ & O*n+]]); choose.v: @@ -6447,14 +6645,14 @@ (() & )) or ( & {}) or or - ({@MV+} & {} & ); + ( & {} & ); choosing.g: ( & ) or ; choosing.v: & ; % is identical to : - ({@MV+} & {}) or - ((O+ or ) & {@MV+} & {}) or + ( & {}) or + ((O+ or ) & & {}) or ([[@MV+ & O*n+]]); prepare.v press.v: @@ -6474,13 +6672,12 @@ ) or or - or or ; preparing.g pressing.g: ( & ) or ; preparing.v pressing.v: & ; : - ((O+ or ) & {@MV+} & {}) or + ((O+ or ) & & {}) or ({@MV+} & (TH+ or or TS+ or (SI*j+ & I*j+))) or Zs- or ([[@MV+ & O*n+]]); @@ -6502,14 +6699,13 @@ ) or ( & { or TSi+}) or - or - or ({@MV+} & {} & ); + or ( & {} & ); requiring.g: ( & ) or ; requiring.v: & ; : ({@MV+} & (TH+ or Zs- or TS+ or )) or - ((O+ or ) & {@MV+} & {}) or + ((O+ or ) & & {}) or ([[@MV+ & {O*n+}]]); command.v order.v urge.v: @@ -6536,8 +6732,8 @@ % ditransitive : ({@MV+} & (TH+ or Pg+)) or - ((O+ or or OX+) & {@MV+} & { or Pa+}) or - (((O+ & (B- or ({[[@MV+]]} & O*n+))) or ([[@MV+ & O*n+]])) & 
{@MV+}); + ((O+ or or OX+) & & { or Pa+}) or + (((O+ & (B- or ({[[@MV+]]} & O*n+))) or ([[@MV+ & O*n+]])) & ); consider.v: (( & ()) or ( & (() or ())) or @@ -6552,7 +6748,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or ( & -(({@MV+} & ( or Pa+)) or ({O+ or or [[@MV+ & O*n+]]} & {@MV+}))) +(({@MV+} & ( or Pa+)) or ({O+ or or [[@MV+ & O*n+]]} & ))) or ((({@MV+} & ( or Pa+)) or ({O+ or [[@MV+ & O*n+]]})) & ); considering.g: ( & ) or ; @@ -6560,7 +6756,7 @@ : ({@MV+} & (TH+ or )) or - ((O+ or or OX+) & {@MV+} & {}) or + ((O+ or or OX+) & & {}) or ([[@MV+ & O*n+]]); perceive.v: @@ -6577,13 +6773,13 @@ ( & ([]0.2 or ())) or (() & ) or ) or ( & {}) or - or ({@MV+} & {} & ); + or ( & {} & ); perceiving.g: ( & ) or ; perceiving.v: & ; : - ({@MV+} & {TH+ or Z- or }) or - ((O+ or ) & {@MV+} & {}) or + ( & {TH+ or Z- or }) or + ((O+ or ) & & {}) or ([[@MV+ & O*n+]]); report.v: @@ -6603,14 +6799,13 @@ ) or ( & { or Z-}) or - or - or ({@MV+} & {} & ); + or ( & {} & ); reporting.g: ( & ) or ; reporting.v: & ; : - ((O+ or ) & {@MV+} & {TH+ or or }) or - ({@MV+} & {TH+ or Zs-}) or + ((O+ or ) & & {TH+ or or }) or + ( & {TH+ or Zs-}) or ([[@MV+ & O*n+]]); caution.v: @@ -6628,15 +6823,15 @@ ( & ([]0.2 or ())) or (() & ) or ) - or ( & ((O+ or ) & {@MV+} & {TH+ or or Zs- or })) - or ({@MV+} & {TH+ or or } & ); + or ( & ((O+ or ) & & {TH+ or or Zs- or })) + or ( & {TH+ or or } & ); cautioning.g: ( & ) or ; cautioning.v: & ; : - ((O+ or ) & {@MV+} & {TH+ or or or (OF+ & {@MV+})}) or - ({@MV+} & {TH+ or Zs- or (OF+ & {@MV+})}) or + ((O+ or ) & & {TH+ or or or (OF+ & )}) or + ( & {TH+ or Zs- or (OF+ & )}) or ([[@MV+ & O*n+]]); warn.v advise.v: @@ -6654,14 +6849,14 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & {TH+ or or Zs- or or (OF+ & {@MV+})}) or - ({@MV+} & {TH+ or or or OF+} & ); + ( & {TH+ or or Zs- or or (OF+ & )}) or + ( & {TH+ or or or OF+} & ); warning.g advising.g: ( & ) or ; warning.v advising.v: & ; : - ((O+ or ) & {@MV+} & {I*j+ or Pg+}) or - ({@MV+} & {TH+ or Zs- or or (OF+ & 
{@MV+})}) or + ((O+ or ) & & {I*j+ or Pg+}) or + ( & {TH+ or Zs- or or (OF+ & )}) or ([[@MV+ & O*n+]]); hear.v: @@ -6680,16 +6875,16 @@ (() & ) or ) or ( & {Pg+}) or - ({@MV+} & {Pg+} & ); + ( & {Pg+} & ); hearing.g: ( & ) or ; hearing.v: & ; % Xc+: poor comma choice: "The man that you saw laugh, jumped off a cliff". : - (( or O+) & {@MV+} & {I*j+ or Pg+ or AZ+ or Pv+}) + (( or O+) & & {I*j+ or Pg+ or AZ+ or Pv+}) or ( & {@MV+} & [I*j+ & Xc+]) - or ({@MV+} & {TH+ or Zs- or QI+ or }) + or ( & {TH+ or Zs- or QI+ or }) or ([[@MV+ & O*n+]]); see.v: @@ -6713,45 +6908,49 @@ ( & ([] or ())) or (() & )) or ( & {Pg+ or AZ+}) or - ({@MV+} & {Pg+ or AZ+} & ); + ( & {Pg+ or AZ+} & ); seeing.g: ( & ) or ; seeing.v: & ; -% ditranstive verbs -- taking direct and indirect objects +% ditransitive verbs -- taking direct and indirect objects : - ( or - (B- & {O+}) or - ([[@MV+ & O*n+]])) & {@MV+}; + ( + or (B- & {O+}) + or ([[@MV+ & O*n+]]) + ) & ; -owe.v deliver.v accord.v award.v term.v grant.v begrudge.v assign.v rename.v repay.v -dub.v entitle.v fine.v: +owe.v deliver.v accord.v award.v term.v grant.v begrudge.v +assign.v rename.v repay.v dub.v entitle.v fine.v: (( & ()) or ( & (() or ())) or (() & )); -owes.v delivers.v accords.v awards.v terms.v grants.v begrudges.v assigns.v renames.v -repays.v dubs.v entitles.v fines.v: +owes.v delivers.v accords.v awards.v terms.v grants.v begrudges.v +assigns.v renames.v repays.v dubs.v entitles.v fines.v: (( & ()) or ( & ([]0.2 or ())) or (() & ) or ); -owed.v delivered.v accorded.v awarded.v + +owed.v-d delivered.v-d accorded.v-d awarded.v-d granted.v-d begrudged.v-d assigned.v-d repaid.v-d fined.v-d: (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ) or - ( & {O+ or or [[@MV+ & O*n+]]} & {@MV+}) or - ({O+ or [[@MV+ & O*n+]]} & ); + ) + or ( & {O+ or or [[@MV+ & O*n+]]} & ) + or ({O+ or [[@MV+ & O*n+]]} & ); + owing.v delivering.v according.v awarding.v terming.v granting.v begrudging.v assigning.v renaming.v repaying.v dubbing.v 
entitling.v fining.v: - & ; + & ; + owing.g delivering.g according.g awarding.g terming.g granting.g begrudging.g assigning.g renaming.g repaying.g dubbing.g entitling.g fining.g: -( & ) or ; + ( & ) or ; % extended linking requirements based on the above termed.v-d dubbed.v-d entitled.v-d renamed.v-d: @@ -6760,10 +6959,30 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & {O+ or or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or [[@MV+ & O*n+]]} & ) or ({O+ or [[@MV+ & O*n+]]} & ) or ; +% intransitive: "we deliver!" +deliver.w: + + (( & ()) or + ( & (() or ())) or + (() & )); +delivers.w: + + (( & ()) or + ( & ([] or ())) or + (() & )); + +% & Pa+: "It was delived broken" +delivered.w-d: + + (( & ()) or + ( & ([] or ())) or + (() & )) + or ( & ({Xc+} & Pa+)); + % ditransitive % 'Give' requires both direct *and* indirect object: X gave Y a Z. % 'sent', 'poured': optional indirect object. @@ -6777,7 +6996,7 @@ or (O+ & K+) or (K+ & {[[@MV+]]} & O*n+) or ([[@MV+ & O*n+]]) - } & {@MV+}; + } & ; give.v send.v bring.v lend.v issue.v hand.v pour.v: @@ -6799,7 +7018,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ); issued.v-d poured.v-d: @@ -6809,7 +7028,7 @@ (() & ) or ) or or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ); gave.v-d: @@ -6823,7 +7042,7 @@ ( & ([] or ())) or (() & )) or or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ); giving.g sending.g bringing.g @@ -6842,7 +7061,7 @@ or (O+ & K+) or (K+ & {{[[@MV+]]} & O*n+}) or - ([[@MV+ & O*n+]])} & {@MV+}; + ([[@MV+ & O*n+]])} & ; pass.v buy.v pay.v sell.v deal.v telegraph.v wire.v: (( & ()) or @@ -6856,7 +7075,7 @@ ); % (S- & B-) : allows WV-less attach to "The dog which Chris bought is ugly" -% (({@E-} or {@EA-} or {@AN-}) & 
A+): "The telegraphed orders never arrived" +% : "The telegraphed orders never arrived" passed.v-d bought.v-d paid.v-d payed.v-d sold.v-d dealt.v-d telegraphed.v-d wired.v-d: @@ -6865,8 +7084,8 @@ (() & ) or ) or (S- & ) or - (({@E-} or {@EA-} or {@AN-}) & A+) or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ); passing.g buying.g paying.g selling.g dealing.g telegraphing.g wiring.g: @@ -6885,7 +7104,7 @@ or or (O+ & (Pa+ or K+)) or (K+ & {{[[@MV+]]} & O*n+}) - or ([[@MV+ & O*n+]])} & {@MV+}) + or ([[@MV+ & O*n+]])} & ) or ; % This is not quite right: @@ -6905,13 +7124,13 @@ ( & ([]0.2 or ())) or (() & ) or ) - or ( & {O+ or or K+ or Pa+ or [[@MV+ & O*n+]]} & {@MV+}) + or ( & {O+ or or K+ or Pa+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or Pa+ or [[@MV+ & O*n+]]} & ); calling.g shouting.g: ( & ) or ; calling.v shouting.v: & ; -% Minimal ditransitive extenstion of words.v.6 +% Minimal ditransitive extension of words.v.6 % ditransitive: "Please paint it lime green" % (O+ & Pa+): "Please paint it green" : @@ -6935,7 +7154,7 @@ ( & ([]0.2 or ())) or (() & ) or ) - or ( & {O+ or K+ or Pa+} & {@MV+}) + or ( & {O+ or K+ or Pa+} & ) or ({K+} & ) or ; @@ -6949,7 +7168,7 @@ % ditransitive % Writing -- direct and indirect object are optional: % 'he wrote' 'he wrote a letter' 'he wrote me a letter' 'he wrote me' -% 'he wrote me that blah happend' but '*he drew me that blah happened' +% 'he wrote me that blah happened' but '*he drew me that blah happened' % % & TH+: "he wrote her that he loved her" : @@ -6958,7 +7177,7 @@ or (O+ & K+) or (K+ & {{[[@MV+]]} & O*n+}) or ([[@MV+ & O*n+]]) - } & {@MV+}) + } & ) or ({@MV+} & (TH+ or )); write.v charge.v draw.v: @@ -6983,7 +7202,7 @@ (() & ) or ) or ( & ) or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ); charged.v-d: @@ -6991,16 +7210,15 @@ ( & ([]0.2 or ())) or (() 
& ) or ) or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ); written.v-d drawn.v-d w/o.v-d: (( & ()) or ( & ([] or ())) or (() & )) or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ) or - or ; writing.v reading.v charging.v drawing.v: @@ -7017,7 +7235,7 @@ or (O+ & K+) or (K+ & {{[[@MV+]]} & O*n+}) or - ([[@MV+ & O*n+]])} & {@MV+}); + ([[@MV+ & O*n+]])} & ); sing.v dance.v cry.v: (( & ()) or ( & (() or ())) or @@ -7050,8 +7268,8 @@ % ditransitive : (( or - ([[@MV+]] & O*n+)) & {@MV+}) or - ((O+ or ) & {@MV+} & {B- or }); + ([[@MV+]] & O*n+)) & ) or + ((O+ or ) & & {B- or }); allow.v: (( & ()) or @@ -7067,16 +7285,16 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & (({O+ or or [[@MV+ & O*n+]]} & {@MV+}) or ({@MV+} & ))) + ( & (({O+ or or [[@MV+ & O*n+]]} & ) or ({@MV+} & ))) or ({O+ or [[@MV+ & O*n+]] or ({@MV+} & )} & ); allowing.g: ( & ) or ; allowing.v: & ; % ditransitive : - ({O+ or } & {@MV+} & {B- or or or TH+ or RSe+ or Zs-}) or + ({O+ or } & & {B- or or or TH+ or RSe+ or Zs-}) or (( or - ([[@MV+ & O*n+]])) & {@MV+}); + ([[@MV+ & O*n+]])) & ); promise.v: (( & ()) or @@ -7093,9 +7311,8 @@ ( & ([]0.2 or ())) or (() & ) or ) - or ( & (({O+ or or [[@MV+ & O*n+]]} & {@MV+}) or ({@MV+} & ( or or TH+ or RSe+ or Zs-)))) + or ( & (({O+ or or [[@MV+ & O*n+]]} & ) or ({@MV+} & ( or or TH+ or RSe+ or Zs-)))) or - or or ({O+ or [[@MV+ & O*n+]] or ({{@MV+} & ( or or TH+)})} & ); promising.g: ( & ) or ; promising.v: & ; @@ -7108,17 +7325,17 @@ (O+ & K+) or (K+ & (B- or ({[[@MV+]]} & O*n+))) or ([[@MV+ & O*n+]]) or - [[()]]) & {@MV+}); + [[()]]) & ); show.v: (( & ()) or ( & (() or ())) or - (() & )); + (() & )) or ; shows.v: (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ); + ) or ; showed.v-d: (( & ()) or ( & ([]0.2 or ())) or @@ -7131,7 +7348,7 @@ (() & )) or or ( & - (({O+ or K+ or B- or [[@MV+ & O*n+]]} & 
{@MV+}) or + (({O+ or K+ or B- or [[@MV+ & O*n+]]} & ) or ({@MV+} & (QI+ or or TH+ or RSe+ or Zs-)))) or ({O+ or K+ or [[@MV+ & O*n+]] or ({@MV+} & (QI+ or or TH+))} & ); showing.g: ( & ) or ; @@ -7143,7 +7360,7 @@ or ({ or (B- & {[[@MV+]]} & O*n+) - or ([[@MV+ & O*n+]])} & {@MV+}); + or ([[@MV+ & O*n+]])} & ); teach.v: (( & ()) or @@ -7161,7 +7378,7 @@ (() & ) or ) or ( & - (({O+ or or [[@MV+ & O*n+]]} & {@MV+}) or + (({O+ or or [[@MV+ & O*n+]]} & ) or ({@MV+} & (QI+ or or TH+ or RSe+ or Zs- or )))) or ({O+ or [[@MV+ & O*n+]] or ({@MV+} & (QI+ or or TH+))} & ); teaching.g: ( & ) or ; @@ -7191,7 +7408,7 @@ : (((O+ or ) & (({@MV+} & ) or K+ or [()])) or (K+ & O*n+) or - ([[{K+} & @MV+ & O*n+]])) & {@MV+}; + ([[{K+} & @MV+ & O*n+]])) & ; force.v: (( & ()) or @@ -7208,7 +7425,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & ((K+ & {@MV+}) or + ( & ((K+ & ) or ({@MV+} & ))) or ((K+ or ({@MV+} & )) & ) or ; @@ -7218,31 +7435,37 @@ % ----------------------------------------- : - (B- & {@MV+} & {}) or - (O+ & {@MV+} & {}) or - ([[@MV+ & O*n+ & {@MV+}]]); + (B- & & {}) or + (O+ & & {}) or + ([[@MV+ & O*n+ & ]]); design.v permit.v authorize.v use.v cause.v enable.v pressure.v train.v sentence.v prompt.v spur.v disincline.v -invite.v reelect.v encourage.v draft.v hire.v entice.v inspire.v aid.v -forbid.v employ.v educate.v tempt.v condemn.v commission.v counsel.v induce.v instruct.v +invite.v reelect.v encourage.v draft.v hire.v entice.v inspire.v +aid.v forbid.v employ.v educate.v tempt.v condemn.v commission.v +counsel.v induce.v instruct.v license.v incite.v nominate.v destine.v provoke.v challenge.v exhort.v implore.v motivate.v impel.v: - + (( & ()) or ( & (() or ())) or - (() & )); -designs.v permits.v pressures.v trains.v sentences.v causes.v enables.v -authorizes.v uses.v prompts.v spurs.v disinclines.v -invites.v reelects.v encourages.v drafts.v hires.v entices.v inspires.v aids.v -forbids.v employs.v educates.v tempts.v condemns.v commissions.v counsels.v induces.v 
+ (() & )) + or ( & O+ & Xc+); + +designs.v permits.v pressures.v trains.v sentences.v causes.v +enables.v authorizes.v uses.v prompts.v spurs.v disinclines.v +invites.v reelects.v encourages.v drafts.v hires.v entices.v +inspires.v aids.v forbids.v employs.v educates.v tempts.v +condemns.v commissions.v counsels.v induces.v instructs.v licenses.v incites.v nominates.v destines.v provokes.v challenges.v exhorts.v implores.v motivates.v impels.v: - + (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ); + ) + or ( & O+ & Xc+); + designed.v-d permitted.v-d pressured.v-d trained.v-d sentenced.v-d caused.v-d enabled.v-d authorized.v-d prompted.v-d spurred.v-d invited.v-d disinclined.v-d @@ -7257,11 +7480,12 @@ (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ) or - ( & {}) or - or - or - ({{@MV+} & } & ); + ) + or ( & {}) + or + or ( & O+ & Xc+) + or ({{@MV+} & } & ); + forbade.v-d: (( & ()) or ( & ([]0.2 or ())) or @@ -7300,20 +7524,21 @@ (( & ()) or ( & ([]0.2 or ())) or (() & ) or - ) or - ( & {}) or - ( & ) or - ({@MV+} & {} & ) or - or - ; + ) + or ( & {}) + or ( & ) + or + or ( & O+ & Xc+) + or ( & {} & ); + using.g: ( & ( or MVs-)) or ; % -------------------------------------------------- : - ((O+ or ) & (({@MV+} & {}) or + ((O+ or ) & (( & {}) or ({[[@MV+]]} & (O*n+ or TI+)))) or - ([[@MV+ & O*n+ & {@MV+}]]); + ([[@MV+ & O*n+ & ]]); elect.v appoint.v: (( & ()) or @@ -7330,7 +7555,7 @@ ( & ([]0.2 or ())) or (() & ) or ) - or ( & (({O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]]} & {@MV+}) or ({@MV+} & ))) + or ( & (({O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]]} & ) or ({@MV+} & ))) or (({O+ or TI+ or [[@MV+ & (O*n+ or TI+)]]} or ({@MV+} & )) & ) or ; @@ -7343,7 +7568,7 @@ (( or (O+ & {[[@MV+]]} & TI+) or (B- & {O+ or TI+}) - or ([[@MV+ & O*n+]])) & {@MV+}) + or ([[@MV+ & O*n+]])) & ) or ; name.v designate.v label.v: @@ -7361,7 +7586,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & ({O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]]}) & {@MV+}) or + ( & ({O+ or or TI+ or [[@MV+ & (O*n+ 
or TI+)]]}) & ) or ({O+ or TI+ or [[@MV+ & (O*n+ or TI+)]]} & ) or ; naming.g designating.g labelling.g labeling.g: ( & ) or ; @@ -7384,7 +7609,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or - (( & ({O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]]}) & {@MV+}) or ) or + (( & ({O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]]}) & ) or ) or ({O+ or TI+ or [[@MV+ & (O*n+ or TI+)]]} & ) or ; tagging.g: @@ -7394,9 +7619,9 @@ tagging.v: & ; : - {((O+ or ) & {@MV+} & {}) or + {((O+ or ) & & {}) or @MV+ or - ([[@MV+ & O*n+ & {@MV+}]])}; + ([[@MV+ & O*n+ & ]])}; program.v oblige.v: (( & ()) or @@ -7420,8 +7645,8 @@ programing.v programming.v obliging.v: & ; : - ((O+ or ) & {@MV+} & { or TH+ or }) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & & { or TH+ or }) or + ([[@MV+ & O*n+ & ]]); convince.v persuade.v: (( & ()) or ( & (() or ())) or @@ -7438,7 +7663,7 @@ (() & ) or ) or ( & { or TH+ or }) or - ({{@MV+} & ( or TH+ or )} & ); + ({ & ( or TH+ or )} & ); convincing.g persuading.g: ( & ) or ; convincing.v persuading.v: & ; @@ -7451,11 +7676,11 @@ % : (((O+ & {O*n+ or K+}) or ) - & {@MV+} & {TH+ or RSe+ or Zs- or or QI+ or BW-}) - or ({O+ & {@MV+}} & ) + & & {TH+ or RSe+ or Zs- or or QI+ or BW-}) + or ({O+ & } & ) or OF+ or (QI+ & {MV+}) - or ([[@MV+ & {O*n+} & {@MV+}]]); + or ([[@MV+ & {O*n+} & ]]); tell.v: (( & ()) or @@ -7474,7 +7699,7 @@ (() & ) or ) or ( & {TH+ or or RSe+ or Zs- or or QI+ or BW-}) - or ( & O+ & {@MV+}) + or ( & O+ & ) or ({{@MV+} & ( or or QI+ or TH+)} & ); telling.g: ( & ) or ; @@ -7485,9 +7710,9 @@ % (QI+ & {MV+}): "I did not ask why until recently" : - ({(O+ & {O*n+}) or } & {@MV+} + ({(O+ & {O*n+}) or } & & {TS+ or or (QI+ & {MV+}) or BW-}) - or ([[@MV+ & O*n+ & {@MV+}]]); + or ([[@MV+ & O*n+ & ]]); ask.v: (( & ()) or @@ -7505,37 +7730,40 @@ (() & ) or ) or ( & { or QI+ or BW- or TH+ or TS+}) - or ( & O+ & {@MV+}) + or ( & O+ & ) or ({{@MV+} & ( or QI+ or TH+ or TS+)} & ); asking.g: ( & ) or ; asking.v: & ; +% TH+: "it helps that you know already" : - ({O+ or } & 
{@MV+} & { or I+}) or - ([[@MV+ & O*n+ & {@MV+}]]); + ({O+ or } & {@MV+} & { or I+}) + or [[@MV+ & O*n+ & ]]; help.v: (( & ()) or ( & (() or ())) or (() & )); helps.v: - (( & ()) or - ( & ([]0.2 or ())) or - (() & ) or - ); -helped.v-d: - (( & ()) or - ( & ([]0.2 or ())) or - (() & ) or - ) or ( & {}) or -({{@MV+} & } & ); + (( & ( or TH+)) or + ( & ([ or TH+]0.2 or ())) or + (( or TH+) & ) or + ); +helped.v-d: + + (( & ( or TH+)) or + ( & ([ or TH+]0.2 or ())) or + (( or TH+) & ) or + ) + or ( & {}) + or ({{@MV+} & } & ); helping.g: ( & ) or ; helping.v: & ; : - ((O+ or ) & {@MV+} & ( or TH+ or or (OF+ & {@MV+}))) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & {@MV+} & ( or TH+ or or (OF+ & ))) or + ([[@MV+ & O*n+ & ]]); remind.v: (( & ()) or @@ -7551,14 +7779,14 @@ ( & ([]0.2 or ())) or (() & ) or ) or ( & { or TH+ or or -(OF+ & {@MV+})}) or ({{@MV+} & ( or or TH+ or -(OF+ & {@MV+}))} & ); +(OF+ & )}) or ({{@MV+} & ( or or TH+ or +(OF+ & ))} & ); reminding.g: ( & ) or ; reminding.v: & ; : - ((O+ or ) & {@MV+} & {(OF+ & {@MV+}) or TH+ or Zs- or }) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & & {(OF+ & ) or TH+ or Zs- or }) or + ([[@MV+ & O*n+ & ]]); inform.v reassure.v alert.v guarantee.v notify.v forewarn.v: @@ -7580,7 +7808,7 @@ ( & ([]0.2 or ())) or (() & ) or ) - or ( & { or TH+ or Zs- or (OF+ & {@MV+})}) + or ( & { or TH+ or Zs- or (OF+ & )}) or ({{@MV+} & ( or TH+ or OF+)} & ) or ; @@ -7591,8 +7819,8 @@ & ; : - ((O+ or ) & {@MV+} & {(OF+ & {@MV+}) or TH+ or Zs- or }) or - ([[@MV+ & O*n+ & {@MV+}]]) or + ((O+ or ) & & {(OF+ & ) or TH+ or Zs- or }) or + ([[@MV+ & O*n+ & ]]) or ({@MV+} & (TH+ or )); assure.v: @@ -7610,15 +7838,15 @@ ( & ([]0.2 or ())) or (() & ) or ) - or ( & {(OF+ & {@MV+}) or or TH+ or Zs-}) + or ( & {(OF+ & ) or or TH+ or Zs-}) or ({{@MV+} & ( or TH+ or OF+)} & ); assuring.g: ( & ) or ; assuring.v: & ; : - ((O+ or ) & {@MV+} & {I+ or ((K+ or Pp+) & {@MV+})}) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & & {I+ or ((K+ or Pp+) & )}) or + 
([[@MV+ & O*n+ & ]]); let.v-d: @@ -7627,7 +7855,7 @@ (() & ) or ) or ( & ) or - ( & ((K+ or Pp+) & {@MV+})) or + ( & ((K+ or Pp+) & )) or ((K+ or Pp+) & ); lets.v: (( & ()) or @@ -7638,13 +7866,13 @@ letting.v: & ; % Abbreviation for "let us" -% Is there any reason to create a defintion such as 's.n: Ox-? +% Is there any reason to create a definition such as 's.n: Ox-? let's let’s: ({Ic-} & Wi- & {N+} & I+) or ({Ic-} & Wi- & N+); : - ((O+ or ) & {@MV+} & {I*j+ or Pg+}) or - ([[@MV+ & O*n+ & {@MV+}]]) or - {@MV+}; + ((O+ or ) & & {I*j+ or Pg+}) or + ([[@MV+ & O*n+ & ]]) or + ; watch.v: (( & ()) or @@ -7668,8 +7896,8 @@ watching.v: & ; : - ((O+ or ) & {@MV+} & {Pg+}) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & & {Pg+}) or + ([[@MV+ & O*n+ & ]]); appreciate.v spend.v: (( & ()) or @@ -7695,15 +7923,15 @@ % ditransitive : ((O+ or or OX+) & { - ({@MV+} & {I*j+ or Pa**j+ or B-}) - or ((K+ or AF-) & {@MV+})}) + ( & {I*j+ or Pa**j+ or B-}) + or ((K+ or AF-) & )}) or (( or (K+ & {[[@MV+]]} & O*n+) or K+ or Vm+ or ([[{K+} & @MV+ & O*n+]]) - ) & {@MV+}) + ) & ) or [[()]]; make.v: @@ -7721,19 +7949,19 @@ ( & ([]0.2 or ())) or (() & ) or ) - or ( & (({@MV+} & Pa+) or ({O+ or K+} & {@MV+}))) + or ( & (( & Pa+) or ({O+ or K+} & ))) or ({({@MV+} & Pa+) or K+} & ); built_of built_up_of composed_of constructed_of formed_of made_of made_up_of: - ( & (O+ or ) & {@MV+}) or (O+ & ); + ( & (O+ or ) & ) or (O+ & ); making.g: ( & ) or ; making.v: & ; : (((O+ or ) & {({@MV+} & Pa+) or AF-}) or - ([[@MV+ & O*n+]])) & {@MV+}; + ([[@MV+ & O*n+]])) & ; render.v deem.v: (( & ()) or @@ -7755,7 +7983,7 @@ : (((O+ or ) & {{@MV+} & OF+}) or - ([[@MV+ & O*n+]])) & {@MV+}; + ([[@MV+ & O*n+]])) & ; deprive.v accuse.v acquit.v purge.v disabuse.v exonerate.v absolve.v rob.v convict.v: @@ -7775,7 +8003,7 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & {OF+} & {@MV+}) or + ( & {OF+} & ) or or ({{@MV+} & OF+} & ); depriving.g accusing.g acquitting.g purging.g disabusing.g exonerating.g @@ -7788,7 +8016,7 
@@ (((O+ or ) & {({@MV+} & OF+) or K+}) or ({K+} & O*n+) or K+ or - ([[{K+} & @MV+ & O*n+]])) & {@MV+}; + ([[{K+} & @MV+ & O*n+]])) & ; clear.v: (( & ()) or @@ -7805,14 +8033,14 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & {({@MV+} & OF+) or K+} & {@MV+}) or + ( & {({@MV+} & OF+) or K+} & ) or ({K+ or ({@MV+} & OF+)} & ); clearing.g: ( & ) or ; clearing.v: & ; : - ({(O+ & {O*n+}) or ( & {O+})} & {@MV+} & {TH+ or or RSe+}) or - ([[@MV+ & O*n+ & {@MV+}]]); + ({(O+ & {O*n+}) or ( & {O+})} & & {TH+ or or RSe+}) or + ([[@MV+ & O*n+ & ]]); bet.v-d: @@ -7822,7 +8050,7 @@ ) or ( & ) or or - ( & {O+ or } & {@MV+} & {TH+ or or RSe+ or @MV+}); + ( & {O+ or } & & {TH+ or or RSe+ or @MV+}); bets.v: (( & ()) or ( & ([]0.2 or ())) or @@ -7834,15 +8062,15 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & {O+ or } & {@MV+} & {TH+ or or RSe+ or @MV+}) or + ( & {O+ or } & & {TH+ or or RSe+ or @MV+}) or ({O- or [[@MV+ & O*n+]] or TH+ or } & ); betting.g: ( & ) or ; betting.v: & ; : ({@MV+} & ) or - ((O+ or ) & {@MV+} & {THi+}) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & & {THi+}) or + ([[@MV+ & O*n+ & ]]); bother.v: (( & ()) or @@ -7863,7 +8091,7 @@ bothering.g: ( & ) or ; : - ((O+ or ) & {@MV+} & {THi+}) or + ((O+ or ) & & {THi+}) or ([[@MV+ & O*n+]]); surprise.v alarm.v amaze.v amuse.v annoy.v @@ -7904,8 +8132,8 @@ ( & ) or ; : - ((O+ or or [[@MV+ & O*n+]]) & {@MV+}) or - ((O+ or or OX+) & {@MV+} & ( or [[Pa+]])) or + ((O+ or or [[@MV+ & O*n+]]) & ) or + ((O+ or or OX+) & {@MV+} & ( or [[{Xc+} & Pa+]])) or ({@MV+} & ( or TH+ or or RSe+ or Zs- or (Pa+ & ))); prove.v: @@ -7922,18 +8150,18 @@ ( & ([] or ())) or (() & )) or ( & {THi+ or }) - or + or ( & {dCPu-} & {MV+}) or ({{@MV+} & Pa+} & ); proven.v: ( & ) or - ( & {THi+ or or Pa+}) or + ( & {THi+ or or Pa+ or dCPu-}) or or ({{@MV+} & Pa+} & ); proving.g: ( & ) or ; proving.v: & ; : - ((O+ or or [[@MV+ & O*n+]]) & {@MV+}) or + ((O+ or or [[@MV+ & O*n+]]) & ) or ({@MV+} & (Pg+ or TH+ or or RSe+ or Zs- or TS+ or ((SI*j+ or SFI**j+) & 
I*j+))); suggest.v anticipate.v recommend.v: @@ -7953,7 +8181,6 @@ ) or ( & {THi+ or TSi+ or Z-}) or - or or ; suggesting.g anticipating.g recommending.g: ( & ) or ; suggesting.v anticipating.v recommending.v: & ; @@ -7962,7 +8189,7 @@ : (( or (B- & {O+}) or - [[@MV+ & O*n+]]) & {@MV+}) or + [[@MV+ & O*n+]]) & ) or ({@MV+} & (Pg+ or TH+ or or RSe+)); deny.v: @@ -7980,15 +8207,15 @@ ( & ([]0.2 or ())) or (() & ) or ) or - ( & {O+ or or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or [[@MV+ & O*n+]]} & ) or ({O+ or ([[@MV+ & O*n+]])} & ); denying.g: ( & ) or ; denying.v: & ; : - ((O+ or ) & {@MV+} & {AZ+}) or + ((O+ or ) & & {AZ+}) or ({@MV+} & (QI+ or Z-)) or - ([[@MV+ & O*n+ & {@MV+}]]); + ([[@MV+ & O*n+ & ]]); describe.v: (( & ()) or @@ -8007,14 +8234,13 @@ ) or ( & {AZ+ or Z-}) or or - or - ({@MV+} & {AZ+} & ); + ( & {AZ+} & ); describing.g: ( & ) or ; describing.v: & ; : - ((O+ or ) & {@MV+} & {AZ+}) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & & {AZ+}) or + ([[@MV+ & O*n+ & ]]); portray.v depict.v regard.v view.v characterize.v: (( & ()) or @@ -8033,8 +8259,7 @@ ) or ( & {AZ+}) or or - or - ({@MV+} & {AZ+} & ); + ( & {AZ+} & ); portraying.g depicting.g regarding.g viewing.g characterizing.g: ( & ) or ; portraying.v depicting.v regarding.v viewing.v characterizing.v: @@ -8114,7 +8339,7 @@ seeing_fit letting_go taking_note coming_true coming_clean coming_of_age: ( & ) or ; -: (O+ or or [[@MV+ & O*n+]]) & {@MV+}; +: (O+ or or [[@MV+ & O*n+]]) & ; allow_for bring_about get_rid_of let_go_of take_note_of: (( & ()) or @@ -8234,7 +8459,7 @@ given_up gone_around: & ; keeping_on giving_up going_around: ( & ) or ; -: Pg+ or Pa+ or ({AF-} & {@MV+}); +: Pg+ or Pa+ or ({AF-} & ); end_up: (( & ()) or ( & ([]0.2 or ())) or @@ -8257,6 +8482,7 @@ /en/words/words.v.1.p: or + or or ; % ----------------------------------------------------------------- @@ -8266,11 +8492,11 @@ % The naked Qd+, without a WV+, links to subj-verb-inverts: "are you % insane?", "Are you the one?" 
% XXX everywhere where Ws+ is used, should probably be !? -: Wa+ or Wi+ or Wn+ or Ww+ or Qd+; -: (Wd+ or Wp+ or Wr+ or Wq+ or Ws+ or Wj+ or Wc+ or We+ or Wt+ or Wo+) & ; +: hWa+ or hWi+ or hWn+ or hWw+ or hQd+; +: (hWd+ or hWp+ or hWr+ or hWq+ or hWs+ or hWj+ or hWc+ or hWe+ or hWt+ or hWo+) & ; % Paraphrasing, quotational complements: -: [()]; +: [()]0.1; % Quote with or without quotation marks. % "This is a test," she said. @@ -8283,7 +8509,7 @@ % QU+ & & QU+: He said, "This is it." % Xc+ or Xe+ or [[()]]: punctuation is commonly missing. : - ({@MV+} & (Xc+ or Xp+) & CP-) + ({@MV+} & (Xc+ or Xp+ or ) & CP-) or ({@MV+} & ((Xd- or Xq-) & (Xc+ or Xp+ or ) & (COq+ or CP- or Eq+ or ))) or [{@MV+} & (Xc+ or Xe+ or [[()]]) & ] @@ -8291,9 +8517,11 @@ & QUd+ & ( or ) & {X+} & QUc+); % Xd- & Xc+: "If I'm right, he thought, this will work." +% CPa- & Xc+: "So thinks everyone" : {@MV+} & (((Xd- or Xq-) & (Xc+ or Xp+ or ) & (COq+ or CPx- or Eq+ or )) + or (CPa- & Xc+) or [(Xc+ or Xe+) & ]); % filler-it: "The President is busy, it seems." @@ -8308,14 +8536,16 @@ [[{@E-} & (((Sp- or I-) & & ) or (SIpj+ & ))]]; /en/words/words.v.10.2: - [[{@E-} & ((Ss- & & ) or (SIsj+ & ))]]; + [[{@E-} & Ss- & & ]] + or [[{@E-} & SIsj+ & ]]; % XXX Why is there a cost on Pvf- ??? 
/en/words/words.v.10.3: {@E-} & ( ((S- or PP-) & & ) or (SI*j+ & ) - or [Pvf- & ]); + or [Pvf- & ] + or (Pv- & CV- & dCPu-)); read.q-d: {@E-} & (((S- or I- or PP-) & & ) or (SI*j+ & )); @@ -8323,12 +8553,15 @@ wrote.q-d: {@E-} & ((S- & & ) or (SI*j+ & )); -written.q: {@E-} & PP- & ; +written.q: + {@E-} & ((PP- & ) or (Pv- & CV- & dCPu-)); /en/words/words.v.10.4: [[{@E-} & Pg- & ]]; seem.q appear.q: [[{@E-} & (SFp- or If-) & ]]; -seems.q appears.q: [[{@E-} & SFs- & ]]; +seems.q appears.q: + [[{@E-} & SFs- & ]] + or ({@E-} & Pv- & CV- & dCPu-); seemed.q-d appeared.q-d: {@E-} & (SF- or PPf-) & ; seeming.q appearing.q: [[{@E-} & Pgf- & ]]; @@ -8339,7 +8572,10 @@ {@E-} & ((Ss- & & ) or (SIsj+ & )); said.q-d: - {@E-} & (((S- or PP-) & & ) or (SI*j+ & )); + {@E-} & ( + ((S- or PP-) & & ) + or (SI*j+ & ) + or (Pv- & CV- & dCPu-)); saying.q: {@E-} & Pg- & ; @@ -8367,7 +8603,7 @@ % idiomatic "voted yes/no" expressions using the V link. % "he answered yes", "say yes!", "Just say no!" -: {Xc+} & Vv+ & {@MV+}; +: {Xc+} & Vv+ & ; answer.w reply.w say.w vote.w: (( & ()) or ( & (() or ())) or @@ -8446,9 +8682,13 @@ % (using MVp-) is generally preferred. The cost is small, though, % to allow modifiers on conjoined nouns to work well. % e.g. "...went to hell yesterday and heaven on Tuesday" +% +% [Mp- & MVp-]-0.61 prefers a connection to both the noun and the +% verb, helping disambiguate. The weight 0.61 plus 0.4 is greater +% than one, to overcome cost on @MV+. : - or [Mp-]0.4 or Pp- or MVp- + or [Mp-]0.4 or Pp- or MVp- or [Mp- & MVp-]-0.61 or [({Xc+ & {Xd-}} & CO+)] or (Xd- & Xc+ & (MX*x- or MVx-)); @@ -8492,7 +8732,7 @@ or [MVp- & B-]; % -% XXX fixme: MVp- & J+ is wrong: "*I saw John except Fred" +% XXX FIXME: MVp- & J+ is wrong: "*I saw John except Fred" % XXX (The MVp- coming from prep-main-a) % So give J+ a cost, to minimize this, for now ... % Could this be fixable in postprocessing? 
@@ -8541,6 +8781,11 @@ into: ({JQ+} & (J+ or Mgp+ or QI+) & ) or [MVp- & B-]; +% re: "with regard to" +% Xe+: optional colon +re: + {Xe+} & J+ & ; + % cost on MVa- to give preference to MVl- about: ({JQ+} & (J+ or Mgp+ or QI+) & ) @@ -8552,6 +8797,7 @@ or [MVa-] or (MVl- & (MVp+ or MVa+ or MVs+)); + % Grep also for "just_about", used as syonym for "nearly", "almost" % XXX is this really needed ?? Seems to duplicate other stuff .. just_about nearly_about almost_about right_about: @@ -8657,21 +8903,30 @@ or [({Xc+ & {Xd-}} & CO+)] or (Xd- & Xc+ & (E+ or MVx-))); +% o': silent, un-pronouned f. % (OFd- & Jd+): "I have a lot of cookies", forces "lot" to be determiner; % The Jd+ forces a link to the object as well. % Wj- & JQ+ & J+ & Qd+: "Of which person were you speaking?" % QI+ & CV+: "She kept an organized record of which employees took their vacations" % The QI makes it interrogative, the CV links head word. % (Js+ or Jp+ or Ju+): we explicitly exclude Jw+ as that leads to bad parses. -of: +% +% Mf-: allows "from the Abbey of Stratford Langthorne" so that "of" +% links to "Abbey" instead of something more distant. +% XXX The Mp- below should be removed, and all occurrences of +% Mp+ elsewhere should be replaced by (Mp+ or Mf+) +% Mf- & MVp+: "She was a girl of about John's age" +of o': ({JQ+} & (Js+ or Jp+ or Ju+ or Mgp+ or (QI+ & {CV+})) & (Mp- + or Mf- or OFj- or OFw- or (Xd- & Xc+ & MX*x-) or ( & (Mj- or (Xd- & Xc+ & MX*j-))) or [[({Xc+ & {Xd-}} & CO+)]])) + or (Mf- & MVp+) or (Wj- & JQ+ & J+ & Qd+) or (OFd- & Jd+) or ((OFj- or Mp-) & B-) @@ -8683,7 +8938,7 @@ % MX-PHRASE: The blah, to be blahed, will be blah. % TO- & Xc+: "I'd like to, I want to." (null infinitive) -% give [J+] a cost, so that numeric intervals are peferred +% give [J+] a cost, so that numeric intervals are preferred % I*t+ & TO-: passes on the TO constraint down the line % I+ & MVi-: allows "What is there to do?" 
% but also incorrectly allows: "He is going to do" @@ -8716,8 +8971,7 @@ besides: {J+ or Mgp+} & ([({Xc+ & {Xd-}} & CO+)] or MVp- or ); throughout: {J+} & ([({Xc+ & {Xd-}} & CO+)] or MVp- or ); -versus: (J+ & Mp-) or (G- & G+); -vs: {Xi+} & G- & G+; +versus v. vs. vs: (J+ & Mp-) or (G- & G+); worth.p: (Mp- & (J+ or OF+)) or (Paf- & Mgp+) or (Pa- & (J+ or B-)); opposite.p: J+ & ; @@ -8754,15 +9008,17 @@ regardless_of as_to irrespective_of: (J+ or QI+) & (MVp- or [({Xc+ & {Xd-}} & CO+)] or (Xd- & Xc+ & (E+ or MVx-))); -as_usual to_date on_average in_turn so_far -in_particular in_response in_general thus_far in_reply: +as_yet to_date so_far thus_far as_usual on_average +in_general in_particular in_response in_reply in_turn: ; +% Mp- & QI+: "decisions such as when to go are taken by the instructor." such_as: - J+ & ( + (J+ & ( MVa- or Mp- - or (Xc+ & Xd- & (MVx- or MX*x-))); + or (Xc+ & Xd- & (MVx- or MX*x-)))) + or (Mp- & QI+); lest: ( or Mgp+ or Mv+) & ( @@ -8779,10 +9035,16 @@ % -------------------------------------------------------- % Preps that specify time-like relations +% +% ({[Mp-]-0.09} & MVp-): If we can identify both the head verb, +% and the head noun, then do so. It is tempting to think that these +% should modify the verb, only, but conjunctions prove otherwise: +% "... went to hell recently, and heaven before that." shows that +% "recently" has to modify "hell", and not "went". recently: {EE- or EF+} & ( - ({Xd- & Xc+} & MVp-) + ({Xd- & Xc+} & {[Mp-]-0.09} & MVp-) or Pp- or E+ or ({Xc+ & {Xd-}} & CO+) @@ -8793,33 +9055,41 @@ or [[Mp-]]); % Wc- & Qd+: "Now, am I right?" +% MJr-: "when, if not now, do you want to do it?" now.r: - ({Xd- & Xc+} & MVp-) + ({Xd- & Xc+} & {[Mp-]-0.09} & MVp-) or Pp- or E+ or ({Xc+ & {Xd-}} & CO+) or (Wc- & (Xc+ or [()]) & Qd+) or EB- + or MJr- or [[Mp-]]; % Wc- & Qd+: "Then, am I right?" 
+% {Xd-} & MVs- & Xs- & : "I eat, then I sleep" +% JT+ & CO+: "then last week, I changed my mind" +% JT+: "if not next Tuesday, then when do you want to do it?" then.r: - ({Xd- & Xc+} & MVp-) + ({Xd- & Xc+} & {[Mp-]-0.09} & MVp-) or Pp- or E+ - or ({Xc+ & {Xd-}} & CO+) + or ({JT+} & {Xc+ & {Xd-}} & CO+) + or JT+ or (Wc- & (Xc+ or [()]) & Qd+) or EB- or (S+ & Xd- & Xc+ & MVs-) + or ({Xd-} & MVs- & Xs- & ) or [[Mp-]]; % Wt-: "Later." (all by itself) but also: "Later, he left" +% [Mp-]0.4: see notes above ; prefer MVb- when possible. later earlier: ({ECa- or Yt-} & (E+ or - Mp- or + [Mp-]0.4 or Pp- or - MVb- or + ({[Mp-]-0.09} & MVb-) or (Wt- & {Xc+}) or [({Xc+ & {Xd-}} & CO+)] or (Xd- & Xc+ & (MX*x- or MVx-)) or @@ -8869,7 +9139,7 @@ % SFpp+: "nearby are more ruins" % {EE-} & {Xc+} & COp+: "{somewhere} nearby, a mouse scratched" % {EE-} & FM-: "The ticking came from {somewhere} nearby." -nearby close_by: +nearby close_by handy.r: A+ or MVp- or Pp- @@ -8882,9 +9152,12 @@ {J+} & (Mp- or Pp- or MVp- or [({Xc+ & {Xd-}} & CO+)] or FM-); % Consider "Here's the ball." We have two choices: SFst+ as a filler-it, -% or the more questionable . Maybe PF is reasonable.. but -% SFst seems better at the moment. -here: J- or or (SFst+ & ) or ; +% or with subject-verb inversion. Both seem reasonable. +here: + J- + or + or [dSFst+ & ]0.15 + or ; % Wi-: [come] Over here! over_here: Wi-; @@ -8892,15 +9165,16 @@ % EN- & Pp-: "you are halfway there" % EN- & J-: "we stopped about halway there" % Wi-: "There!" -% Wd- & PFt+: "there lay the ball"; the PFt+ prevents connections to +% Wp- & PFt+: "there lay the ball"; the PFt+ prevents connections to % the PFb- on . 
-there.r: +% "there the remains can be found" +there.r thither: J- or - or ((SFst+ or SFp+ or SFut+) & ) - or SFIst- - or SFIp- - or (Wp- & PFt+) + or [(dSFst+ or dSFp+ or dSFut+) & ].15 + or [dSFIst-].15 + or [dSFIp-].15 + or or OXt- or (EN- & (Pp- or J-)) or Wi-; @@ -8928,8 +9202,8 @@ or (J+ & ( or FM- or )); -ahead_of by_way_of akin_to betwixt vis-a-vis in_lieu_of on_account_of -in_place_of in_search_of: +ahead_of by_way_of akin_to betwixt vis-a-vis vis-à-vis cf. +in_lieu_of on_account_of in_place_of in_search_of: or (J+ & ( or )); @@ -9091,7 +9365,7 @@ or AN+ or Wa-; -% The naked ND- can occur with tiem intervals: +% The naked ND- can occur with time intervals: % "I can't decide between 7:30AM and 9:30AM" % AM.ti PM.ti am.ti pm.ti a.m. p.m. o'clock: /en/words/units.5: @@ -9190,23 +9464,29 @@ % {Dmc- or @M+}: avoid used Dmc together with M when parsing sentences like % "She is two years older than me" -days.n weeks.n moments.n hours.n minutes.n years.n instants.n periods.n months.n -nights.n seconds.n decades.n centuries.n: - {NM+} & (( & - (({Dmc- or @M+} & {WN+ or TH+ or or (R+ & Bp+)} & {@MXp+} & - ( or - or - )) or - Up- or - (YP+ & {Dmc-}) or - (GN+ & (DD- or [()])))) or - [[AN+]]); +% ND- & A- & D- & Jp-: "we walked for a further three hours" +days.n weeks.n moments.n hours.n minutes.n years.n instants.n +periods.n months.n nights.n seconds.n decades.n centuries.n: + ({NM+} & (( & + (({Dmc- or @M+} & {WN+ or TH+ or or (R+ & Bp+)} & {@MXp+} & + ( or + or + )) or + Up- or + (YP+ & {Dmc-}) or + (GN+ & (DD- or [()])))) or + [[AN+]])) or + (ND- & A- & D- & Jp-); % XXX A major problem here is that the dict entries for miles.n, feet.n -% create a mass of parses that are wrong & interfere with the below. +% create a bunch of parses that are wrong & interfere with the below. 
+% Jp-: "we walked for three kilometers" +% ND- & A- & D- & Jp-: "we walked for a further three kilometers" : -((ND- or [()] or [[EN-]]) & (Yd+ or Ya+ or EC+ or [[MVp-]] or OD-)) or -(ND- & (NIfu+ or NItu- or EQt+ or EQt-)); + ((ND- or [()] or [[EN-]]) & (Yd+ or Ya+ or EC+ or [[MVp-]] or OD-)) + or ((ND- or [()]) & Jp-) + or (ND- & A- & D- & Jp-) + or (ND- & (NIfu+ or NItu- or EQt+ or EQt-)); % AU is abbreviation for "astronomical units" blocks.i feet.i miles.i yards.i inches.i @@ -9241,26 +9521,29 @@ % Add cost to Op-, try to use any other linkage before making % a unit be a plain-old object. +% A- & ND-: "200 square ft of plywood" : -((ND- or NS- or NIe-) & (NIfu+ or NItu-)) or -((ND- or NS- or NIe-) & (AN+ or EQt+ or EQt-)) or -((ND- or NS- or NIe-) & (DD- or EN-) & {Wd-} & ({Mp+} & Sp+ )) or -((ND- or NS- or NIe-) & ([[{DD-} & Op-]] or Jp-) & {Mp+}) or -((ND- or NS- or NIe-) & Xd- & MX- & Xc+) or -((ND- or NS-) & {NJ-} & (EC+ or Y+ or OD- or (Us- & {Mp+}))) or Us-; + ({A-} & (ND- or NS- or NIe-) & (NIfu+ or NItu-)) or + ({A-} & (ND- or NS- or NIe-) & (AN+ or EQt+ or EQt-)) or + ({A-} & (ND- or NS- or NIe-) & {DD- or EN-} & {Wd-} & {Mp+} & Sp+) or + ({A-} & (ND- or NS- or NIe-) & ([{DD-} & Op-] or Jp-) & {Mp+}) or + ({A-} & (ND- or NS- or NIe-) & Xd- & MX- & Xc+) or + ((ND- or NS-) & {NJ-} & (EC+ or Y+ or OD- or (Us- & {Mp+}))) or + Us-; % Abbreviations of scientific units that follow numbers % km².u mi².u in².u ft².u m².u cm².u /en/words/units.1: ; -UNITS: ; +/en/words/units.3: ; +: ; + +% Allows "200 sq. ft. of plywood", "200 cu yds of concrete" +/en/words/units.a: A+; % Units abbreviations that can be followed by a period: % ft. tbsp. yds. /en/words/units.1.dot: {Xi+} & ; -% Abbreviations that form noun-like things "degrees_Fahrenheit" etc. 
-/en/words/units.3: ND- & ; - % Time unit abbreviations: : or ((ND- or NS-) & {NJ-} & OT-); /en/words/units.4: ; @@ -9303,6 +9586,7 @@ who: (R- & (({MVp+ or MVx+} & RS+) or )) or [QI-] + or SJl+ or SJr- or Jw- or ({EL+} & ((S**w+ & {Bsw+}) or (R+ & B*w+)) & {EW-} & (Ws- or Wq- or QI*d- or BIqd-)) or ({EL+ & {N+}} & Wd-) @@ -9331,6 +9615,7 @@ or (D+ & JQ-) or Jw- or [QI-] + or SJl+ or SJr- or (Xc+ & Ic+); % QI- & (): "I do not know which" @@ -9411,9 +9696,13 @@ or ({Xd- & Xc+} & MVs-))) or ({EW-} & Ww- & Qw+)); -although in_as_much_as whilst whereas whereof wherein: +% Comparative-opener: "although a good worker, he's not a very good manager" +: (O*c+ & {Xc+ & {Xd-}} & COc+); + +although in_as_much_as whereas whereof wherein: ( & (({Xc+ & {Xd-}} & CO*s+) or ({Xd- & Xc+} & MVs-))) - or ({Xd-} & & (Wd+ or Wp+ or Wr+)); + or ({Xd-} & & (Wd+ or Wp+ or Wr+)) + or ; % QI- & (): "I do not know when" % (Mv- & Cs+): "an examination when it happened revealed chicanery" @@ -9422,11 +9711,17 @@ % Perhaps a better solution might be some special case of WN+ on % examination? (would require 'examination' and all other 'operation' % nouns to be in their own class... +% SJ: "when, but not how, will be decided." +% MJ: "when, if not next Tuesday, do you want to do it?" +% JT- & MJr-: "if not next Tuesday, then when do you want to do it?" 
when: ((WN- or BIh-) & ) or (( or ) & (BIq- or QI- or (SFsx+ & ))) or (Mv- & ) or [QI-] + or SJl+ or SJr- + or MJl+ + or ({JT-} & MJr- & Qw+) or ({EW-} & (QJ- or QJ+)) or ({EW-} & Ww- & {Qw+}) or (( or Mp+ or Mgp+ or Mv+) & @@ -9441,6 +9736,7 @@ or (QI- & ( or or [()])) or ( & ((SFsx+ & ) or WY- or BIq- or QJ+ or QJ-)) or COa+ + or SJl+ or SJr- or ({EW-} & (QJ- or QJ+)) ); @@ -9454,6 +9750,7 @@ & ( ({EW-} & Wq- & ((Rw+ & WR+) or (R+ & Bsw+) or Qw+)) or [QI-] + or SJl+ or SJr- or ({EW-} & (QJ- or QJ+)) or ( & Bsw+ & QI-) or ((WR+ or or ) & (BIq- or QI- or (SFsx+ & ))) @@ -9462,6 +9759,7 @@ whether: ((QI- or BIq-) & ( or )) or ( & SFsx+ & ) + or SJl+ or SJr- or [[( or MV+) & (({Xd- & Xc+} & MVs-) or ({Xc+ & {Xd-}} & CO*s+))]]; whether_or_not: @@ -9478,6 +9776,7 @@ or ({EW-} & Wq- & (({EL+} & Qw+) or AF+)) or [QI-] or ({EW-} & (QJ- or QJ+)) + or SJl+ or SJr- or (( or ) & (QI- or BIq- or (SFsx+ & ))) ; %%% or ((EAh+ or EEh+) & Ww-); @@ -9485,15 +9784,19 @@ % ---------------------------------------------------- % CONJUNCTIONS & ADVERBS +% "that" as subjunctive or paraphrasing % EBx+: He told me that even his mother likes me % (perhaps this should be changed to a different EB ??) that.j-c: - ({EBx+} & & ([SFsx+ & ] or TH- or [[MVh-]] or RJ*t+ or RJ*t-)) + ({EBx+} & & + ([SFsx+ & ] or TH- or [[MVh-]] or RJ*t+ or RJ*t-)) or (TS- & (SI*j+ or SFI**j+) & I*j+); +% "that" as determiner. % D*u+: "Give me that thing" that.j-d: [{AL-} & D*u+]; +% "that" as adverb. % Xd-: iffy punctuation, e.g "The man, that you saw laugh... that.j-r: ({[Xd-]} & R- & (({MVp+ or MVx+} & RS+) or )) @@ -9501,7 +9804,7 @@ or (EA+ & {HA+}) or DTn+; -% naked Osn-: "give me that" +% "that" as noun. naked Osn-: "give me that" that.j-p: ({[[M+]]} & ) or ; % "that" as topic. Wt is a topic link to LEFT-WALL. "That I did not know". 
@@ -9529,8 +9832,11 @@ (TH+ or TS+) & (({Xc+ & {Xd-}} & CO*s+) or ({Xd- & Xc+} & MVs-)); unless though.c even_though: - ( or Mgp+ or Mv+) - & (({Xc+ & {Xd-}} & CO*s+) or ({Xd- & Xc+} & MVs-) or (Xd- & Xc+ & E+)); + (( or Mgp+ or Mv+) + & (({Xc+ & {Xd-}} & CO*s+) + or ({Xd- & Xc+} & MVs-) + or (Xd- & Xc+ & E+))) + or ; as_if as_though: (( or Mgp+ or Mv+ or Mp+) @@ -9569,7 +9875,7 @@ or ( & (({Xc+ & {Xd-}} & CO*s+) or ({Xd- & Xc+} & MVs-))))) or (Yt- & ( or Qe+)); -if only_if if_and_only_if iff: +if.r only_if if_and_only_if iff: ( & {Xc+ & {Xd-}} & (Wd- & (Qd+ or Ws+ or Wq+))) or (( or [Mgp+] or [Mv+]) & (({Xd- & Xc+} & MVs-) or ({Xc+ & {Xd-}} & CO*s+))) @@ -9591,9 +9897,12 @@ no_wonder: (Wd+ or Wp+ or Wr+) & Wc-; % Pa+: "it can be pressed into shape while cold" -while: - (( or Mgp+ or Mp+ or Pa+) - & (({Xc+ & {Xd-}} & CO*s+) or ({Xd- & Xc+} & MVs-) or (Xd- & Xc+ & E+))); +while whilst: + (( or Mgp+ or Mp+ or Pa+) & + (({Xc+ & {Xd-}} & CO*s+) or + ({Xd- & Xc+} & MVs-) or + (Xd- & Xc+ & E+))) or + ; : RJlv+ or RJrv-; @@ -9657,9 +9966,11 @@ % comparatives: % he is bigger, and badder, than the pope. % If one comma is there, then the other must be also. +% SJl- & AJrc+: "She was John's height, or taller" and.j-c or.j-c but.j-c yet.j-c: ((AJlc- & AJrc+) or - (Xd- & AJlc- & AJrc+ & Xc+)) & + (Xd- & AJlc- & AJrc+ & Xc+) or + ({Xd-} & SJl- & {EB+} & AJrc+)) & (((Pam- or Mam- or AFm+) & {@MV+}) or ({[ECa-]} & MVb-) or Am+); @@ -9719,7 +10030,7 @@ % "Where is the sickle and hammer?" (SIs-) % Op- has a cost, so that "they verbed X and verbed Y" gets the VJ link % at zero cost, and the SJ link at higher cost (since a "verbed Y" can be -% understood as a modified noun). Acutally, should probably have some +% understood as a modified noun). Actually, should probably have some % post-processing rule to disallow this XXX to do fix above. 
Example of % bad SJ usage: "He bangs drums and played piano" i.e "he bangs a played piano" % @@ -9748,17 +10059,20 @@ % "I saw the (dog and pony) show" % % and_not ,_not: "I saw John, not Mary" -% We treat this here as an idiom, even though it's explicitly hanled for -% AJ nd RJ conjunctions. Its just easier, for now. +% We treat this here as an idiom, even though it's explicitly handled for +% AJ nd RJ conjunctions. Kind-of wrong, it's just easier, for now. % % {Jd- & Dm-}: "A number of recommendations and suggestions were made" % with "number of" modifying the and.j-n -% [[]] costs so that above is prefered: (huh????) +% [[]] costs so that above is preferred: (huh????) % "there was enough of the beer and sandwiches" % % XJa-: "Both June and Tom are coming" - -and.j-n but_not and_not ,_not: +% +% Xd- & SJl- & EBb+ & SJr+ & Xc+ & Wd- & Ssx+: EB+ forces a singular subject! +% +% and.j-n but_not but_just_not and_not ,_not just_not: +and.j-n 'n': ( & & {XJa-} & ( or (Spx+ & ) @@ -9779,19 +10093,28 @@ or SIu- or Wa- or [{{Jd-} & Dmu-} & Ou-] - or )); + or )) + or ((Xd- & SJl- & EB+ & SJr+ & Xc+) & (Wd- & Ssx+)) + or (({Xd-} & SJl- & EB+ & SJr+ & {Xc+}) & O-); + +% A zero-copula in a conjunction: +% "that is very nice, but not what I want" +but_not just_not: VJrsi- & O+; % {XJo-}: "I can use either this or that". +% EB+: "the problem, or rather, one of the problems, ..." % or.j-n: ( & & {XJo-} & ( or (S*x+ & ) or SIs- or [Os-] or Wa- or )) or ( & & {XJo-} & - ( or (Spx+ & ) or SIp- or [Op-] or Wa- or )); + ( or (Spx+ & ) or SIp- or [Op-] or Wa- or )) + or ((Xd- & SJl- & EB+ & SJr+ & Xc+) & (Wd- & Ssx+)) + or (({Xd-} & SJl- & EB+ & SJr+ & {Xc+}) & O-); % XJn- "neither this nor that" % XJn- is optional: "I don't like dogs nor cats" but not having it is given -% a cost only because its a kind of a strange construction ... +% a cost only because it's a kind of a strange construction ... % SJl- & SJr+ & SJl+: cascading nor's: "Neither snow nor rain nor heat..." 
% SI- can be singular or plural: for example: % "There is neither a dog nor a cat here" @@ -9810,16 +10133,28 @@ % Force use of commas: "Mary, but not Louise, is coming to the party" % Not John, but Mary led the way. % XJb-: "not only this but also that" -% XXX FIXME: the EBb and EBY below should probably be replaced by XJb -but.j-n: +% XXX FIXME: the EBb and EBy below should probably be replaced by XJb +but.j-n yet.j-n: ((Xd- & SJl- & EBb+ & SJr+ & Xc+) & (Wd- & Ssx+)) or ((Xd- & SJl- & EBy- & SJr+) & (Wd- & Ssx+)) or - (({Xd-} & SJl- & EBy- & SJr+) & Ou-) or - (({Xd-} & SJl- & XJb- & SJr+) & Ou-); + (({Xd-} & SJl- & (XJb- or EBy- or EBb+) & SJr+) & Ou-); but_also: {Xd-} & SJl- & XJb- & SJr+ & Ou-; +% SJ: "Who, if not Micheal, will provide for your care?" +% MJ: "when, if not tomorrow, do you want to do it?" +% MJr+ & MJR+: "if not next Tuesday, when do you want to do it?" +if.j-n: + ((Xd- & SJl- & EBb+ & SJr+ & Xc+) & (Ws- & S**w+)) or + ((Xd- & SJl- & EBb+ & SJr+ & Xc+) & (Ww- & Qw+)) or + ((Xd- & MJl- & EBb+ & MJr+ & Xc+) & (Ww- & Qw+)) or + ((EBb+ & MJr+ & Xc+ & MJr+) & Ww-); + +% Conditional: if ... then ... +if.j-c: Wd- & & XJc+; +then.j-c: {Xd-} & XJc- & VJr+; + % -------------------------------------------------------------------------- % Conjoined verbs/verb phrases % "Oscar Peterson played piano and wrote music." @@ -9828,17 +10163,34 @@ % "This is a problem Moscow created and failed to solve." % [I-]0.2, []0.2: avoid I links to conjoined non-infinitives. % XXX This is hacky, we should just prevent such infinitive links from -% occuring at all. +% occurring at all. +% {TO+}: "I aim to do something and to help." 
: - (({Xd-} & VJlsi- & VJrsi+) & (({@MV+} & Ss- & ) or (RS- & Bs-) or ([I-]0.2 & {@MV+} & ) or ({Xd-} & VJrsi-))) or - (({Xd-} & VJlpi- & VJrpi+) & (({@MV+} & Sp- & ) or (RS- & Bp-) or ([I-]0.2 & {@MV+} & ) or ({Xd-} & VJrpi-))) or - (({Xd-} & VJlst- & VJrst+) & ((({@MV+} & Ss- & ) or ([I-]0.2 & {@MV+} & )) & (O+ or (B- & {B+})))) or - (({Xd-} & VJlpt- & VJrpt+) & ((({@MV+} & Sp- & ) or ([I-]0.2 & {@MV+} & )) & (O+ or (B- & {B+})))) or (({Xd-} & VJlh- & VJrh+) & (PP- & {@MV+} & )) or + (({Xd-} & VJlsi- & VJrsi+) & + (({@MV+} & Ss- & ) or + (RS- & Bs-) or + ([I-]0.2 & {@MV+} & ) or + ({Xd-} & VJrsi-))) or + (({Xd-} & VJlpi- & {TO+} & VJrpi+) & + (({@MV+} & Sp- & ) or + (RS- & Bp-) or + ([I-]0.2 & {@MV+} & ) or + ({Xd-} & VJrpi-))) or + (({Xd-} & VJlst- & VJrst+) & + ((({@MV+} & Ss- & ) or + ([I-]0.2 & {@MV+} & )) & + (O+ or (B- & {B+})))) or + (({Xd-} & VJlpt- & VJrpt+) & + ((({@MV+} & Sp- & ) or + ([I-]0.2 & {@MV+} & )) & + (O+ or (B- & {B+})))) or + (({Xd-} & VJlh- & VJrh+) & (PP- & {@MV+} & )) or ((VJlg- & VJrg+) & (J-)) or ((VJlp- & VJrp+) & []0.2) or ((VJls- & VJrs+) & []0.2); and.j-v or.j-v: ; +then.j-v: ; % ditransitive conjunction: "I gave Bob a doll and Mary a gun" % Actually, optionally ditransitive, to cover more cases. @@ -9848,11 +10200,14 @@ {Xd-} & VJd- & O+ & {O*n+}; and.j-o or.j-o: ; +then.j-o: ; % XJn-: neither ... nor ... +% I-: "I don't want that, nor do you" nor.j-v: (VJl*i- & XJn- & VJr*i+ & ({@MV+} & S- & )) or - (VJl*t- & XJn- & VJr*t+ & ({@MV+} & S- & O+ & )); + (VJl*t- & XJn- & VJr*t+ & ({@MV+} & S- & O+ & )) or + ({Xd-} & VJl*i- & VJr*i+ & ({@MV+} & I- & )); % Similar to and, but allows optional comma before "but" % "blah blah, but blah" @@ -9885,7 +10240,7 @@ % The costly [[]] is quite ugly and unappealing, but is % needed to parse "he is either in the 105th nor the 106th battalion". % The problem here is that "either in" seems to be order-reversed from -% "in either", and doing it right would require link-corssing. 
+% "in either", and doing it right would require link-crossing. either.r: Ds+ or XJo+ @@ -9908,8 +10263,15 @@ for.r: [[(({Xd-} & ) or Wc-) & (Wd+ or Wp+ or Wr+ or Qd+ or Ws+ or Wq+)]]; yet.r: ((({Xd-} & ) or Wc-) & (Wd+ or Wp+ or Wr+)) or E+ or MVa- or ({Xd-} & Xc+ & CO+); -thus therefore: ({Xc+ & {Xd-}} & CO+) or ({Xd-} & & Wd+) or -({Xd- & Xc+} & (E+ or EB-)) or (Xd- & Xc+ & MVa-); +% therefor is a common mis-spelling, unlikely the archaic therefor ... +% : "thus it would seem" +thus therefore therefor: + ((Xc+ & {Xd-}) & CO+) or + [CO+]0.3 or + ({Xd-} & & Wd+) or + ({Xd- & Xc+} & (E+ or EB-)) or + (Xd- & Xc+ & MVa-) or + ; % EBy+ link is for "verbed not X but Y" "I saw not Mary, but John" % @@ -9928,6 +10290,7 @@ or ({@E-} & N-) or NT+ or EBy+ + or or [[((Ma+ or Mg+ or Mv+ or Mp+) & CO+) or (Mg- & Mgn+) or (Mv- & Mvn+) @@ -9937,15 +10300,25 @@ % We include this, though it's not one of the strippable strings n't n’t: N- or EB-; -%ADJECTIVES +% "Just" is tricky... +% COMP-OPENER: "just not a good swimmer, he fell behind" +just_not: ; +% --------------------------------------------------- +% ADJECTIVES % Common disjuncts shared by virtually all adjectives. -: +% +% This one is used for openers and post-nominal modifiers. +: [{@E-} & {@MV+} & & {@MV+}] - or (AJra- & {@MV+}) - or ({@MV+} & AJla+) or ({@E-} & {@MV+} & ([[]] or (Xd- & Xc+ & MX*a-))); +% Conjoined adjectives +: + ({[EA-]-0.1} & AJra- & {@MV+}) or ({@MV+} & AJla+); + +: or ; + % Ordinary adjectives % abject.a abnormal.a abominable.a abortive.a abrasive.a abrupt.a % @@ -9953,45 +10326,51 @@ % % Lots and lots, but not all ordinary adjs can take : % Pa- & : "... is too abrasive to talk to." -% (EAh- & {Qe+}): "How big?" "How tall?" +% [EAh- & {Qe+}]: "How big?" "How tall?" 
-- large cost, as it otherwise +% causes bizarre parses for "a decidedly jolly good player" : - ({EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) - or ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {}) + ({EA- or EF+} & ( + ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {}) or ({@MV+} & MJla+) or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or )) - or (EAh- & {Qe+}); + or )) + or ({EF+} & ) + or [[DD- & ]] + or [[{DD-} & ]] + or [EAh- & {Qe+}]; -% PH-: connect, phonetically, to a/an if it is there. -: - & ; +% Macro, for all the common parts of an A+ connection, with +% the phonetic attachement as a variable. -: - & ; + +% PH-: connect, phonetically, to a/an if it is there. +: + ({EA- or ({()} & {EF+ or MX*ta+})} & {[[@Ec-]]} & {Xc+} & A+); +: + ({EA- or ({()} & {EF+ or MX*ta+})} & {[[@Ec-]]} & {Xc+} & A+); +: + ({EA- or ({()} & {EF+ or MX*ta+})} & {[[@Ec-]]} & {Xc+} & A+); /en/words/words.adj.1-vowel : or - ; + or ; /en/words/words.adj.1-const : or - ; + or ; -% Make the given name Frank be prefered to 'frank.a' +% Make the given name Frank be preferred to 'frank.a' % e.g. "Frank felt vindicated when his long time rival Bill revealed that % he was the winner of the competition." frank.a: - [ or ]0.2; + [ or or ]0.2; -% Add a miniscule cost, so that the noun form is prefered... +% Add a minuscule cost, so that the noun form is preferred... % An older formulation of this used Ah- as the link, but I don't see % why. Generic adjective should be OK. Given a cost of 0.04, so -% as to give a slight prefernce for the noun-form, if possible. -HYPHENATED-WORDS.a: - []0.04; +% as to give a slight preference for the noun-form, if possible. +.a: + [ or ]0.04; % Color names. 
Just like ordinary adjectives, except that the % color names themselves can be modified by other nouns, gerunds, @@ -10007,14 +10386,15 @@ % "Her shoes are fire-engine red" % [A-]0.2: "a big green apple" want "big" to modify "apple", not "green" : - {EA- or EF+} & {(AN- or [A-]0.2) & {Ds-}} & + ({EA- or EF+} & {(AN- or [A-]0.2) & {Ds-}} & (({[[@Ec-]]} & {Xc+} & A+) or ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {}) or ({@MV+} & MJla+) or AA+ or [[DD- & ]] or [[{DD-} & ]] - or ); + or )) + or ({EF+} & {(AN- or [A-]0.2) & {Ds-}} & ); % pinkish brownish not in this list as they can't take the modifiers. auburn.a black.a blue.a blueberry.a @@ -10025,42 +10405,53 @@ ; -% "We caught a through flight", "its a done job" - adjective -- !? probably over-broad. : - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + or + ({EA- or EF+} & ( ((AF+ or Ma- or MJra-) & {@MV+}) or ({@MV+} & MJla+) or AA+ or [[DD- & ]] or - [[{DD-} & ]]); + [[{DD-} & ]])); -done.c gone.c through.c: & ; - -responsible.a accountable.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or - ((Pa- or AF+ or Ma- or Vh- or MJra-) & {@MV+}) or - ({@MV+} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ); +% "We caught a through flight", "it's a done job" - adjective -- !? +% probably over-broad. +done.c gone.c through.c: + + or ; + +responsible.a: + ({EA- or EF+} & ( + ((Pa- or Vh-) & {@MV+}) or + )) + or ({EF+} & ) + or + or ; + +accountable.a: + ({EA- or EF+} & ( + ((Pa- or Vh-) & {@MV+}) or + )) + or ({EF+} & ) + or + or ; long.a: - ( & ) + + or or ((Ya- or Yt-) & (Pa- or Ma- or MJra- or MJla+)) or (H- & (BT+ or Yt+)); % Hmm does distant really belong here? % "The river is a mile wide here": Ya- & Pa- & MVp+ wide.a tall.a deep.a distant.a: - ( & ) + or + or or (Ya- & (Pa- or Ma- or MJra- or ) & {@MV+}) or (Ya- & {@MV+} & MJla+); old.a: - ( & ) + or or (Ytm- & (Pa- or Ma- or or MJra- or MJla+)); % ??? adj-op already has MX*a- in it, why do we need a bare MX- here ? 
@@ -10070,75 +10461,78 @@ % "ages" work like aged, but with a cost. ages.i: []; + % The following all have a very regular pattern, with just one variable % part, the TO TH type section. This regularity should be exploited to % simplify the expressions ... -easy.a hard.a simple.a difficult.a fun.a expensive.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) +easy.a expensive.a: + ({EA- or EF+} & + (((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) or ({@MV+} & {( or ) & {LE+}} & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & )); + or ({} & ))) + or ({EF+} & {} & ) + or + or ; + +hard.a simple.a difficult.a fun.a: + ({EA- or EF+} & + (((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) + or ({@MV+} & {( or ) & {LE+}} & MJla+) + or ({} & ))) + or ({EF+} & {} & ) + or + or ; % M-: "Do you have an example ready?" % Although we'd like to use Ma- for the above, post-processing prevents this. ready.a: - {EA- or EF+} & ( - ({[[@Ec-]]} & {Xc+} & A+ & ) - or ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) + ({EA- or EF+} & ( + ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) or ({@MV+} & {( or ) & {LE+}} & MJla+) - or AA+ or M- - or [[DD- & ]] - or [[{DD-} & ]] - or ({ or } & )); + or ({ or } & ))) + or ({EF+} & { or } & ) + or + or ; silly.a nasty.a pleasant.a dangerous.a cruel.a standard.a safe.a legal.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+ & ) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) + ({EA- or EF+} & + (((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) or ({@MV+} & {( or ) & {LE+}} & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & )); + or ({} & ))) + or ({EF+} & {} & ) + or + or ; % Identical to above, but starts with vowel unpleasant.a illegal.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+ & ) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) + ({EA- or EF+} & + (((Paf- or AF+ or Ma- 
or MJra-) & {@MV+} & {( or ) & {LE+}}) or ({@MV+} & {( or ) & {LE+}} & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & )); + or ({} & ))) + or ({EF+} & {} & ) + or + or ; : - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or THi+ or ) & {LE+}}) + ({EA- or EF+} & + (((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or THi+ or ) & {LE+}}) or ({@MV+} & {( or THi+ or ) & {LE+}} & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & ) - or AJr-); + or ({} & ) + or AJr-)) + or ({EF+} & {} & ) + or ; good.a bad.a nice.a strange.a wonderful.a terrible.a possible.a fair.a -tough.a: - & ; +tough.a great.a: + or ; unusual.a useful.a impossible.a annoying.a unfair.a unuseful.a: - & ; + or ; a_bitch : - ; - -great.a: or ( & ); + + or ({EA- or EF+} & {[[@Ec-]]} & {Xc+} & A+); % Surely this is incomplete... one_and_only: @@ -10146,180 +10540,206 @@ % Identical to below, but starts with vowel. important.a essential.a imperative.a: - or - ({EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+ & ) - or ((Paf- or AF+ or Ma- or MJra-) & (({@MV+} & {(THi+ or or TSi+) & {LE+}}) or )) + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & (({@MV+} & {(THi+ or or TSi+) & {LE+}}) or )) or ((({@MV+} & {(THi+ or or TSi+) & {LE+}}) or ) & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & ))); + or ({} & ))) + or ({EF+} & {} & ) + or + or ; % Identical to above, but starts with consonant crucial.a necessary.a vital.a: - or - ({EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+ & ) - or ((Paf- or AF+ or Ma- or MJra-) & (({@MV+} & {(THi+ or or TSi+) & {LE+}}) or )) + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & (({@MV+} & {(THi+ or or TSi+) & {LE+}}) or )) or ((({@MV+} & {(THi+ or or TSi+) & {LE+}}) or ) & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & ))); - -% XXX FIXME Most of the below need to be sorted into vowel/consonant groups. 
-% -common.a practical.a original.a normal.a helpful.a striking.a -confusing.a frustrating.a disturbing.a -logical.a illogical.a elegant.a efficient.a awful.a just.a unjust.a -absurd.a natural.a alarming.a acceptable.a unacceptable.a deplorable.a -detestable.a scary.a shocking.a + or ({} & ))) + or ({EF+} & {} & ) + or + or ; + +common.a practical.a normal.a helpful.a striking.a +confusing.a frustrating.a disturbing.a logical.a just.a +natural.a deplorable.a detestable.a scary.a shocking.a poetical.a: - or - ({EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or or ) & {LE+}}) + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or or ) & {LE+}}) or ({@MV+} & {(THi+ or ) & {LE+}} & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or )); + or )) + or ({EF+} & ) + or + or ; + +original.a illogical.a elegant.a efficient.a awful.a unjust.a +absurd.a alarming.a acceptable.a unacceptable.a: + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or or ) & {LE+}}) + or ({@MV+} & {(THi+ or ) & {LE+}} & MJla+) + or )) + or ({EF+} & ) + or + or ; +% XXX FIXME Most of the below need to be sorted into vowel/consonant groups. 
+% surprising.a interesting.a odd.a remarkable.a amazing.a exciting.a depressing.a rare.a embarrassing.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or or or ) & {LE+}}) or ({@MV+} & {(THi+ or or ) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ); + )) + or ({EF+} & ) + or + or ; crazy.a sane.a insane.a stupid.a ridiculous.a wrong.a curious.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or or THi+ or ) & {LE+}}) + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or or THi+ or ) & {LE+}}) or ({@MV+} & {( or or THi+) & {LE+}} & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & )); + or ({} & ))) + or ({EF+} & {} & ) + or + or ; wise.a unwise.a smart.a intelligent.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or or ) & {LE+}}) or ({@MV+} & {( or ) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({} & )); + ({} & ))) + or ({EF+} & {} & ) + or + or ; unlikely.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or THi+ or ) & {LE+}}) or ({@MV+} & {( or THi+ or ) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({} & )); + ({} & ))) + or ({EF+} & {} & ) + or + or ; likely.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or THi+ or ) & {LE+}}) or ({@MV+} & {( or THi+ or ) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({} & ) or - [E+]); - -apparent.a false.a official.a strict.a significant.a funny.a notable.a untrue.a tragic.a -plain.a urgent.a a_drag a_bummer definite.a evident.a impressive.a incredible.a -inevitable.a mysterious.a pathetic.a probable.a admirable.a commendable.a conceivable.a -insignificant.a miraculous.a self-evident.a 
undeniable.a plausible.a understandable.a -demonstrable.a hilarious.a improbable.a inexcusable.a outrageous.a paradoxical.a shameful.a -inconceivable.a unbelievable.a astonishing.a disgraceful.a debatable.a arguable.a -lamentable.a regrettable.a -well-known.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({} & ) or + [E+])) + or ({EF+} & {} & ) + or + or ; + +% Just like below, but vowels +apparent.a official.a untrue.a urgent.a +evident.a impressive.a incredible.a inevitable.a admirable.a +insignificant.a undeniable.a understandable.a +improbable.a inexcusable.a outrageous.a +inconceivable.a unbelievable.a astonishing.a arguable.a: + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or or ) & {LE+}}) or ({@MV+} & {(THi+ or ) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ); + )) + or ({EF+} & ) + or + or ; + +% Just like above, but consonants +false.a strict.a significant.a funny.a notable.a tragic.a +plain.a definite.a mysterious.a pathetic.a probable.a +commendable.a conceivable.a miraculous.a self-evident.a +plausible.a demonstrable.a hilarious.a paradoxical.a shameful.a +disgraceful.a debatable.a lamentable.a regrettable.a well-known.a +a_drag a_bummer: + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or or ) & {LE+}}) or + ({@MV+} & {(THi+ or ) & {LE+}} & MJla+) or + )) + or ({EF+} & ) + or + or ; clear.a unclear.a relevant.a irrelevant.a obvious.a immaterial.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or QIi+ or ) & {LE+}}) or ({@MV+} & {(THi+ or QIi+) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ); + )) + or ({EF+} & ) + or + or ; + +% Implement a zero-that; viz. +% "We have to make sure [that] everyone is here." 
+: Cet+ & CV+; -clear.i: {EA- or EF+} & Vm- & TH+; -sure.i certain.i: {EA- or EF+} & Vm- & (TH+ or or (OF+ & {@MV+})); +clear.i: + {EA- or EF+} & Vm- & (TH+ or ); + +sure.i certain.i: + {EA- or EF+} & Vm- & (TH+ or or or (OF+ & {@MV+})); % (AJrc- & {@MV+}): "It is lighter and less costly" -tactful.a conventional.a advisable.a prudent.a sensible.a tactless.a polite.a -impolite.a arrogant.a conceited.a obnoxious.a valuable.a reasonable.a -unreasonable.a traditional.a unnecessary.a tempting.a usual.a -inadvisable.a lovely.a a_mistake ethical.a unethical.a immoral.a -childish.a awkward.a appropriate.a costly.a customary.a desirable.a -dumb.a effective.a fashionable.a energy-intensive.a -foolish.a healthy.a hip.a okay.a OK.a ok.a -painful.a selfish.a sufficient.a advantageous.a boring.a -inappropriate.a insufficient.a irrational.a irresponsible.a +% Exactly the same as below, but consonant. +tactful.a conventional.a prudent.a sensible.a tactless.a polite.a +conceited.a valuable.a reasonable.a traditional.a tempting.a +lovely.a childish.a costly.a customary.a desirable.a +dumb.a fashionable.a foolish.a healthy.a hip.a +painful.a selfish.a sufficient.a boring.a mandatory.a meaningless.a preferable.a senseless.a trivial.a wrongheaded.a premature.a risky.a dishonest.a -hypocritical.a enjoyable.a idiotic.a inconvenient.a unkind.a pointless.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) +hypocritical.a pointless.a +a_mistake: + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) or ({@MV+} & { & {LE+}} & MJla+) - or AA+ - or or (AJrc- & {@MV+}) - or [[DD- & ]] - or [[{DD-} & ]]); + or )) + or ({EF+} & ) + or + or ; + +% Exactly the same as above, but vowel. 
+advisable.a impolite.a arrogant.a obnoxious.a +unreasonable.a unnecessary.a usual.a +inadvisable.a ethical.a unethical.a immoral.a +awkward.a appropriate.a effective.a energy-intensive.a +okay.a OK.a ok.a advantageous.a +inappropriate.a insufficient.a irrational.a irresponsible.a +enjoyable.a idiotic.a inconvenient.a unkind.a: + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) + or ({@MV+} & { & {LE+}} & MJla+) + or (AJrc- & {@MV+}) + or )) + or ({EF+} & ) + or + or ; unknown.a questionable.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {QIi+}) or ({@MV+} & {QIi+} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ); + )) + or ({EF+} & ) + or + or ; certain.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or or TH+ or QI+ or (OF+ & {@MV+})) & {LE+}}) or ({@MV+} & {( or or TH+ or QI+ or (OF+ & {@MV+})) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({ or or TH+ or QI+ or OF+} & )); + ({ or or TH+ or QI+ or OF+} & ))) + or ({EF+} & { or or TH+ or QI+ or OF+} & ) + or + or ; sure.a unsure.a uncertain.a careful.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {( or or TH+ or QI+ or (OF+ & {@MV+})) & {LE+}}) or ({@MV+} & {( or or TH+ or QI+ or (OF+ & {@MV+})) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({ or or TH+ or QI+ or OF+} & )); + ({ or or TH+ or QI+ or OF+} & ))) + or ({EF+} & { or or TH+ or QI+ or OF+} & ) + or + or ; % XXX FIXME: many of the below should probably take like the % above... 
@@ -10328,36 +10748,33 @@ disappointed.a upset.a sorry.a content.a determined.a amused.a amazed.a astonished.a astounded.a pleased.a disgusted.a distressed.a dismayed.a irritated.a embarrassed.a alarmed.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {( or TH+) & {LE+}}) or ({@MV+} & {( or TH+) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({ or TH+} & )); - -glad.a fortunate.a unfortunate.a lucky.a unlucky.a happy.a sad.a surprised.a -delighted.a overjoyed.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({ or TH+} & ))) + or ({EF+} & { or TH+} & ) + or + or ; + +glad.a fortunate.a unfortunate.a lucky.a unlucky.a happy.a sad.a +surprised.a delighted.a overjoyed.a: + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {( or or TH+) & {LE+}}) or ({@MV+} & {( or or TH+) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({ or or TH+} & )); + ({ or or TH+} & ))) + or ({EF+} & { or or TH+} & ) + or + or ; % common adjectives, taking "to", "of", "that" e.g. "proud that" proud.a scared.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {( or TH+ or (OF+ & {@MV+})) & {LE+}}) or ({@MV+} & {( or TH+ or (OF+ & {@MV+})) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({ or TH+ or OF+} & )); + ({ or TH+ or OF+} & ))) + or ({EF+} & { or TH+ or OF+} & ) + or + or ; % common adjectives, taking "of" e.g. "tired of", "sick of" etc. 
tired.a pooped.a full.a sick.a critical.a guilty.a innocent.a @@ -10365,77 +10782,84 @@ capable.a contemptuous.a incapable.a reminiscent.a scornful.a mindful.a short.a appreciative.a complimentary.a born.a worthy.a free.a terrified.a unworthy.a prognostic.a dead.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {OF+ & {@MV+}}) or ({@MV+} & {OF+ & {@MV+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({OF+ & {@MV+}} & )); + ({OF+ & {@MV+}} & ))) + or ({EF+} & {OF+ & {@MV+}} & ) + or + or ; fond.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & OF+ & {@MV+}) or (OF+ & {@MV+} & MJla+) or - (OF+ & )); + (OF+ & ))) + or ({EF+} & OF+ & ) + or ; + afraid.a ashamed.a unafraid.a unashamed.a: - {EA- or EF+} & + ({EA- or EF+} & (((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {((OF+ & {@MV+}) or or TH+ or ) & {LE+}}) or ({@MV+} & {((OF+ & {@MV+}) or or TH+ or ) & {LE+}} & MJla+) or - ({OF+ or or TH+ or } & )); - -apprehensive.a secure.a optimistic.a pessimistic.a annoyed.a confused.a offended.a -insulted.a concerned.a depressed.a doubtful.a -grateful.a mad.a mistaken.a hopeful.a unhappy.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({OF+ or or TH+ or } & ))) + or ({EF+} & {OF+ or or TH+ or } & ) + or ; + +apprehensive.a optimistic.a annoyed.a offended.a +insulted.a unhappy.a +secure.a pessimistic.a confused.a concerned.a depressed.a +doubtful.a grateful.a mad.a mistaken.a hopeful.a: + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {TH+ & {LE+}}) or ({@MV+} & {TH+ & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({TH+} & )); + ({TH+} & ))) + or ({EF+} & {TH+} & ) + or + or ; aware.a unaware.a: - {EA- or EF+} & + ({EA- or EF+} & (((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {(TH+ or (OF+ & {@MV+})) & {LE+}}) or ({@MV+} & {(TH+ or (OF+ & {@MV+})) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({TH+ or OF+} & 
)); + ({TH+ or OF+} & ))) + or ({EF+} & {TH+ or OF+} & ) + or ; -true.a conscious.a confident.a skeptical.a jealous.a suspicious.a envious.a desirous.a +true.a conscious.a confident.a skeptical.a jealous.a +suspicious.a envious.a desirous.a convinced.a unconvinced.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {(TH+ or (OF+ & {@MV+})) & {LE+}}) or ({@MV+} & {(TH+ or (OF+ & {@MV+})) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({TH+ or OF+} & )); + ({TH+ or OF+} & ))) + or ({EF+} & {TH+ or OF+} & ) + or + or ; -eager.a reluctant.a able.a unable.a impatient.a eligible.a brave.a anxious.a apt.a desperate +eager.a reluctant.a able.a unable.a impatient.a +eligible.a brave.a anxious.a apt.a desperate keen.a prepared.a willing.a hesitant.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & { & {LE+}}) or ({@MV+} & { & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({} & )); + ({} & ))) + or ({EF+} & {} & ) + or + or ; -former.a: A+ or G+ or (DG- & ); +former.a: ( & A+) or G+ or (DG- & ); latter.a: DG- & ; -overall.a onetime.a outboard.a pinstripe.a goddam.a de_facto de_jure erstwhile.a -foster.a outright.a online.a: A+; -pro_forma ad_hoc bona_fide: A+ or Pa-; -a_priori a_posteriori: A+ or MVa- or ({Xc+ & {Xd-}} & CO+); + +overall.a onetime.a outboard.a pinstripe.a +goddam.a defacto de_facto de_jure erstwhile.a +foster.a outright.a online.a: & A+; + +pro_forma ad_hoc bona_fide: ( & A+) or Pa-; + +a_priori a_posteriori apriori aposteriori: + ( & A+) or MVa- or ({Xc+ & {Xd-}} & CO+); asleep.a awake.a alike.a alive.a ablaze.a adrift.a afire.a aflame.a afloat.a afoot.a aghast.a aglow.a agog.a ajar.a amiss.a askew.a @@ -10459,24 +10883,31 @@ MVp- or E+; +% Max-: "the only apartment available" +% Max passes through post-processing, whereas Ma- is disallowed... 
outstanding.a available.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or - ((Pa- or AF+ or Max- or MJra-) & {@MV+}) or - ({@MV+} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ); + ({EA- or EF+} & ( + ((Pa- or Max-) & {@MV+}) or + )) + or ({EF+} & ) + or + or ; due.i effective.i: (TY+ or JT+) & ; -north.a south.a east.a west.a northeast.a northwest.a southeast.a southwest.a: - or -A+ or ({Yd-} & {OF+} & (Pp- or MVp- or Mp- or (Xc+ & Xd- & (MVx- or MX*x-)))) -or [[DD- & ]]; -northern.a southern.a eastern.a western.a northeastern.a northwestern.a southeastern.a -southwestern.a: or A+ or G+; +north.a south.a east.a west.a northeast.a +northwest.a southeast.a southwest.a: + + or ( & A+) + or ({Yd-} & {OF+} & (Pp- or MVp- or Mp- or (Xc+ & Xd- & (MVx- or MX*x-)))) + or [[DD- & ]] + or [[{DD-} & ]]; + +northern.a southern.a eastern.a western.a northeastern.a +northwestern.a southeastern.a southwestern.a: + + or ( & A+) + or G+; % .. is this correct? benumbed.a bespattered.a non_compos_mentis dead_on_arrival @@ -10492,8 +10923,8 @@ %COMPARATIVES AND SUPERLATIVES % Omm-: "I want more" -- the second m blocks O*t+ on -% Non-zero cost on Omm- so that EA+ is prefered. -% Cost of >1.0 on Omm- so that MVm- is prefered for "He runs more". +% Non-zero cost on Omm- so that EA+ is preferred. +% Cost of >1.0 on Omm- so that MVm- is preferred for "He runs more". more: ({ECa-} & (EAm+ or EEm+ or [MVm-] or [EB*m-] or Qe+ or or AJrc- or AJlc+)) or ({OF+} & ( @@ -10506,7 +10937,7 @@ more_of_an: Ds*mv+ or ( & Ds*mx+); % XXX TODO: shouldn't less be a lot more like 'more', above? -% Cost of >1.0 on Om- so that MVm- is prefered for "He runs less". +% Cost of >1.0 on Om- so that MVm- is preferred for "He runs less". 
less: ({ECa-} & (EAm+ or EEm+ or [MVm-] or [EB*m-] or AJrc- or AJlc+)) or ({ECn-} & (Dmum+ or (Ss+ & ) or Bsm+)) @@ -10540,28 +10971,36 @@ as.e-c: (MVz- & (((O*c+ or S**c+ or ({SFsic+} & Zc+)) & {Mp+}) or Mpc+ or )) or (MVzo- & Ct+ & Bc+ & {U+}) - or (MVzp- & (CX+ or CQ+)) + or (MVzc- & (CX+ or CQ+)) + or (MVzp- & Pa+) or (MVza- & Cta+ & ((AFd+ & {Pa+}) or PFc+)); % prepositional, mostly % MVi- & TO+: "He said it in a voice so loud as to make everyone stare." -% MVs- & Sa*v+: "he left as agreed" -% MVs- & Sa*a+ & CV+: " ..., as shall be proven" +% VCz- & Sa*v+: "he left as agreed" +% VCz- & Sa*a+ & CV+: " ..., as shall be proven" % The punctuation is mandatory, here. % The CV is awkward, as it uses a null-subject. +% XXX Try to get rid of this... % Cz+ & CV+: "the accused, as it shall be shown, is innocent" % use Cz instead of because post-processing kills the % Cs link with a "Unbounded s domain78" error. -% -: Sa*v+ or (Sa*a+ & CV+); +% AZ- & Mg+: "It described the treaty as marking a new stage" +% & CO+: "As we set sail, a gale blew up" +% BIt+: "his statements, as candidate, contradict his actions" as.e: - ((J+ or Mp+ or TI+ or ({SFsic+} & Zs+)) & + ((J+ or Mp+ or TI+ or Zs+) & (({Xc+ & {Xd-}} & CO+) or ({Xd- & Xc+} & MVp-))) or ((J+ or Mp+ or BIt+) & ([Mp-] or (Xd- & Xc+ & MX*x-))) or (AZ- & Pa+) - or (( or ) & (({Xc+ & {Xd-}} & CO+) or ({Xd- & Xc+} & MVs-))) - or (Cz+ & CV+) - or ({{Xd-} & MVs-} & & {Xc+}) + or (AZ- & Mg+) + or ({Xd-} & {[hVCz-]-0.05} & Cz+ & CV+) + % or ({Xd-} & hVCz- & Cz+ & CV+) + or ( & (({Xc+ & {Xd-}} & CO+))) + or ((Sa*v+ or (Sa*a+ & CV+)) & {Xc+ & {Xd-}} & CO+) + or (Sa*v+ & {Xd- & {Xc+}} & VCz-) + or [Sa*a+ & CV+ & {Xd- & {Xc+}} & VCz-]-0.05 + or (Sa*a+ & CV+ & {Xd- & {Xc+}}) % needed for MXsr constructions or (MVi- & TO+) or [[(PFc+ or CQ+) & ({Xd- & Xc+} & MVs-)]]; @@ -10569,7 +11008,7 @@ as_possible: MVz-; -% Cc+ & CV+: C links to the head-noun of the followig clause, and CV+ +% Cc+ & CV+: C links to the head-noun of the following 
clause, and CV+ % links to the head verb. Must form a cycle. % Example: "I run more often than Ben climbs" than.e: @@ -10583,9 +11022,9 @@ % cost on MVa-: "we will arrive much sooner", want "much" to modify "sooner". % ({OFd+} & Dmu+): "I drank much of the beer" -% cost on [[]] so that the above is prefered to an O- link +% cost on [[]] so that the above is preferred to an O- link much: - ({EE-} & ([[MVa-]] or ECa+ or or Qe+)) + ({EE-} & ([[MVa-]] or ( & ECa+) or or Qe+)) or ({EEx- or H-} & ( ECn+ or ({OFd+} & Dmu+) @@ -10604,7 +11043,7 @@ % "cheaper than dirt, slime is better" : ({ECa-} & ( - ((Pam- or Mam- or AFm+ or AJrc-) & {@MV+}) + ((Pam- or Mam- or AFm+ or ({EA-} & AJrc-)) & {@MV+}) or ({[[@Ec-]]} & {Xc+} & Am+) or (Wr- & {@MV+} & MVt+ & Xc+ & PFb+) or AJlc+)) @@ -10683,7 +11122,8 @@ ((Pafm- or AFm+ or Mam- or AJrc-) & {@MV+}) or AA+ or [[DD- & ]] or - )); + )) + or ({ECa- or EF+} & ); than_expected than_imagined than_proposed than_suspected than_realized than_intended than_supposed than_reported than_ever than_usual @@ -10769,68 +11209,85 @@ a_bit a_little_bit the_least_bit: ({EE-} & EA+) - or EC+ or EE+ + or EC+ or ({Xd- & Xc+} & MVa-) or [[]]; % OFd+ & Dm+: "I will wait a little while of time" a_little_while: ({EE-} & EA+) - or EC+ or EE+ + or EC+ or ({Xd- & Xc+} & MVa-) or [[]] or (OFd+ & Dm+); -% -pretty.e extremely very_very very_very_very exceptionally -unbelievably incurably extraordinarily -jolly.e mighty.e damn.e bloody.e exceedingly overly downright plumb -vitally abundantly chronically frightfully genuinely -humanly patently -singularly supremely unbearably unmistakably unspeakably -awfully decidedly demonstrably fashionably frighteningly horrifyingly -indescribably intolerably laughably predominantly +% Basic adverbial adjectives, with phoneme markers. +% This is very simply just "({EE-} & EA+) or EE+" without the phonemes. 
+: ({EE- or } & EA+) or ( & EE+); +: ({EE- or } & EA+) or ( & EE+); +: & (EA+ or EE+); +: & (EA+ or EE+); +% +% +pretty.e very_very very_very_very +jolly.e mighty.e damn.e bloody.e downright plumb +vitally chronically frightfully genuinely +humanly patently singularly supremely +decidedly demonstrably fashionably +frighteningly horrifyingly +laughably predominantly hugely: + ; + +extremely exceptionally unbelievably incurably extraordinarily +exceedingly overly abundantly +unbearably unmistakably unspeakably +awfully indescribably intolerably unalterably undisputedly unpardonably -unreasonably unusually hugely infernally: - ({EE-} & EA+) or EE+; +unreasonably unusually infernally: + ; notoriously.e: - ({EE-} & EA+) or EE+ or Em+; + or Em+; % --------------------------------------------------------- % Adverbs whose main use is adjectival, but can be used with % adverbs such as "faster", "dumber", etc. % "That one is marginally better". (thus EBm+) % "It's an inherently better method" thus EC+ link -fabulously incomparably inherently marginally moderately -relatively ridiculously -unacceptably unarguably undeniably unimaginably: - ({EE-} & - (EA+ - or EC+ - or EBm-)) - or EE+; +fabulously marginally moderately relatively ridiculously: + + or ({EE- or } & EC+) + or ({EE-} & EBm-); + +incomparably inherently unacceptably unarguably undeniably unimaginably: + + or ({EE- or } & EC+) + or ({EE-} & EBm-); -wide.e: EE+; +wide.e: & EE+; % {EE-} & EE+: "not very much" % [[EE-]]: "It hurts, but not very" % MVl- & MVl+: "we are going to arrive very nearly on time" very.e way.e: - ({EE-} & EA+) - or ({EE-} & EE+) + + or ({EE- or } & EE+) or [[EE-]] or [[La-]] or (MVl- & MVl+); -real.e: [[EA+ or EE+]]; -quite: ({EE-} & EA+) or EE+ or EZ+ or [[Em+]]; +not_very: & EA+; + +real.e: [[]]; +quite: + + or EZ+ + or [[Em+]]; amazingly incredibly: - ({EE-} & EA+) - or EE+ + or EBm- or ({Xd- & Xc+} & Em+) or ({Xc+ & {Xd-}} & CO+) @@ -10838,69 +11295,94 @@ % MVa-: "He is behaving very 
strangely" strangely: - ({EE-} & EA+) - or EE+ + or EBm- or ({Xd- & Xc+} & Em+) or ({Xc+ & {Xd-}} & CO+) or ({Xd- & Xc+} & {EE-} & MVa-); -rather: EA+ or EE+ or Vw- or ({Xc+ & {Xd-}} & CO+); +% EB-: "The problem, or rather, one of the problems, ..." +rather: + + or Vw- + or ({Xc+ & {Xd-}} & CO+) + or ({Xc+ & {Xd-}} & EB-); particularly: - EA+ or EE+ or Em+ or EB- + + or Em+ or EB- or (MVl- & (MVp+ or MVa+ or MVs+)) or ({Xc+ & {Xd-}} & CO+); -notably: EB- or EA+ or EE+ or ({Xc+ & {Xd-}} & CO+); +notably: + + or EB- or ({Xc+ & {Xd-}} & CO+); + +% Mp- & Ju+: "She was a girl nearly John's age" +% MVp- & Ju+: "She was a girl of nearly John's age" +almost: + + or EN+ or EZ+ or Em+ or EBm- + or (MVl- & (MVp+ or MVa+ or MVs+)) + or ((Mp- or MVp-) & Ju+); -almost nearly: - EA+ or EE+ or EN+ or EZ+ or Em+ or EBm- - or (MVl- & (MVp+ or MVa+ or MVs+)); +nearly: + + or EN+ or EZ+ or Em+ or EBm- + or (MVl- & (MVp+ or MVa+ or MVs+)) + or ((Mp- or MVp-) & Ju+); % The below is similar to "nearly" ... just_about: Em+ or EN+ or EZ+ or EA+; -entirely reasonably highly fairly totally completely terribly: - EA+ or EE+ +entirely: + + or ({EE- or EF+} & (({Xd- & Xc+} & MVa-) or Em+ or Qe+ or )); + +reasonably highly fairly totally completely terribly: + or ({EE- or EF+} & (({Xd- & Xc+} & MVa-) or Em+ or Qe+ or )); absolutely: - EA+ or EE+ or EBm- + + or EBm- or ({EE- or EF+} & (({Xd- & Xc+} & MVa-) or Em+ or Qe+ or )); % allowing as opener also altogether equally: - EA+ - or EE+ + or ({EE- or EF+} & (({Xd- & Xc+} & MVa-) or Em+ or Qe+ or )) or ({Xc+ & {Xd-}} & CO+); really.e: - EA+ or EE+ or Em+ or EBm-; + or Em+ or EBm-; surprisingly: - EA+ or EE+ + or ({Xc+ & {Xd-}} & CO+) or ({Xd- & Xc+} & E+) or (Xd- & Xc+ & MVa-); especially: - EA+ or EE+ or EB- or Em+ + + or EB- or Em+ or (MVl- & (MVp+ or MVa+ or MVs+)) or ({Xc+ & {Xd-}} & CO+); -virtually: EA+ or EE+ or EN+ or EZ+ or Em+; +virtually: or EN+ or EZ+ or Em+; wholly fully critically greatly grossly duly unduly: - EA+ or ({EE- or EF+} & 
(({Xd- & Xc+} & MVa-) or Em+ or Qe+ or )); + ( & EA+) + or ({EE- or EF+} & (({Xd- & Xc+} & MVa-) or Em+ or Qe+ or )); + +seemingly: ( & EA+) or Em+; +utterly: ( & EA+) or Em+; -seemingly utterly: EA+ or Em+; barely just_barely scarcely hardly merely truly practically: - Em+ or EBm- or EA+ or Wa-; + Em+ or EBm- or ( & EA+) or Wa- or ({EBm+} & ); partly.e largely.e mostly.e chiefly.e simply.e purely.e solely.e: - Em+ or EA+ or EB- or Wa- + Em+ or ( & EA+) or EB- or Wa- or (MVl- & (MVp+ or MVa+ or MVs+)); % Em+: "It sure is great" @@ -10915,14 +11397,21 @@ % Adverbs like "biochemically". These tend to answer the question "how?" % with a noun-form (as opposed to being verb-derived) A lot of these are % of the "-ically" form -/en/words/words.adv.3: - EA+ +% +/en/words/words.adv.3-const: + ( & EA+) or ({Xd- & Xc+} & (E+ or MVa-)) or EBm- or ({{Xd-} & Xc+} & CO+); -in_part: EB- or (MVl- & (MVp+ or MVa+ or MVs+)); % academically administratively aesthetically +/en/words/words.adv.3-vowel: + ( & EA+) + or ({Xd- & Xc+} & (E+ or MVa-)) + or EBm- + or ({{Xd-} & Xc+} & CO+); + +in_part: EB- or (MVl- & (MVp+ or MVa+ or MVs+)); % --------------------------------------------------------- %ADVERBS WHOSE ONLY (MAIN) USE IS POST_VERBAL @@ -10932,7 +11421,8 @@ inward.e outward.e inwards.e outwards.e anytime.e live.e wholesale.e anew.e forever.e awhile.e aback.e afoul.e afresh.e aloft.e amok.e amuck.e onstage.e -apiece.e askance.e astern.e asunder.e inter_alia mutatis_mutandis par_excellence +apiece.e askance.e astern.e asunder.e +inter_alia mutatis_mutandis par_excellence upside-down.e ab_initio ad_infinitum ad_lib ad_libitum ad_nauseum aground.e astray.e into_account into_effect to_market to_bid from_scratch to_office for_office for_good @@ -11009,9 +11499,10 @@ or [{EA- or EF+} & (Pa- or AF+)] or Yd+; +% EE- & EB-: "..., or more exactly, ..." 
exactly.e: E+ - or EB- + or ({EE-} & {Xc+ & {Xd-}} & EB-) or EN+ or EW+ or EZ+ @@ -11033,7 +11524,7 @@ initially already somehow again once_again nowadays sometimes nevertheless nonetheless at_first at_best at_present of_late indeed: -({Xd- & Xc+} & MVa-) or E+ or ({Xc+ & {Xd-}} & CO+) or EBm-; + ({Xd- & Xc+} & MVa-) or E+ or ({Xc+ & {Xd-}} & CO+) or EBm-; twice.e: ({Xd- & Xc+} & MVa-) @@ -11049,12 +11540,12 @@ ({Xd- & Xc+} & MVa-) or E+ or ({Xc+ & {Xd-}} & CO+) - or EB-; + or ({Xc+ & {Xd-}} & EB-); also.e: ({Xd- & Xc+} & (E+ or MVa-)) or ({Xc+ & {Xd-}} & CO+) - or EB-; + or ({Xc+ & {Xd-}} & EB-); gradually.e sadly.e broadly.e clearly.e annually.e characteristically.e comparatively.e @@ -11068,26 +11559,27 @@ appropriately.e simply.ee: {EE- or EF+} & ( ({Xd- & Xc+} & (MVa- or E+)) - or ({Xc+ & {Xd-}} & CO+) - or EB- - or Qe+ - or - or [[EA+]]); + or ({Xc+ & {Xd-}} & CO+) + or ({Xc+ & {Xd-}} & EB-) + or Qe+ + or + or [[EA+]]); +% ({Xc+ & {Xd-}} & EB-)): "..., or more precisely, ..." precisely.e specifically.e generally.e: {EE- or EF+} & ( ({Xd- & Xc+} & (MVa- or E+)) - or ({Xc+ & {Xd-}} & CO+) - or EB- - or Qe+ - or - or EW+); + or ({Xc+ & {Xd-}} & CO+) + or ({Xc+ & {Xd-}} & EB-) + or Qe+ + or + or EW+); occasionally.e often.e originally.e: {EE- or EF+} & ( ({Xd- & Xc+} & (MVa- or E+)) or ({Xc+ & {Xd-}} & CO+) - or EB- + or ({Xc+ & {Xd-}} & EB-) or Qe+ or ); @@ -11098,22 +11590,23 @@ % (EEh- & {Qe+}): "How quickly?" % : "Onward came the cavalry" % ECa+: "It is vastly cheaper" +% XXX TODO the PH- should be split into PHc- and PHv- versions. : ({EE- or EF+} & (({Xd- & Xc+} & MVa-) or Em+ - or ECa+ + or ({PH-} & ECa+) or ({Xc+ & {Xd-}} & COa+) or Qe+ or or or - or [[EA+]])) + or [[{PH-} & EA+]])) or (EEh- & {Qe+}); -% XXX fixme: there are things in there, like "tall.e" that seem not to +% XXX FIXME: there are things in there, like "tall.e" that seem not to % belong?? 
-% XXX fixme: the Qe construction is weird: +% XXX FIXME: the Qe construction is weird: % "How slickly did he talk?" % "*How slickly did you say it was?" /en/words/words.adv.1: ; @@ -11148,8 +11641,10 @@ & CO+) or EB- or Qe+ or )) or ({EA- or EF+} & (Pa- or AF+)); certainly possibly probably importantly remarkably interestingly: -{EE-} & (E+ or (Xd- & Xc+ & (E+ or MVa-)) or ({Xc+ & {Xd-}} & CO+) or -({Xc+ & {Xd-}} & EB-)); + {EE-} & (E+ + or (Xd- & Xc+ & (E+ or MVa-)) + or ({Xc+ & {Xd-}} & CO+) + or ({Xc+ & {Xd-}} & EB-)); % --------------------------------------------------------- % ordinary clausal adverbs @@ -11178,7 +11673,7 @@ or ({MVp+} & {Xc+ & {Xd-}} & CO+) or ({Xc+ & {Xd-}} & EBm-); -not_suprisingly if_nothing_else: +not_surprisingly if_nothing_else: E+ or (Xd- & Xc+ & (E+ or MVa-)) or ({Xc+ & {Xd-}} & CO+) @@ -11191,20 +11686,37 @@ % Nearly identical to words.adv.2, but do not force the EBm- % Wt-: single-word sentence: "Evidently" % Wt- & Pv+: "Evidently so" -still.e presumably undoubtedly evidently apparently usually typically perhaps: +% EB- & EA+: "... or perhaps taller" +still.e presumably undoubtedly evidently apparently +usually typically perhaps: E+ or (Xd- & Xc+ & (E+ or MVa-)) or (Wt- & ({Xc+} or Pv+ or N+)) or ({Xc+ & {Xd-}} & CO+) - or EB-; + or (EB- & {[EA+]-0.1}); -in_fact of_course in_effect for_example for_instance e.g. i.e. : +% scilicet +% "it [sc. gouache] was also popular in France" +% MX*tn- & O+: scilicet nouns +% MX*ta- & P+: scilicet adjectives +% MVx- & P+: to verbs "While walking [scilicet in the evening] ..." +% Some of these sc. constructions should forcefully require a comma. +in_fact of_course in_effect for_example for_instance +e.g. eg. i.e. ie. id_est +sc sc. 
scilicet viz.r viz..r videlicet +that_is that_is_to_say to_wit namely.r in_other_words: E+ or (Xd- & Xc+ & (E+ or MVa-)) or ({Xc+ & {Xd-}} & CO+) - or (EB- & {Xc+}) - or (Xd- & EB- & Xc+) - or ({Xd-} & & (Wd+ or Wp+ or Wr+)); + or ({Xd-} & & (Wd+ or Wp+ or Wr+)) + or [EB- & {Xc+}] + or [Xd- & EB- & Xc+] + or (Xd- & + ((MX*tn- & {Xc+} & O+) or + (MX*ta- & {Xc+} & P+) or + (MVx- & {Xc+} & P+)) + & (Xc+ or )); + % ----------------------------------------------------------- % ADVERBS USABLE POST_VERBALLY OR AS OPENERS @@ -11212,16 +11724,33 @@ % by the UNKNOWN-WORD.a rule -- "Umm, I think he did it." no.e nope.e nah.e no_way yes.e yeah.e yep.e yup.e -ok.e okay.e OK.e fine.e sure.e whatever.e -someday.e sometime.e thereafter.e anyhow.e anyway.e overall.e -afterwards.e afterward.e lengthwise.e worldwide.e nationwide.e -statewide.e world-wide.e nation-wide.e state-wide.e industrywide.e -instead.e maybe.e: +ok.e okay.e OK.e fine.e sure.e whatever.e: ({Xc+ & {Xd-}} & CO+); +% EB-: "Would they have accepted this or, instead, would they have...?" +thereafter.e overall.e lengthwise.e +instead.e anyhow.e anyway.e: + + or ({Xd- & Xc+} & (MVp- or E+ or EB-)); + +% Wa-: Single-word responses to questions. +someday.e sometime.e maybe.e +afterwards.e afterward.e worldwide.e nationwide.e +statewide.e world-wide.e nation-wide.e state-wide.e industrywide.e +the_world_over: + + or ({Xd- & Xc+} & (MVp- or E+)) + or (Wa- & {Wa+}); + +% Comparative form of maybe, similar to "perhaps" +% EB- & EA+: "She was John's age or maybe older" +maybe.c: + EB- & EA+; + % Argumentatives (children gain-saying). not.intj is_too is_not is_so unh_unh: Wa-; +%suppress: DUP-BASE (for seriously.ij) % Openers to directives, commands (Ic+ connection to infinitives) % or single-word interjections, exclamations. % These are semantically important, so they've got to parse! 
@@ -11281,19 +11810,20 @@ sacre_bleu ay caramba kamoley kamoly moley moly holy_Moses mother_of_God Mother_of_God -mother_of_God mama_mia mamma_mia +mama_mia mamma_mia sonuvabitch son_of_a_bitch heck sodding_hell aw aww awww oh_great oh_wow er err.ij errr um.ij umm anyways honey.ij man.ij baby.ij hush.ij: + or or ({{Ic-} & Wi-} & {{Xd-} & Xc+} & Ic+) or or (({A-} or {E-} or {EE-}) & Wa-); % A single plain hello all by itself. Costly, because these days, -% its not normally a sentence opener. +% it's not normally a sentence opener. % Vv-: "I said, hello!" % Ds- & Jv-: "He greeted me with a loud hello" % Perhaps these should also appear as nouns? hello.n does ... @@ -11318,9 +11848,7 @@ on_arrival by_request in_total in_turn later_on for_now more_often_than_not -anyhow.ij anyway.ij afterwards.ij afterward.ij instead.ij on_second_thought - again_and_again time_and_again over_and_over day_by_day day_after_day step_by_step one_by_one even_so all_of_a_sudden: @@ -11340,10 +11868,11 @@ newly: E+; rightly: {EE-} & E+; necessarily no_longer: E+ or EBm-; -ever: E+ or EBm- or EC+ or MVa-; +ever: E+ or EBm- or EC+ or MVa- or ; -never.e always: {EN-} & (E+ or EB-); -seldom rarely.e: {EE-} & (E+ or EB-); +never.e always: ({EN-} & (E+ or EB-)) or ; +%suppress: DUP-BASE (for rarely.e) +seldom rarely.e: ({EE-} & (E+ or EB-)) or ; % MVa-: "He did just what you asked." 
% EC+: "I ate just some of the cookies" @@ -11383,6 +11912,7 @@ or (Rnx+ & ) or (MVp+ & Wq- & Q+); +%suppress: DUP-BASE (for rarely.i) never.i at_no_time not_once rarely.i since_when: {MVp+} & Wq- & Q+; @@ -11396,7 +11926,7 @@ or ((MVp+ or MVa+ or MVs+) & (MVl- or ({Xc+ & {Xd-}} & CO+))) or (Rnx+ & ); -not_even: Rnx+ & ; +not_even: (Rnx+ & ) or ; % {EE-} & EE+: "but not too much" too: @@ -11426,7 +11956,7 @@ % ========================================================== % MISCELLANEOUS WORDS AND PUNCTUATION -etc: {Xi-} & Xd- & Xc+ & (MX- or MVa-); +etc etc.: {Xi-} & Xd- & Xc+ & (MX- or MVa-); so_on the_like vice_versa v.v.: ( & ) or or @@ -11434,7 +11964,7 @@ M- or MV-; % Emoticons ... at start or end of sentences ... -EMOTICON : + : CO+ or (Wd- & NIa+) or Wa- @@ -11447,11 +11977,11 @@ % Xp+ is for new sentences. "Who is Obama? Where was he born?" % Xs+ is for dependent clauses starting with "so". % "I stayed so I could see you." -% XXX TODO: afer all WV's work, the WV link should no longer be optional... +% XXX TODO: after all WV's work, the WV link should no longer be optional... % XXX that is, change to just WV+. % : - ( or ) & {CP+} & {(Xx+ or Xp+ or Xs+) & {hWV+}} & {RW+ or Xp+}; + ( or ) & {hCPx+ or hCPi+ or hCPu+} & {(Xx+ or Xp+ or Xs+) & {hWV+}} & {RW+ or Xp+}; % QU+ links to quoted phrases. % ZZZ+ is a "temporary" addition for randomly-quoted crap, and @@ -11459,6 +11989,7 @@ % (Xc+ or [()]): allow missing comma, but at a price. LEFT-WALL: + or hCPa+ or (QUd+ & & (Xc+ or [()]) & QUc+) or [[ZZZ+ & ]]; @@ -11483,7 +12014,7 @@ ({EBb-} & EAxk+ & {HA+}) or ({EZ-} & EExk+) or Em+ - or ((({Xd-} & (MVs- or []) & Xs-) or ({Xc+} & Wc-)) + or ((({Xd-} & ([MVs-]0.5 or ) & Xs-) or ({Xc+} & Wc-)) & ( or []0.5)) or or (Wq- & CQ+) @@ -11493,6 +12024,9 @@ or O- or Js-; +and_so and_thus: + ; + % Is ever needed here? % Should we be using instead of MVs- ?? % Or maybe every use of MVs- should be converted to ??? 
@@ -11568,17 +12102,19 @@ % % Comma can conjoin nouns only if used in a list of 3 or more items: % "This, that and the other thing" -% However, this is given a cost, so that geographic names are prefered: +% However, this is given a cost, so that geographic names are preferred: % "He went to Gaeta, Italy, and to Paris, France." % % SJ: "I saw John, not Mary" is handled via idiomatic ,_not construction +% cost on []: allow Pa links with commas, e.g. +% "he paced, worried" but lower cost than Xx links % ",": - ({@Xca- or [[[@Xc-]]]} & (({[EBx+]} & Xd+) or Xc-)) + ({[@Xca-]-0.05 or [[[@Xc-]]]} & (({[EBx+]} & Xd+) or Xc-)) or [] or or - or + or []0.5 or (SJl- & SJr+ & SJl+); % :.j @@ -11602,18 +12138,21 @@ or (TH- & ); % Coordinating conjunctions that behave like punctuation. These -% connect whole clauses. Should wee use here? Why not? +% connect whole clauses. Should we use here? Why or why not? % +% then.ij is often used as a time-ordered conjunction: "I eat then I sleep" % not.ij seems to result in bad parses quite often, do we need it? % Xx-: provides coordination to the wall. % The cost on [] is to use the Xx when possible, because % the VC link often does not go leftwards far enough. % (e.g. "John screamed when I arrived but Sue left") % Wc-: "But my efforts to win his heart have failed" -but.ij and.ij or.ij not.ij also.ij but_not and_not and_yet: - [{Xd-} & (Xx- or Wc-) & {Xc+} +% EB+: "would they have accepted this or, instead, would they have ..." +but.ij and.ij or.ij not.ij also.ij then.ij but_not and_not and_yet: + [{Xd-} & (Xx- or Wc-) & {Xc+} & {EB+} & (Wdc+ or Qd+ or Ws+ or Wq+ or Ww+) & ]1.1; +%suppress: DUP-BASE (for ..y) % (NI- & WV- & W+): Optionally numbered, bulleted lists ..y *.j "•" ⁂ ❧ ☞ ◊ ※ "….j" ○ 。 ゜ ✿ ☆ * ◕ ● ∇ □ ◇ @ ◎: (Wd- & W+) @@ -11621,7 +12160,8 @@ % 、 is the "enumeration-comma" used like a dash ... 
% ‧ is the "middle dot" -– ━ ー --.r -.r 、 ~.r ~.r ‧.r : +% The four dashes are e28092 e28093 e28094 e28095 +‒ – — ― ━ ー --.r -.r 、 ~.r ~.r ‧.r : [[]] or ({@Xca-} & (({EBx+} & Xd+) or Xc-)) or (Wd- & W+) @@ -11665,9 +12205,14 @@ % The percent sign following a number (also basis pt, per mil) % Also -- see above, for handling of 12ft. 12in. not just 12% -"%" ‰ ‱ : (ND- & {DD-} & & ) or (ND- & (OD- or AN+)); +% AN- & Jp-: "... the concentration in v/v %" +"%" ‰ ‱ : + (ND- & {DD-} & & ) + or (ND- & (OD- or AN+)) + or ({E- or EA-} & A+) + or (AN- & Jp-); -% See also /en/words/currency for curency names that follow a number. +% See also /en/words/currency for currency names that follow a number. $ USD.c US$.c C$.c AUD.c AUD$.c HK.c HK$.c £ ₤ € ¤ ₳ ฿ ¢ ₵ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ₸ ₮ ₩ ¥ ៛ 호점 † †† ‡ § ¶ © ® ℗ № "#": @@ -11698,25 +12243,51 @@ % foo: F+; % ------------------------------------------------------------------------- -% Common typographical errors -% Asign a cost of 1.7 for no good reason. Feel free to change this. - -then.#than: [than.e]1.7; -than.#then-r: [then.r]1.7; -than.#then-i: [then.i]1.7; - -% rather_then: rather_than; - -there.#their: [their.p]1.7; -% theres.#theirs: [theirs.p]1.7; - -% there.#they're: [they're]1.7; -% all.#all_of: [all_of]1.7; +% Common typographical (spelling) errors +% +% Assign a cost of 1.65 for no very good reason. Feel free to change this. +% .. well, some of this is dangerous. For example, setting the cost too +% low causes correct "than" usages with Z link (and missing wall) to be +% priced higher than "typos" with "then". +% +% These perform a kind-of poor-man's rewriting. A more principled, +% elevated design would define and use an "operator grammar" type +% approach. 
See https://en.wikipedia.org/wiki/Operator_grammar + +then.#than: [than.e]0.65; +than.#then-r: [then.r]1.65; +than.#then-i: [then.i]1.65; +than.#then-ij: [then.ij]0.65; + +rather_then.#rather_than: rather_than; + +there.#their: [their.p]0.65; +% theres.#theirs: [theirs.p]0.65; + +% The apostrophes prevent the dictionary from being read in. +% there.#they're: [they're]0.65; +% all.#all_of: [all_of]0.65; +% its.#it's: [it's]0.65; +% lets.#let's: [let's]0.65; % Using "or" instead of "nor" is a common mistake. % "Neither this or that one will do" or.#nor-j-n: [nor.j-n]; +% Hmm. "there lie the books" smells like present-tense: the books are +% there right now. "there lay the books" suggest past-continuous: they +% are not just there now, but always have been. So is it really a typo +% to say "lie" instead of "lay"? +lie.#lay-v-d: [lay.v-d]0.5; + +% i before e unless the weighty neighbor is a German loan word. +beleive.#believe-v: [believe.v]0.1; + +% Common shorts +thru.#through-r: [through.r]0.05; +nite.#night: [night.r or night.u or night.i or night.n]0.05; +tonite.#tonight: [tonight]0.05; + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Morphology guessing for unknown words. 
% Given a particular kind of ending to a word, try to guess @@ -11729,7 +12300,7 @@ % ING-WORDS.g: ( & ) or ( & ) % or or ; -ING-WORDS.g: +.g: [[( & )] or [( & )] or [({@E- or EA-} & A+)] or @@ -11743,7 +12314,7 @@ AN+ or {AN-})]0.1; -ED-WORDS.v-d: +.v-d: [ (( & ()) or ( & ([]0.2 or ())) or @@ -11751,13 +12322,13 @@ ) or or ]0.1; -S-WORDS.v: [ +.v: [ (( & ()) or ( & ([]0.2 or ())) or (() & ) or ) ]0.1; -S-WORDS.n: +.n: [( & (({NM+ or Dmc-} & & ( or Bpm+)) or ({NM+ or Dmc-} & ) or @@ -11766,7 +12337,7 @@ Up-)) or [[AN+]]]0.1; -LY-WORDS.e: +.e: [{EE- or EF+} & ( ({Xd- & Xc+} & MVa-) or Em+ @@ -11784,36 +12355,36 @@ % guessed nouns that can be mass or countable % (-in, -ine, -ion, -yl, -ose, -ol, -ide, -ity) -MC-NOUN-WORDS.n: +.n: []0.1; -% guessed nouns that are signular countable (-on, -or) -C-NOUN-WORDS.n: +% guessed nouns that are singular countable (-on, -or) +.n: []0.1; % guessed adjectives (-ous, -ar, -ic) -ADJ-WORDS.a: - []0.1; +.a: + [ or ]0.1; % guessed adjectives/adverbs suffixed by "fold" with or without hyphen -FOLD-WORDS: +: [({EN-} & (MVp- or EC+ or A+)) or Em+]0.1; % latin (postposed) adjectives considered as mass nouns % in the current version (is this right???) -LATIN-ADJ-WORDS.a: +.a: []0.1; % latin (postposed) adjectives or latin plural noun % always considered as nouns in the current version % XXX maybe should be same as words.n.2.x instead of ??? -LATIN-ADJ-P-NOUN-WORDS: +: []0.1; % latin (postposed) adjectives or latin singular noun % always considered as nouns in the current version % XXX this is with weird plural-like stuff ?? is this right? -LATIN-ADJ-S-NOUN-WORDS: +: [ & (AN+ or ({NM+ or D*u-} & & ( or )) @@ -11825,7 +12396,7 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Guessing of unknown words, if none of the above rules applied. 
-UNKNOWN-WORD.n: +.n .n: & (AN+ or ({NM+ or ({Jd-} & D*u-)} & & ( or )) @@ -11835,12 +12406,13 @@ or (GN+ & (DD- or [()])) or U-); -UNKNOWN-WORD.v: - {@E-} & ((Sp- & ) or (RS- & Bp-) or (I- & ) or ({Ic-} & Wa- & )) & {O+ or } & {@MV+}; +.v .v: + {@E-} & ((Sp- & ) or (RS- & Bp-) or (I- & ) or ({Ic-} & Wa- & )) & {O+ or } & ; -% Add a miniscule cost, so that the noun-form is prefered, when -% availble. -UNKNOWN-WORD.a: []0.04; +% Add a miniscule cost, so that the noun-form is preferred, when +% available. +.a .a: + [ or ]0.04; % These are the link-types that are not subject to the length limit. % Always use "+" for these. Some of these are obvious. Some deserve @@ -11851,9 +12423,18 @@ % many others, the object of this sentence" requires a long O link % from "puts" to "object". % +% VJ+ is unlimited because some sentences have very long clauses: +% "He obtained the lease of the manor of Great Burstead Grange (near +% East Horndon) from the Abbey of Stratford Langthorne, and purchased +% the manor of Bayhouse in West Thurrock." +% UNLIMITED-CONNECTORS: - S+ & O+ & CO+ & C+ & Xc+ & MV+ & TH+ & W+ - & RW+ & Xp+ & Xx+ & CP+ & SFsx+ & WV+ & CV+; + S+ & O+ & CO+ & C+ & Xc+ & MV+ & TH+ & W+ & + RW+ & Xp+ & Xx+ & CP+ & SFsx+ & WV+ & CV+ & + VJ+ & SJ+; + +% The YS, YP, PH and ZZZ connectors can never be longer than one. +LENGTH-LIMIT-1: YS+ & YP+ & PH+ & ZZZ+; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Extensions by Peter Szolovits, psz@mit.edu, as a part of the work for @@ -11863,7 +12444,7 @@ % subject_to status_post in_conjunction_with sensu - in_relation_to neath amidst across_from circa astride + in_relation_to neath amidst across_from circa ca. c. 
astride previous_to together_with as_regards s/p aka amongst unto apropos_of w.i W.i: ({JQ+} & (J+ or Mgp+) & ) or (MVp- & B-); diff -Nru link-grammar-5.3.16/data/en/4.0.dict.m4 link-grammar-5.5.0/data/en/4.0.dict.m4 --- link-grammar-5.3.16/data/en/4.0.dict.m4 2017-02-12 09:48:12.000000000 +0000 +++ link-grammar-5.5.0/data/en/4.0.dict.m4 2018-04-27 00:13:40.000000000 +0000 @@ -19,8 +19,8 @@ % % %***************************************************************************% -% Dictionary version number is 5.3.15 (formatted as V5v3v15+) -: V5v3v15+; +% Dictionary version number is 5.5.0 (formatted as V5v5v0+) +: V5v5v0+; : EN4us+; % _ORGANIZATION OF THE DICTIONARY_ @@ -126,10 +126,28 @@ % noun-main-s -- singular % XXX FIXME: is often used with and sub % has a R+ & B+ on it. The problem here is that R+ & B+ should not -% be used with the J- here. This needs to be refactored to prevent -% this, or at least, cost it in some way. +% be used with the J- and sometimes not with the O-. So, for example, +% "I still remember the room I kissed him" - room is the object, and +% the "I" should not be relative. +% This needs to be refactored to prevent this, at least for J. How to +% handle the R+ & B+ on inappropriate O- is less clear. +% +% (Js- & {Mf+}): Allows constructions involving "of" to link locally, +% e.g. "Got it from the Abbey of Stratford Langthorne" +% links "of" to "Abbey" instead of "it". +% +% Ss*s+: blocks links to Ss*t- which accepts singular subject, but +% requires plural object. See "are.v" for examples. +% : - (Ss+ & ) or SIs- or Js- or Os- + (Ss*s+ & ) or SIs- or (Js- & {Mf+}) or Os- + or + or ; + +% noun-main-e: used for proper names (named entities) +% Os*e- is used to allow certain adjectival uses. +: + (Ss*s+ & ) or SIs- or (Js- & {Mf+}) or Os*e- or or ; @@ -204,7 +222,7 @@ % [@AN-].1: add a tiny cost so that A- is preferred to AN- when there % is a choice. 
The is because some nouns are also listed as adjectives, % and we want to use the adjective version A- link in such cases. -% [@AN- & @A-] has cost so that G links are prefered. +% [@AN- & @A-] has cost so that G links are preferred. % {[@AN-].1} & {@A- & {[[@AN-]]}}; : (@A- & {[[@AN-]]}) @@ -263,9 +281,6 @@ : TOo+ & IV+; : TOt+ & B+; -% Connects verb to coordinating conjunction. -: VC+; - : Ce+ & CV+; : Cet+ & CV+; : Cs+ & CV+; @@ -278,16 +293,19 @@ % Fronted prepositional and participle phrases, used with % subject-object inversion. Wp connect to preps only. -: [Wp- & PF+]; +% Why is there a cost? How big should the cost be? +% PFd+: prevent links to PFt- +: [dWp- & (dPFb+ or dPFd+)]0.1; % The use of COa here needs to be carefully re-examined; it is used much too freely. % COa+ is used to block links to COd- -% Xc+ & Ic+: connect to imperatives (infinitve verbs): "Anyhow, don't" +% Xc+ & Ic+: connect to imperatives (infinitive verbs): "Anyhow, don't" % Wc- & Xc+ & Qd+: subject-object inversion: "anyhow, am I right?" +% This gets a fairly stiff cost if the comma is missing. : {[[Wa-]]} & ((Xc+ & Ic+) or - (Wc- & (Xc+ or [()]) & Qd+) or + (Wc- & (Xc+ or [()]1.2) & Qd+) or ({Xd-} & (Xc+ or [[()]]) & [COa+])); % Just pure singular entities, no mass nouns @@ -299,10 +317,10 @@ % % We do NOT tag these with , a this messes up first-word % processing in tokenize.c. So for example, we do *not* want "There" -% in "There they are" tagged as an entity, just because its capitalized. +% in "There they are" tagged as an entity, just because it's capitalized. % We really do want to force the lower-case usage, because the lower case -% is in the dict, and its the right word to use. (The only entities that -% should be tagged as such are those that are in the dicts, in thier +% is in the dict, and it's the right word to use. (The only entities that +% should be tagged as such are those that are in the dicts, in their % capitalized form, e.g. 
"Sue.f" female given name as opposed to "sue.v" % verb in the sentence "Sue went to the store.") % @@ -316,7 +334,7 @@ % % MX+ & : country names: "...went to Paris, France" % -INITIALS : + : ({NM+} & ({G-} & {[MG+]} & (({DG- or [[GN-]] or [[@A- & @AN-]] or [[{@A-} & {D-}]] or ({@A-} & Jd- & Dmc-)} & (( & (JG- or )) @@ -329,17 +347,17 @@ or ; % As above, but with a tiny extra cost, so that a dictionary word is -% prefered to the regex match (i.e. for a common noun starting a +% preferred to the regex match (i.e. for a common noun starting a % sentence). However, the other regex matches (e.g. MC-NOUN-WORDS) % should have a cost that is even higher (so that we take the % capitalized version before we take any other matches.) -CAPITALIZED-WORDS: []0.05; +: []0.06; % Capitalized words that seem to be plural (by ending with an s, etc) % -- But not all words that end with an 's' are plural: % e.g. Cornwallis ... and some of these can take a singular determiner: % "a Starbucks" -PL-CAPITALIZED-WORDS: +: ({NM+} & {G-} & {[MG+]} & (({DG- or [[GN-]] or [[{@A-} & ({Dmc-} or {Ds-})]] or ({@A-} & Jd- & Dmc-) } & ([ & (JG- or )] @@ -361,7 +379,7 @@ % "Tom" is a given name, but can also be a proper name, so e.g. % "The late Mr. Tom will be missed." which needs A-, D- links % Wa-: A single exclamation: "Tom! Hey, Tom! Oh, hello John!" -% is trikcy when used with [[...]] connectors. +% is tricky when used with [[...]] connectors. 
% Careful for bad parses of % "This is the dog and cat Pat and I chased and ate" % "actress Whoopi Goldberg and singer Michael Jackson attended the ceremony" @@ -371,7 +389,7 @@ : {G-} & {[MG+]} & (({DG- or [GN-]2.1 or [[{@A-} & {D-}]]} & - (({@MX+} & {NMr+} & (JG- or or )) + (({@MX+} & {NMr+} & (JG- or or )) or YS+ or YP+)) or AN+ @@ -431,7 +449,7 @@ or Us- or ({D*u-} & Wa-)); -GREEK-LETTER-AND-NUMBER pH.i x.n: ; + pH.i x.n: ; % Same as pattern used in words.n.4 -- mass nouns or countable nouns : ; @@ -454,24 +472,9 @@ or Up- or ({Dmc-} & Wa-)); -%for YEAR-DATE year numbers -: - NMd- - or ({EN-} & (NIfn+ or NItn-)) - or NN+ - or AN+ - or Wa- - or ((Xd- & TY- & Xc+) or TY-) - or ({EN- or NIc-} - & (ND+ - or OD- - or ({{@L+} & DD-} - & ([[Dmcn+]] - or (( or TA-) & (JT- or IN- or [[]])))))); - % Number abbreviations: no.x No.x % pp. paragraph, page art article -% RR roural route +% RR rural route No.x No..x no.x no..x Nos.x Nos..x nos.x nos..x Nr.x Nr..x Nrs.x Nrs..x nr.x nr..x nrs.x nrs..x Num.x Num..x num.x num..x pp.x pp..x @@ -503,7 +506,8 @@ Blvd.y Blvd..y Pkwy.y Pkwy..y Hwy.y Hwy..y AG.y Assn.y Assn..y Corp.y Corp..y Co.y Co..y Inc.y Inc..y PLC.y -Pty.y Pty..y Ltd.y Ltd..y LTD.y Bldg.y Bldg..y and_Co GmBH.y: +Pty.y Pty..y Ltd.y Ltd..y LTD.y Bldg.y Bldg..y and_Co GmBH.y +et_al et_al. : ({[X-]} & G-) & {[[Xi+]]} & {[MG+]} & (({DG- or [[GN-]] or [[{@A-} & {D-}]]} & (({@MX+} & (JG- or )) or @@ -515,8 +519,12 @@ % Titles, e.g. Joe Blow, Esq. or Dr. Smarty Pants, Ph.D. -% Gack. See absurdely large collection at: +% Gack. See absurdly large collection at: % http://en.wikipedia.org/wiki/List_of_post-nominal_letters +% The generic case of strings of all-uppercase are handled +% by the regexes ALL-UPPER and INITIALS. The below are mixed-case +% or other very common degrees. 
+.y .y Jr.y Jr..y Sr.y Sr..y Esq.y Esq..y AB.y A.B..y AIA.y A.I.A..y BA.y B.A..y BFA.y B.F.A..y BS.y B.S..y BSc.y B.Sc..y @@ -541,8 +549,9 @@ G+); % The generic category for strings containing a hyphen -PART-NUMBER.n -HYPHENATED-WORDS.n: +.n +.n +.n: [[({@AN-} & {@A-} & (({NM+ or D-} & (( & ( or )) @@ -566,7 +575,7 @@ % % ({NMa+} & AN+): He takes vitamin D supplements. % -% XXX TODO fixme: there are many gerund-like nouns in here (e.g. "reading") +% XXX TODO FIXME: there are many gerund-like nouns in here (e.g. "reading") % which screw things up when linking to "be" (e.g. "I have to be reading now") % by appearing as objects (O-) connector when really the verb form (Pg-) % is what should be happening. So rip these words out... (similar remarks for @@ -632,7 +641,7 @@ /en/words/words.n.2.s : or ; -PL-GREEK-LETTER-AND-NUMBER: ; +: ; % plural nouns not ending in "s" % almost exactly identical to except that there is @@ -688,6 +697,7 @@ % If PH is possible, then it is preferred. See PH below for explanation. : [PHc-]-0.1 or (); : [PHv-]-0.1 or (); +: {PH-}; : @@ -701,9 +711,9 @@ % nouns that are mass only % absolutism.n absorption.n abstinence.n abundance.n academia.n -/en/words/words.n.3-vowel: +/en/words/words.n.3-vowel /en/words/words.lang.3-vowel: or ; -/en/words/words.n.3-const: +/en/words/words.n.3-const /en/words/words.lang.3-const: or ; % Gonna treat these as mass nouns, not sure if this is correct. @@ -720,7 +730,7 @@ % allow these to take AN+ links (we want to have red.a get used instead). % But we do need these as nouns, so as to parse 'she prefers red'. % However, assign a cost, so that 'her shoes are red' gets red.a (with -% the Pa link) perfered over red.n (with the O link). +% the Pa link) preferred over red.n (with the O link). % % Doesn't seem to need a noun-and-x to make this work ... % In other respects, these are kind-of-like mass nouns... 
@@ -748,7 +758,7 @@ or Wa-; % SINGULAR ENTITIES FOR ENTITY EXTRACTION -% This must appear after other categories so it doesnt interfere with those. +% This must appear after other categories so it doesn't interfere with those. /en/words/entities.national.sing: or ; @@ -841,6 +851,7 @@ or Us- or ({Ds-} & Wa-)); +% ----------------------------------------------------------------- % determiner constructions, with a dangling of: "a number of", "a lot of" % "I have a number of cookies" % "a pride of lions" "a litter of kittens" all take determiners @@ -873,33 +884,48 @@ or ({Ds**c-} & ) or Us-)); -% This gets a cost, so that the {Jd-} link for measures.1 is prefered. +% This gets a cost, so that the {Jd-} link for measures.1 is preferred. kind_of: [] or EA+ or EE+ or Wa-; -% This gets a cost, so that the {Jd-} link for measures.1 is prefered. +% This gets a cost, so that the {Jd-} link for measures.1 is preferred. type_of sort_of breed_of species_of: [] or [Us-] or [Wa-]; -% This gets a cost, so that the {Jd-} link for measures.2 is prefered. +% This gets a cost, so that the {Jd-} link for measures.2 is preferred. kinds_of types_of sorts_of breeds_of species_of: [{{@AN-} & @A-} & U+ & (({Dmc-} & & ( or )) or ({Dmc-} & ) or Up-)]; -percent.u: +% A+: "It has twice the percent value" +percent.u parts.u: ( & ((ND- & {DD-} & & ( or B*x+)) or + or (ND- & {DD-} & ) or - U-)) or - (ND- & (OD- or AN+ or YS+)); + U-)) + or (ND- & (OD- or AN+ or YS+)) + or ({E- or EA-} & A+); + +% Weight and volume in chemistry +% "An example would be a 5% weight per volume NaCl solution." +weight.u volume.u: + AN- & Mp+ & AN+; + +% "It is a 5%(w/v) NaCl solution." +% "We measure the concentration in v/v %" +v/v vol/vol volume/volume w/v weight/vol weight/volume: + ({{Xd- & Xc+} & AN-} & AN+) + or EA+; +% ----------------------------------------------------------------- % This set of disjuncts should probably be split up and refined. 
% "shame.n", "crux.n" are here because they need the Ss*t connector % to pick up "that" in "The crux of it is that we must act first." @@ -971,7 +997,7 @@ proof.n doubt.n suspicion.n hope.n knowledge.n relief.n disclosure.n fear.n principle.n concern.n philosophy.n risk.n threat.n conviction.n theory.n speculation.n news.n belief.n contention.n thought.n myth.n -discovery.n rumor.n probability.n fact.n feeling.n comment.n +discovery.n rumor.n probability.n fact.n feeling.n comment.n process.n perception.n sense.n realization.n view.n consensus.n notification.n rule.n danger.n warning.n suggestion.n: ( & ( @@ -1098,7 +1124,7 @@ ; % Regex-based guessing of unknown words, ending in -ity -acy -ance -NOUN-TO-WORDS.n: +.n: ; % Nouns formerly classified as mass nouns (words.n.3) but can take "to" @@ -1114,9 +1140,9 @@ refusals.n pledges.n urges.n missions.n rights.n desires.n needs.n ambitions.n capacities.n mandates.n promises.n abilities.n options.n commitments.n intents.n opportunities.n plans.n plots.n -responsibilities.n chances.n campaigns.n offers.n pressures.n obligations orders.n -temptations.n vows.n permits.n impetuses.n proclivities.n propensities.n moves.n -votes.n bills.n incentives.n: +responsibilities.n chances.n campaigns.n offers.n pressures.n +obligations orders.n temptations.n vows.n permits.n impetuses.n +proclivities.n propensities.n moves.n votes.n bills.n incentives.n: ( & (({{Jd-} & Dmc-} & {@M+} & {( or (R+ & Bp+)) & {[[@M+]]}} & {@MXp+} & ( or @@ -1276,10 +1302,12 @@ % ==================================================================== %PRONOUNS +% MXs+: "he, the shop owner, ..." she he: - {[[R+ & Bs+]]} & ((Ss+ & ) or SIs- or SJls+); + {[[R+ & Bs+]]} & (({MXs+} & Ss+ & ) or SIs- or SJls+); % The E- is for "It's either us or them" ... not ideal, but OK +% See also me.p below. 
me him: J- or Ox- or ({[[E-]]} & SJl+) or SJr-; @@ -1326,11 +1354,13 @@ % yisser yousser ye'r: Irish English second-person possessive -- % https://en.wikipedia.org/wiki/Irish_English -its my.p your their.p our thy yisser.p yousser ye'r: +its my.p your their.p our thy.p yisser.p yousser ye'r: DP+ or ({AL-} & {@L+} & (D+ or DD+)); +% Possessive version of me % Cost on D, DD: avoids use as determiner on "Make me coffee" +%suppress: DUP-BASE (for me.p) me.p: DP+ or [{AL-} & {@L+} & (D+ or DD+)]; @@ -1365,35 +1395,39 @@ % (Osm- & {@M+}): "is this it?" "Do you have it ready?" % "leave it alone" "make it rain" % (the m prevents links as indirect object) +% MX+ & Ss+: "it -- fried rice -- is very popular" it: [J-] or (Osm- & {@M+}) or (Jd- & Dmu- & Os-) or (Jd- & Dmu- & {Wd-} & S+) - or ((Ss+ or SFsi+) & ) + or ({MX+} & (Ss+ or SFsi+) & ) or SIs- or SFIsi- or OXi- or Vp- or SJls+ or SJrs-; % O*c: "we will go faster than they" +% MXp+: "they, the twins, did it" they: - ({[[R+ & Bp+]]} & ((Sp+ & ) or SIp-)) or + ({[[R+ & Bp+]]} & (({MXp+} & Sp+ & ) or SIp-)) or Ox- or SJlp+ or SJrp-; % DD+: "... how we two should work together" +% MXp+: "We, the undersigned, ..." we: - (Sp+ & ) + ({MXp+} & Sp+ & ) or Ox- or DD+ or SIp- or SJlp+ or SJrp-; -% XXX why is this marked plural (Sp, SIp) ?? a cheap hack to make I've work? +% XXX why is this marked plural (Sp, SIp) ?? a cheap hack to make "I've" work? % We use here to prevent lower-case magnling by % the tokenizer. % SJrp-: allows only "...and I", disallows "I and ..." +% MXs+: "I, Joseph, rang the bell" I.p: - or ((Sp*i+ or SX+) & ) + or ((({MXs+} & Sp*i+) or SX+) & ) or SIp*i- or SJr- or SJl+ @@ -1463,7 +1497,7 @@ or DTn+ or Wa-; -% [[]] costs so that ditranstive verbs don't suffer: +% [[]] costs so that ditransitive verbs don't suffer: % "I taught these mice to jump", taught is ditransitive, we don't want % "these" to be the object. 
See also "those" % (Jd- & Dmu- & Op-): "I gave him a number of these" @@ -1476,7 +1510,7 @@ or or Wa-; -% [[]] costs so that ditranstive verbs don't suffer, +% [[]] costs so that ditransitive verbs don't suffer, % and get the D+ link instead of the O- link. % See also "these" those: @@ -1521,14 +1555,16 @@ % A naked costs more than one with other links, % so that ditransitive verbs don't get spurious links to all.a -% XXX can this be tighetend up?? -% costs no mater what, so that Ofd+ is prefered. +% XXX can this be tightened up?? +% costs no mater what, so that Ofd+ is preferred. % [E+]0.5: all modifying a verb probably is not right. +% Wa-: "All the people!" as a response to a question. all.a: ({EN-} & ( [E+]0.5 or Dm+ or NIc+ + or (ALx+ & (Jp+ or Ju+) & Wa-) or ( (@M+ or (ALx+ & (Jp+ or Ju+)) or (R+ & B+) or EL+) & []) @@ -1595,7 +1631,7 @@ : PHc+ or [()]0.2; : PHv+ or [()]0.2; % -% XXX why doesn't this clash with a.eq ?? +% See also a.eq below. a: ({(AA- & HA-) or ALa- or [[Lf+]]} & (Ds**c+ or ( & Ds**x+))) or NN+ or NSa+ or NIm+; @@ -1626,7 +1662,7 @@ % [[ & ]] or or Wa-; -% NNumeric modifier: "a couple of thousand dollars" +% Numeric modifier: "a couple of thousand dollars" a_couple_of: NN+ or ND+ or NIn+; @@ -1685,7 +1721,7 @@ or MVa-; % "a part.n" should cover most cases. Perhaps [[OF+ & ]] should be -% reomved?? Anyway, its costed to give OFd+ priority. Likewise, should probably +% removed?? Anyway, it's costed to give OFd+ priority. Likewise, should probably % retire as well, right? part.i: (OFd+ & Dm+) @@ -1708,7 +1744,7 @@ or [[]] or Wa-; -% costly so that OFd+ is prefered. +% costly so that OFd+ is preferred. rest.i: [[DD- & ]]; @@ -1869,7 +1905,8 @@ % For number, cost is added to the roles to prioritize % postmodifier and numeric determiner roles. 
- +% [[A+]]: "the five seat washed out" +% two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty twenty-one twenty-two twenty-three twenty-four @@ -1891,6 +1928,7 @@ oh-one oh-two oh-three oh-four oh-five oh-six oh-seven oh-eight oh-nine o-one o-two o-three o-four o-five o-six o-seven o-eight o-nine zero-one zero-two zero-three zero-four zero-five zero-six zero-seven zero-eight zero-nine +four-score several: NA+ or NMw- or @@ -1901,7 +1939,8 @@ ({{@L+} & DD-} & (Dmcn+ or ( & []))))) or (NA- & { & }) or (NA- & Xd- & TY- & Xc+) - or Wa-; + or Wa- + or [[A+]]; oh.zero: (NA- & NA+); zero.n: (NA- & NA+) or NN+ or Ds+ or ( & ) or Wa-; @@ -1914,7 +1953,7 @@ % as effectively Dmcn and ND are the "same thing" more or less. % % ({ND+} & NIfn+) or (NItn- & {ND+}): "between 7:30AM and 9:30AM" -NUMBERS FRACTION: + : NMn- or ({EN-} & (({ND+} & NIfn+) or (NItn- & {ND+}))) or NN+ @@ -1930,17 +1969,17 @@ % HMS-TIME consists of HH:MM:SS(AM|PM) type expressions % and should probably have a narrower set of parse patterns than numbers in % general. e.g. should not have EQ links XXX todo -- fix this. -HMS-TIME: NUMBERS & {TZ+}; +: & {TZ+}; % Allowing postposed roman numerals only for now. % e.g "Pope Pious XII" -ROMAN-NUMERAL-WORDS.rn: +.rn: NMr- or ((Wd- or NMr-) & NIa+); % nouns that look like roman numerals. Limited requirements to avoid % excessive ambiguity. -ROMAN-NUMERAL-WORDS.n: {@MX+} & (); +.n: {@MX+} & (); % NMa-: Allow post-posed letter modifiers: e.g. "Vitamin A" % Wd- & NIa+: Allow numbered, bulleted lists: "B: Press button firmly" @@ -1951,30 +1990,61 @@ A.id B.id C.id D.id E.id F.id G.id H.id J.id K.id L.id M.id N.id O.id P.id Q.id R.id S.id T.id U.id V.id W.id X.id Y.id Z.id: - or NMa- - or ((Wd- or NMa-) & NIa+) + or [NMa-]0.5 + or [(Wd- or NMa-) & NIa+]0.5 or Ju-; % Avoid having I.id interfere with pronoun I. I.id: [[ or NMa- or (Wd- & NIa+)]]; +% Variables: "suppose that X or Y is responsible." 
+J.n K.n L.n M.n N.n P.n Q.n R.n S.n T.n V.n W.n X.n Y.n Z.n: + + or + or ; + +% Given-name initials: +A. B. C. D. E. F. G. H. I. J. K. L. M. N. +O. P. Q. R. S. T. U. V. W. X. Y. Z. : + + or ({G-} & G+); + % Days of month % Note, however, this over-rides generic numbers in this range % and so is a superset of the generic numbers disjuncts % The following should match NUMBERS with the addition of "or TM-". 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31: -NUMBERS or TM- or [[G+]]; + or TM- or [[G+]]; % Ordinals - day-of-month expressions. -% Used only in espressions such as "December 2nd" +% Used only in expressions such as "December 2nd" % Must use regex here as well, to avoid conflict with other regexes first.ti second.ti third.ti fourth.ti fifth.ti sixth.ti seventh.ti eighth.ti ninth.ti tenth.ti eleventh.ti twelfth.ti thirteenth.ti fourteenth.ti fifteenth.ti sixteenth.ti seventeenth.ti eighteenth.ti nineteenth.ti twentieth.ti twenty-first.ti twenty-second.ti twenty-third.ti twenty-fourth.ti twenty-fifth.ti twenty-sixth.ti twenty-seventh.ti twenty-eighth.ti twenty-ninth.ti thirtieth.ti -thirty-first.ti DAY-ORDINALS.ti: TM-; +thirty-first.ti .ti: TM-; + +% For YEAR-DATE year numbers +% AN+ is given a cost, because attaches incorrectly to +% measurements of various kinds, where the number is not actually a +% date, and ND is the preferred linkage. +% This needs cleanup, I think ...!? +: + NMd- + or ({EN-} & (NIfn+ or NItn-)) + or NN+ + or [AN+] + or Wa- + or ((Xd- & TY- & Xc+) or TY-) + or ({EN- or NIc-} + & (ND+ + or OD- + or ({{@L+} & DD-} + & ([[Dmcn+]] + or (( or TA-) & (JT- or IN- or [[]])))))); % Years w/o apostrophe: e.g. 47 Ford Fairlane or 57 Chevy 01 02 03 04 05 06 07 08 09: or [[G+]]; @@ -1986,11 +2056,11 @@ 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99: - NUMBERS or or [[G+]]; + or or [[G+]]; % the DECADE-DATE regex matches 1950s 1950's 1950’s etc. 
% A+: It's an old 50's love song -DECADE-DATE + '00s '10s '20s '30s '40s '50s '60s '70s '80s '90s ‘00s ‘10s ‘20s ‘30s ‘40s ‘50s ‘60s ‘70s ‘80s ‘90s 00's 10's 20's 30's 40's 50's 60's 70's 80's 90's: @@ -1999,7 +2069,7 @@ % year numbers % 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 -YEAR-DATE: NUMBERS or or [[G+]]; +: or or [[G+]]; % Years: e.g. '47 Ford Fairlane or '57 Chevy '00 '01 '02 '03 '04 '05 '06 '07 '08 '09 @@ -2091,18 +2161,20 @@ & (ND+ or NIn+ or OD- or ({DD-} & ([[Ds+]] or Dmcn+ or ( & ))))); -and_a_half: (NW- or NSn-) & -(NNy+ or ({EN- or NIc-} & (ND+ or NIn+ or ({DD-} & -(Dmcn+ or ( & )))))); +and_a_half: + (NW- or NSn- or ND-) & + (NNy+ or ({EN- or NIc-} & (ND+ or NIn+ or ({DD-} & + (Dmcn+ or ( & )))))); quarter.i: -NS- & {EN-} & (NF+ or ( & )); + NS- & {EN-} & (NF+ or ( & )); thirds.m fourths.m quarters.m fifths.m sixths.m sevenths.m eighths.m ninths.m tenths.m: -NW- & {EN-} & (NF+ or ( & )); + NW- & {EN-} & (NF+ or ( & )); -first.a: L- or Pa- or E+ or MVa- or ({Xc+ & {Xd-}} & CO+) or A+ or [Jp-] or -TT+ or ((DD- or [[NSa-]]) & & {} & ); +first.a: + L- or Pa- or E+ or MVa- or ({Xc+ & {Xd-}} & CO+) or A+ or [Jp-] or + TT+ or ((DD- or [[NSa-]]) & & {} & ); last.a dead_last dead_fucking_last DFL: L- @@ -2115,8 +2187,9 @@ or A+ or [Jp-]; -second.a: L- or Pa- or MVa- or ({Xc+ & {Xd-}} & CO+) or -(DD- & & {} & ) or NR+ or A+; +second.a: + L- or Pa- or MVa- or ({Xc+ & {Xd-}} & CO+) or + (DD- & & {} & ) or NR+ or A+; % This uses the L link for superlatives, but leads to strange parses: % "We celebrated their eleventh anniversary" parses differently @@ -2172,7 +2245,7 @@ % prefer G+ over A+ in general, as these are typically parts of names. 
% ({Ds-} & AJla+): "he is in either the X or the Y battalion" 1º.a 2º.a 3º.a 4º.a 5º.a 6º.a 7º.a 8º.a 9º.a -DAY-ORDINALS.a ORDINALS.a: +.a .a: Pa- or MVa- or ({Xc+ & {Xd-}} & CO+) or @@ -2222,7 +2295,7 @@ ninety-first.ord ninety-second.ord ninety-third.ord ninety-fourth.ord ninety-fifth.ord ninety-sixth.ord ninety-seventh.ord ninety-eighth.ord ninety-ninth.ord -DAY-ORDINALS.ord ORDINALS.ord : +.ord .ord : (Wd- & {M+} & Ss*o+); % TODO: un-parenthesized cases, e.g. @@ -2230,6 +2303,7 @@ % - the strength was in the order of gerE > cotD > yfhP P2 > yfhP P1 % also remember "-->" +%suppress: DUP-BASE (for a.eq) A.eq B.eq C.eq D.eq E.eq F.eq G.eq H.eq I.eq J.eq K.eq L.eq M.eq N.eq O.eq P.eq Q.eq R.eq S.eq T.eq U.eq V.eq W.eq X.eq Y.eq Z.eq a.eq b.eq c.eq d.eq e.eq f.eq g.eq h.eq i.eq j.eq k.eq l.eq m.eq @@ -2241,7 +2315,7 @@ or_so: ND- & {{@L+} & DD-} & (Dmcn+ or ( & )); % Allows parsing of "dollars per day" or "mL/sec" but is somewhat -% inconsistent with the equation persing otherwise described below. +% inconsistent with the equation parsing otherwise described below. % XXX overall, eqn parsing could be strengthened. per "/.per": Us+ & Mp-; @@ -2260,7 +2334,7 @@ % : links verb to wall or to controlling phrase. % : links verbs to singular subjects % : links verbs to plural subjects -% : links to infinitve +% : links to infinitive % : to plural subjects or infinitives % : to singular or plural subject % : to past-participles @@ -2274,18 +2348,20 @@ % IV connects infinitives to the head-verb % VC connects the head-word to a subsequent coordinating conjunction. % -% There are some other such connectors that don't quite fit this patten: -% AF, Z, and in many cases B (for example TOt+ & B+) for this reason, we -% have to have a costly null [[()]] below, although we would really really +% There are some other such connectors that don't quite fit this pattern: +% AF, Z, and in many cases B (for example TOt+ & B+). 
For this reason, we +% have to have a costly null [()] below, although we would really really % like to get rid of it. But that would take a lot of Z and B and AF link % fiddling about, so we have to live with this for now. % % Also: CP-, Eq+ and COq+ all connect to verbs, and are so disjoined % with % -: ((dWV- or dCV- or dIV-) & {VC+}) or [[()]]; +: ((dWV- or dCV- or dIV-) & {VC+}) or [()]; % : (dWV- or dCV- or dIV-) & {VC+}; +: {@MV+} & {VC+}; + % When we are done, remove the option costly NULL below. : hWV+ or [[()]]; % : hWV+; @@ -2294,7 +2370,7 @@ % Pv- & no wall: "John felt vindicated" % The problem here is that for passives (i.e. to-be), The Pv should get the wall % but in the other cases it should not. We could/should tighten this up by using -% Pvp+ on to-be, using Pvv for the others, and demaninds the wall only for Pvp. +% Pvp+ on to-be, using Pvv for the others, and demand the wall only for Pvp. % XXX FIXME, the above needs fixing. % % : PP- & WV-: "I have seen it". @@ -2303,16 +2379,16 @@ % XXX FIXME: for certain transitive verbs, we really want verb-ico to be % in the form (I- & B- & ) for example: "that I did not know". 
% -: {@E-} & ((Ss- & ) or (RS- & Bs-)); -: {@E-} & ((Sp- & ) or (RS- & Bp-)); -: {@E-} & ((S- & ) or (RS- & B-)); +: {@E-} & ((Ss- & {hPFt-} & ) or (RS- & Bs-)); +: {@E-} & ((Sp- & {hPFt-} & ) or (RS- & Bp-)); +: {@E-} & ((S- & {hPFt-} & ) or (RS- & B-)); : {@E-} & PP- & {}; : {@E-} & (Pg- or Mg-); % Pv- & OFj+: "knowledge was gained of the activities" -: {@E-} & ((Pv- & {} & {OFj+}) or Mv-) & {@MV+}; -: {@E-} & ((Pv- & {} & {K+}) or Mv-) & {@MV+}; -: {@E-} & ((Pv- & {}) or Mv-); +: {@E-} & ((Pv- & {hPFt-} & {} & {OFj+}) or Mv-) & ; +: {@E-} & ((Pv- & {hPFt-} & {} & {K+}) or Mv-) & ; +: {@E-} & ((Pv- & {hPFt-} & {}) or Mv-); : or ; % used only in "as " constructions, which behave @@ -2321,19 +2397,23 @@ % % Sa*v-: "He did it as expected" % Sa*v- & MXsr-: "the movie, as filmed, is too long" -% The cost on MXsr+ is to give MVs preference for +% The cost on MXsr+ is to give MVs preference for % "She sang well, as planned" -% {MV+}: "the man, as acquiescing as he was, set a bad precedent." -% I want to use MVz+ for above, but it doesn't work... +% {MVz+}: "the man, as acquiescing as he was, set a bad precedent." % Pv- & CV- & MXsr-: "The accused, as shall be proven, is innocent" +% S- & CV- & MXsr-: "The accused, as I suspected, is innocent" +% I*x- & CV- & MXsr-: "The accused, as I will show, is innocent" +% The x on I*x blocks I*v, so that Pv- is used. : - (Sa*v- or (Pv- & CV-)) & {Xd-} & {[MXsr-]0.1 & {MV+}} & {Xc+}; + ((Sa*v- or EAy- or ({E-} & Pv- & CV-)) & {Xd-} & {[MXsr-]0.1 & {MVz+}} & {Xc+}) + or ({E-} & S- & CV- & {Xd-} & MXsr- & {Xc+}) + or ({E-} & I*x- & CV- & {Xd-} & MXsr- & {Xc+}); % Cost: "He was driven home" should NOT use driven.v as adjective! % From what I can tell, can be used anywhere that % can be... except forr said.v-d : - ({@E-} & [A+]0.5) + (({@E-} or {@EA-} or {@AN-}) & [A+ or Pa-]0.5) or ; % Wi- & {NM+}: imperative numbered lists: "Step 5. Do this." @@ -2341,15 +2421,16 @@ % allows a really weird subject-object inversion to proceed: % e.g. 
"In the corner lay it" with it as object. That's just % wrong... but this requires lots of places to fix. -: {@E-} & I- & ; -: {@E-} & ((I- & {} & {@E-}) or ({[CO-]} & Wi- & {NM+})); +% VJrpi-: "I aim to do something and also to help." +: {@E-} & I- & ( or VJrpi-); +: {@E-} & ((I- & { or VJrpi-} & {@E-}) or ({[CO-]} & Wi- & {NM+})); : or ; -: {@E-} & PF- & {} & SI+; -: {@E-} & PF- & {} & SIp+; +: {@E-} & hPF- & {} & hSI+; +: {@E-} & hPF- & {} & hSIp+; % is meant to be a generic replacement in the naked B- in -% many transitive verb constructions. For quetions, we need to force +% many transitive verb constructions. For questions, we need to force % a verb-wall connector; this is what the (B*w- or B*m-) & % part does. For the other B- forms, we don't need the wall. To force % the wall, we just list all the others. @@ -2414,7 +2495,7 @@ % Iq-: "The big question is did he do it?" % Xd- & Iq-: "The big question is, did he do it?" : Rw- or ({{Xd-} & Iq-} & (Qd- or ((Qw- or Qe-) & ))) or [()]; -% Just like above, but no aux, shuld always be anded with I+. +% Just like above, but no aux, should always be anded with I+. % The idea here is that the verb on the other end of the I+ will % connect to the wall. : Rw- or ({{Xd-} & Iq-} & (Qd- or Qw- or Qe-)) or [()]; @@ -2428,7 +2509,7 @@ : {@E-} & (((S- or SFsi-) & ) or (RS- & B-)); : {@E-} & PPf- & ; : {@E-} & (Pgf- or Mg-); -: {@E-} & ((Pvf- & ) or Mv-) & {@MV+}; +: {@E-} & ((Pvf- & ) or Mv-) & ; : {@E-} & ((Pvf- & ) or Mv-); % These are the verb-form expressions for special verbs that can take @@ -2436,7 +2517,7 @@ % These are used almost exclusively with auxiliary verbs. % This is why they don't have & in them: we don't want the % auxiliary attaching to the wall, we want only the main verb doing this. -% The Ss- or Sp- prevent attachements to Sa- for "as.e" phrases. +% The Ss- or Sp- prevent attachments to Sa- for "as.e" phrases. 
: {@E-} & (Sp- or SFp- or If- or (RS- & Bp-) or Wi-); : {@E-} & (Ss- or SFs- or (RS- & Bs-)); : {@E-} & (Ss- or SFs- or SFu- or (RS- & Bs-)); @@ -2458,12 +2539,16 @@ % Almost identical to the above, except that the verb attaches to the % wall. We cannot use verb-s for this, since the SFsi prevents the parse % of sentences like "there appears to be a problem". +% If- blocks the Ix+ on would, be : {@E-} & (((Sp- or SFp- or If-) & ) or (RS- & Bp-) or Wi-); : {@E-} & (((Ss- or SFs-) & ) or (RS- & Bs-)); : {@E-} & (((Ss- or SFs- or SFu-) & ) or (RS- & Bs-)); : {@E-} & (((S- or SF- or PPf-) & ) or (RS- & B-)); : {@E-} & (((S- or SF-) & ) or (RS- & B-)); +% Almost like above, but without the RS- & B- +: {@E-} & (Ss- or SFs- or SFu-) & ; + % conjoined verbs. % VJr == left % VJl == right @@ -2478,10 +2563,10 @@ % The conjunction should take an object if both verbs are transitive, % e.g. "I saw and greeted Sue", which should parse as % "I (saw and greeted) Sue". -% VJ**i == intranstive +% VJ**i == intransitive % VJ**t == transitive % -% s == singluar, pl == plural, sp == singular or plural +% s == singular, pl == plural, sp == singular or plural % g == gerund : {@E-} & VJrs-; : {@E-} & VJls+; @@ -2544,7 +2629,7 @@ (($2) & ) or )) -% present tense, but allows transitive connectinos to 'and' +% present tense, but allows transitive connections to 'and' define(`VERB_S_T',`'VERB_x_T(, $1)) % past tense macro, intransitive variation @@ -2612,10 +2697,10 @@ % O+ & : "did" is not an auxiliary, and so needs the wall. : (( - or (O+ & ) + or (O+ & {@MV+} & ) or [[@MV+ & O*n+]] or Vd+ - or ({N+} & (CX- or [[()]]))) & {@MV+}) + or ({N+} & (CX- or [[()]]))) & ) or ({N+} & I*d+); % I used verb-and-sp-i but maybe verb-and-pl is better? @@ -2624,20 +2709,22 @@ % Naked I*d-: "How do you do?", "what is there to do?" % I*t- & O+: forces use of object in to-do expressions. % "Are you really going to do it to them?" 
+% { or VJrpi-}: "I aim to help and also to do something" do.v: ({@E-} & (Sp- or SFp- or (RS- & Bp-) or ({Ic-} & Wi-)) & ) or ( & ([] or ())) or ( & ) or ((SIp+ or SFIp+) & (( & I*d+) or CQ-)) - or ({@E-} & I*t- & O+ & {@MV+} & IV-) - or ({@E-} & I- & (( or O+ or [[@MV+ & O*n+]] or CX-) & {@MV+}) & {}) + or ({@E-} & I*t- & O+ & IV- & ) + or ({@E-} & I- & (( or O+ or [[@MV+ & O*n+]] or CX-) & ) & { or VJrpi-}) or ({@E-} & I- & CV-) or ({@E-} & I*d- & {}); +% Ss- & : "so it does!" % Ss- & & @MV+: "he does as he pleases." does.v: VERB_X_S() - or ({@E-} & Ss- & & @MV+) + or ({@E-} & Ss- & & ) or ((SIs+ or SFIs+) & (( & I*d+) or CQ-)); % Ss- & & @MV+: "he did as he pleased." @@ -2645,7 +2732,7 @@ did.v-d: ( & ) or ( & ) - or ({@E-} & Ss- & & @MV+) + or ({@E-} & Ss- & & ) or ( & ) or ( & ) or ((SI+ or SFI+) & (( & I*d+) or CQ-)); % @@ -2654,7 +2741,7 @@ % Os+ & @MV+: "I've done that chore many times" : - or (O+ & {@MV+}) + or (O+ & ) or [[@MV+ & O*n+]] or Vd+; @@ -2662,7 +2749,7 @@ done.v: VERB_PP() or - or ( & ) + or ( & {}) or (S- & ); % Pa- & Pg+: "I am done working", "I am through being mad" @@ -2671,7 +2758,7 @@ or (AJra- & {@MV+}) or ({@MV+} & AJla+); -doing.v: & (O+ or or [[@MV+ & O*n+]] or Vd+) & {@MV+}; +doing.v: & (O+ or or [[@MV+ & O*n+]] or Vd+) & ; doing.g: ((O+ or or [[@MV+ & O*n+]] or Vd+) & {@MV+} & ) or ; better.i fine.i ok.i okay.i OK.i poorly.i well.i: {EE-} & Vd-; @@ -2685,8 +2772,8 @@ or ({@E-} & {Ic-} & Wi- & {I*d+}); doesn't doesn’t: - (( & (SIs+ or SFIs+) & I*d+) or - ) & (I*d+ or or [[()]]); + (( & (SIs+ or SFIs+) & I*d+) or ) + & (I*d+ or or [[()]]); didn't.v-d didn’t.v-d: (( & (SI+ or SFI+)) or ) @@ -2707,7 +2794,7 @@ : ({@MV+} & ( or [TO+]) & ) or ((B- or (O+ & )) & {@MV+} & {[I*j+ or Pv+]}) - or (([[@MV+ & O*n+]] or CX- or [[()]]) & {@MV+}) + or (([[@MV+ & O*n+]] or CX- or [[()]]) & ) or ({N+} & PP+); have.v: @@ -2732,30 +2819,35 @@ ( & ( or ((O+ or ) & {@MV+} & {[I*j+ or Pv+]}) or - (([[@MV+ & O*n+]] or CX-) & {@MV+}))) or + (([[@MV+ & 
O*n+]] or CX-) & ))) or [[(SI*j+ or SFI**j+) & PP+ & ((Xd- & VCq- & Xc+) or VCq- or ({{Xd-} & Xc+} & COp+))]]; %we'd they'd I'd he'd she'd you'd: (PP+ or ({Vw+} & I+)) & ; -’d 'd: S- & (PP+ or I+); + +% S- & I+: "I'd love to" +% RS- & Bs- & PP+: "He looked at the girl who'd been knitting" +’d 'd: + (S- & (PP+ or I+)) + or (RS- & Bs- & PP+); having.v: & ; having.g: ( & ) or ; % PP is disjoined with because when PP is used, has/have/had -% is an auxiliarry verb, an should not get a wall connection! +% is an auxiliary verb, an should not get a wall connection! hasn't hasn’t: (( & (SIs+ or SFIs+)) or ()) - & (PP+ or ((([[O+]] & {@MV+}) or [[()]]) & )); + & (PP+ or ((([[O+]] & ) or [[()]]) & )); haven't haven’t: (( & (SIp+ or SFIp+)) or ({@E-} & (Sp- or SFp- or (RS- & Bp-)))) - & (PP+ or ((([[O+]] & {@MV+}) or [[()]]) & )); + & (PP+ or ((([[O+]] & ) or [[()]]) & )); hadn't.v-d hadn’t.v-d: (( & (SI+ or SFI+)) or ({@E-} & (S- or SFs- or SFp- or (RS- & B-)))) - & (PP+ or ((([[O+]] & {@MV+}) or [[()]]) & )); + & (PP+ or ((([[O+]] & ) or [[()]]) & )); % Give [K+] a cost so as to prefer Pp+ in general %: @@ -2801,7 +2893,7 @@ or ( & OF+) or (Osi+ & R+ & Bs+ & ) or (Opi+ & R+ & Bp+ & ) - or ([[()]] & )) & {@MV+}) + or ([[()]] & )) & ) or ( & Pp+ & {THi+ or @MV+}) or THb+ or @@ -2817,7 +2909,7 @@ ({@EBm+} & ((([B**t-] or [K+] or BI+ or OF+ or PFb- or (Osi+ & R+ & Bs+) or (Opi+ & R+ & Bp+) or - [[()]]) & {@MV+}) or + [[()]]) & ) or (Pp+ & {THi+ or @MV+}) or THb+ or or @@ -2827,23 +2919,21 @@ % O*m+ allows "If only there were more!" % THb+ allows "It is your fault that you're a failure." -% The cost on @MV+ causes attachements to the object to be prefered -% over attachments to the copula; for example, prepositions should -% almost surely attach via Mp+ link to the object, as opposed to -% using an MVp+ link to the copula. Example: +% The @MV+ allows modifiers (e.g. prep modifiers) to attach both to +% the object and to the copula. 
Example: % "There is neither pine nor apple in the pineapple." % "There is no ham in the hamburger, and neither pine nor apple in the pineapple." : - {@EBm+} & (O*t+ or O*m+) & {[@MV+]} & {THb+}; + {@EBm+} & (O*t+ or O*m+) & {@MV+} & {THb+}; : - {@EBm+} & (Opt+ or Omm+) & {[@MV+]} & {THb+}; + {@EBm+} & (Opt+ or Omm+) & {@MV+} & {THb+}; : - {@EBm+} & (Ost+ or Opt+ or Omm+) & {[@MV+]} & {THb+}; + {@EBm+} & (Ost+ or Opt+ or Omm+) & {@MV+} & {THb+}; : - {@EBm+} & Out+ & {[@MV+]} & {THb+}; + {@EBm+} & Out+ & {@MV+} & {THb+}; : or ( & ); : or ( & ); @@ -2869,20 +2959,20 @@ ( & ) or ( & ) or ( & ) - or ( & (SIs*x+ or SIs*b+ or SFIs+) & {}) + or ({} & (SIs*x+ or SIs*b+ or SFIs+) & {}) or (Sa*a- & Pv+) or (Ss*w- & & Pp+ & TO+ & IV+) or (EQ*r- & S- & & EQ*r+); % Similar to above, but no S-O inversion, and no equation. -% Also, a cost, so that possesive 's is preferred. +% Also, a cost, so that possessive 's is preferred. % Also, this can be a contraction for "has": "he has" -> "he's" % & PP+: "He's gone to Boston" (viz "He has gone to Boston") % But also, some contractions are prohibited: % *That's just the kind of person he's -- so: Ss- & PF- not allowed % SIs+ & PFb-: "Where's the ball?" 's.v ’s.v: - [( & + [( & (({@EBm+} & ((( (O*t+ & ) or [K+ & ] @@ -2915,7 +3005,7 @@ are.v: ({@E-} & (Spx- or SFp- or (RS- & Bp-)) & ) or ({@E-} & Ss*t- & & ) - or ( & (SIpx+ or SFIp+) & {}); + or ({} & (SIpx+ or SFIp+) & {}); %we're they're I'm you're: %(({@EBm+} & (((O*t+ or K+ or BI+ or OF+ or Pp+) & {@MV+}) or or Pa+)) or @@ -2923,7 +3013,7 @@ % they're we're ’re 're: - Spx- & (({@EBm+} & ((((O*t+ & ) or K+ or BI+ or OF+ or Pp+) & {@MV+}) or or (Pa+ & ) )) or + Spx- & (({@EBm+} & ((((O*t+ & ) or K+ or BI+ or OF+ or Pp+) & ) or or (Pa+ & ) )) or ({N+} & (Pg+ or Pv+))); % yisser: "you're" Irish English @@ -2948,7 +3038,7 @@ or ( & ) or [[(SI*j+ or SFI**j+) & & ((Xd- & VCq- & Xc+) or VCq- or ({{Xd-} & Xc+} & COp+))]]; -% Ss*w-: allows Wh subjets: "Who am I?" 
+% Ss*w-: allows Wh subjects: "Who am I?" am.v: ({@E-} & SX- & ) or ( & SXI+ & {}) @@ -2958,7 +3048,7 @@ % I'm == I am ’m 'm: - SX- & (({@EBm+} & (((O*t+ or K+ or BI+ or OF+ or Pp+) & {@MV+}) or or Pa+)) or + SX- & (({@EBm+} & (((O*t+ or K+ or BI+ or OF+ or Pp+) & ) or or Pa+)) or ({N+} & (Pg+ or Pv+))) & ; % S*x- used for passive participles: "this action be taken". @@ -2984,7 +3074,7 @@ being.v: ((({@EBm+} & (((O*t+ or [B**t-] or Pp+ or K+ or OF+ or BI+ or or THb+) - & {@MV+}) + & ) or Pa+)) or AF- or Pv+) & ) @@ -3021,7 +3111,7 @@ % Sa*a- & Ix+: "..., as shall be proven" will.v can.v may.v must.v could.v might.v shall.v shalt.v: ((SI+ or SFI+) & (( & I+) or CQ-)) - or ({N+} & & (I+ or (CX- & {@MV+}) or or [[()]])) + or ({N+} & & (I+ or (CX- & ) or or [[()]])) or (Sa*a- & Ix+) or ( & {N+} & {@E-} & I+) or ({N+} & {@E-} & I+ & ); @@ -3036,14 +3126,14 @@ % : "You know you should." should.v: ((SI+ or SFI+) & (( & I+) or CQ-)) or - ({N+} & & (I+ or (CX- & {@MV+}) or or [[()]])) or + ({N+} & & (I+ or (CX- & ) or or [[()]])) or ( & I+) or (I+ & ) or [[(SI*j+ or SFI**j+) & I+ & ((Xd- & VCq- & Xc+) or VCq- or ({{Xd-} & Xc+} & COp+))]]; % : "I sure wish he would." would.v: ((SI+ or SFI+) & (( & {Vw+} & I+) or CQ-)) or - ({N+} & & (({RT+} & I+) or (CX- & {@MV+}) or or [[()]])) or + ({N+} & & (({RT+} & I+) or (CX- & ) or or [[()]])) or ( & I+) or (I+ & ); % TO+: "I ought to." @@ -3122,8 +3212,8 @@ % the verb form: 1=plural-infinitive, 2=singular, 3=past("ed"), % 4=progressive("-ing"), 5=gerund("-ing".) -% abbreviations for ditransitive and optionally ditranstive verbs -% ditranstive verbs take a direct and indirect object +% abbreviations for ditransitive and optionally ditransitive verbs +% ditransitive verbs take a direct and indirect object % e.g. "I gave her a rose" % B- & O+ & O*n+: "What are the chances you'll give her a kiss?" 
% O+ & @MV+ & O*n+: "I gave him for his birthday a very expensive present" @@ -3142,7 +3232,7 @@ % ----------------------------------------------------------- % common intransitive verbs -: {@MV+}; +: ; % XXX Hmmm. There is a fair number of verbs in here that are "weakly" % transitive, i.e. are transitive in various rare usages: @@ -3159,19 +3249,29 @@ % also could be sorted out. % % accrue.v ache.v acquiesce.v ad-lib.v adhere.v adjoin.v alight.v -/en/words/words.v.1.1: VERB_PLI({[[O+]]} & ); +% +% : "here rest the remains of St. Stephen" +% XXX Some but not all of these verbs allow inversion; basically, +% anything that is not locative or directive won't allow inversion. +% +/en/words/words.v.1.1: + VERB_PLI({[[O+]]} & ) + or ; % accounts.v accrues.v aches.v acquiesces.v ad-libs.v adheres.v % : Locative subj-obj inversion "far out in the sea lives a fish" % XXX Some but not all of these verbs allow inversion; basically, % anything that is not locative or directive won't allow inversion. +% /en/words/words.v.1.2: VERB_S_I() or ; % accounted.v accrued.v ached.v acquiesced.v ad-libbed.v adhered.v +% Pa+: "He ad-libbed, uninterrupted." /en/words/words.v.1.3: - VERB_SPPP_I({[[O+]]} & ) + VERB_SPPP_I(({[[O+]]} & ) + or ({Xc+} & Pa+)) or ; % : "It was rusted closed" @@ -3190,7 +3290,7 @@ % -------------------------------------------------------------- % intransitive verbs that can take particles like "up" and "out" -: {K+} & {@MV+}; +: {K+} & ; % barge.v booze.v bottom.v bow.v branch.v breeze.v brim.v bulge.v cave.v /en/words/words.v.5.1: VERB_PLI(); @@ -3205,6 +3305,7 @@ lay.v-d: VERB_SP_I() or ; lain.v: VERB_PP(); + /en/words/words.v.5.4: ( & ) or or @@ -3212,9 +3313,9 @@ % -------------------------------------------------------------- -% irregular -- coming is in words.v.5.4 ... 
+% irregular : - ({(K+ & {Pa+}) or Pv+ or [[Pg+]] or } & {@MV+}) + ({(K+ & {Pa+}) or Pv+ or [[Pg+]] or } & ) or ({@MV+} & Pa+); come.v: VERB_PLI() @@ -3224,10 +3325,17 @@ comes.v: VERB_S_I() or ; came.v-d: VERB_SPPP_I() or ; +% : "he got what he had coming" +coming.v: + ( & ) + or VERB_SPPP_T() + or + or ; + % -------------------------------------------------------------- % optionally transitive verbs % abdicate.v abide.v abort.v accelerate.v acclimate.v acclimatize.v -: {O+ or or [[@MV+ & O*n+]]} & {@MV+}; +: {O+ or or [[@MV+ & O*n+]]} & ; /en/words/words.v.2.1: VERB_PLI(`'); /en/words/words.v.2.2: VERB_S_T(`'); @@ -3236,11 +3344,14 @@ % : only about half of the words in words.v.2.3 should get % this, the other half clearly should not. I'm too lazy to sort it out, % right now. +% has as a part of it. +% Pa+: "he paced, worried" +% & Pa+: "she was posed reclining" /en/words/words.v.2.3: - VERB_SPPP_T(`') - or + VERB_SPPP_T(` or ({Xc+} & Pa+)') + or ( & {{Xc+} & Pa+}) + or ( & O+ & Xc+) or - or or ; % Pa+: "The vase landed, unbroken" @@ -3304,7 +3415,6 @@ VERB_SPPP_T(`') or ( & {{@MV+} & Pa+}) or - or or ; judging.v: & ; @@ -3313,7 +3423,7 @@ % -------------------------------------------------------------- -: {K+ or OD+} & {@MV+}; +: {K+ or OD+} & ; rise.v fall.v:VERB_PLI(); rises.v falls.v: VERB_S_I() or ; rose.v-d fell.v-d: VERB_SPPP_I() or ; @@ -3349,10 +3459,10 @@ % [A+]0.5: He was xxx'ed there should have xxx as verb not adjective. 
% : - ((K+ & {[[@MV+]]} & (O*n+ or Pa+ or Pv+)) + ((K+ & {[[@MV+]]} & (O*n+ or ({Xc+} & (Pa+ or Pv+)))) or ({O+ or } & {K+}) or [[@MV+ & O*n+]] - ) & {@MV+}; + ) & ; /en/words/words.v.6.1: VERB_PLI(); @@ -3360,7 +3470,7 @@ /en/words/words.v.6.2: VERB_S_T(); /en/words/words.v.6.3: VERB_SPPP_T() or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or or ({K+} & ); @@ -3447,7 +3557,7 @@ or ((O+ or ) & ({@MV+} & Pa**j+)) or ({@E-} & & O+ & {Pa**j+ or K+}) or [[@MV+ & O*n+]] - ) & {@MV+}; + ) & ; catch.v drive.v strike.v: VERB_PLI(); @@ -3511,7 +3621,7 @@ or or [[@MV+ & O*n+]] or ({@E-} & & O+) - ) & {@MV+}; + ) & ; /en/words/words.v.4.1 : VERB_PLI(); /en/words/words-medical.v.4.1: VERB_PLI(); @@ -3520,20 +3630,19 @@ /en/words/words-medical.v.4.2: VERB_S_T(); % is too broad for most of these, but is OK for many. +% is part of /en/words/words.v.4.3: VERB_SPPP_T() - or - or (({@E-} or {@EA-} or {@AN-}) & [A+]0.5) - or + or ( & {{Xc+} & Pa+}) + or or ; % !?XXX many of the new additions fail some common sentences, such as: % The candle guttered. It plummeted to the bottom. Prices plummeted! 
/en/words/words-medical.v.4.3: VERB_SPPP_T() - or - or (({@E-} or {@EA-} or {@AN-}) & [A+]0.5) - or + or ( & {{Xc+} & Pa+}) + or or ; /en/words/words.v.4.4 @@ -3548,7 +3657,7 @@ overran.v-d mistook.v-d underwrote.v-d: VERB_SP_T(); -hit.v-d misread.v-d shed.v-d rid.v-d overcome.v-d offset.v-d +hit.v-d misread.v-d shed.v-d rid.v-d overcome.v-d overrun.v-d upset.v-d undercut.v-d: VERB_SPPP_T() or ( & ) or @@ -3571,7 +3680,7 @@ or or [[@MV+ & O*n+]] or ({@E-} & & O+) - ) & {@MV+}; + ) & ; frank.v : VERB_PLI(); @@ -3591,7 +3700,7 @@ intoned.v-d roped.v-d: VERB_SPPP_T() or - or (({@E-} or {@EA-} or {@AN-}) & A+) + or or ; corralling.v crowning.v decanting.v inking.v intoning.v roping.v: & ; @@ -3609,11 +3718,11 @@ bakes.v dictates.v kisses.v slices.v: VERB_S_T(); -% A+: "she gave him some slcied bread" +% A+: "she gave him some sliced bread" baked.v-d sliced.v-d: VERB_SPPP_T() or - or (({@E-} or {@EA-} or {@AN-}) & A+) + or or ; dictated.v-d kissed.v-d: VERB_SPPP_T() @@ -3629,14 +3738,14 @@ % A+: "She gave him some buttered bread" : - or ((O+ & K+) & {@MV+}) + or ((O+ & K+) & ) or ; butter.v: VERB_PLI(); butters.v: VERB_S_I(); buttered.v-d: VERB_SPPP_I() or - or (({@E-} or {@EA-} or {@AN-}) & A+); + or ; buttering.v: & ; buttering.g: @@ -3650,12 +3759,14 @@ % % B- & O+ & {K+}: % what are the chances she will TRACK him down to the farm? -% +% Pa+: "he cut out after fifth period" +% K+ & Pa+: "it washed up, unbroken" : ((K+ & {[[@MV+]]} & O*n+) or ((O+ or ) & {K+}) or ({@E-} & & O+ & {K+}) - or [[@MV+ & O*n+]]) & {@MV+}; + or ({K+} & {Xc+} & Pa+) + or [[@MV+ & O*n+]]) & ; /en/words/words.v.8.1: VERB_PLI(); /en/words/words.v.8.2: VERB_S_T(); @@ -3668,26 +3779,27 @@ forgone.v curretted.v forsworn.v oversewn.v over-eaten.v foresworn.v overeaten.v: VERB_PP() or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or or - or ({K+} & ); % : only about half of the words in words.v.8.3 should get % this, the other half clearly should not. I'm too lazy to sort it out, % right now. 
+% Pa+: "it washed up unbroken" /en/words/words.v.8.3: VERB_SPPP_T() or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or or - or ({K+} & ); -cut.v: - VERB_SPPP_T() or - ( & ) or - ( & {K+} & {@MV+}) or - or ({K+} & ); +cut.v-d: + VERB_SPPP_T() + or ( & ) + or ( & {K+} & ) + or ( & O+ & Xc+) + or + or ({K+} & ); /en/words/words.v.8.4: & ; % : has O+ in it; so must not have S+ in @@ -3697,7 +3809,7 @@ ; % -------------------------------------------------------------- -: (((O+ or ) & {OD+}) or [[@MV+ & O*n+]]) & {@MV+}; +: (((O+ or ) & {OD+}) or [[@MV+ & O*n+]]) & ; raise.v lower.v up.v: VERB_PLI(); raises.v lowers.v ups.v: VERB_S_T(); raised.v-d lowered.v-d upped.v-d: @@ -3748,13 +3860,22 @@ happened.v-d occured.v-d occurred.v-d: VERB_SPPP_I(); happening.v occuring.v occurring.v: ( & ) or ; +% : a subset of , used for urges/desires +% Allows "he does as he pleases" type constructions, using the +% CP link in a paraphrasing-like way. +% O+: "... as it pleases him" +% Pv-: "... as it was hoped" (XXX why not PP-, here ???) +: + {E-} & (S- or PP- or Pv-) & & {Xd-} & [dCPu-]-0.05 & + ({O+ & } or {@MV+ & Xc+}); + % ditransitve : ({@MV+} & {TH+ or or RSe+ or }) or ; wish.v: VERB_PLI(); -wishes.v: VERB_S_I(); -wished.v-d: VERB_SPPP_I(); +wishes.v: VERB_S_I() or ; +wished.v-d: VERB_SPPP_I() or ; wishing.v: & ; wishing.g: ( & ) or ; @@ -3765,43 +3886,60 @@ ({@MV+} & {TH+ or or RSe+ or }) or [[O+ & {@MV+}]]; -hope.v agree.v pretend.v swear.v pray.v vow.v vote.v: VERB_PLI(); -hopes.v agrees.v pretends.v swears.v prays.v vows.v votes.v: VERB_S_I(); -pretended.v-d prayed.v-d: VERB_SPPP_I(); +hope.v agree.v pretend.v swear.v pray.v vow.v vote.v: + VERB_PLI(); +hopes.v agrees.v pretends.v swears.v prays.v vows.v votes.v: + VERB_S_I() or ; +pretended.v-d prayed.v-d: + VERB_SPPP_I() or ; % The ( & THi+) allows "it is hoped that ..." to parse. % Naked Pv+: "..., as was hoped." hoped.v-d voted.v-d vowed.v-d: VERB_SPPP_T() or ( & {THi+}) + or or ; % Naked Pv+: "..., as was agreed." 
agreed.v-d: VERB_SPPP_T() or ( & {TH+}) + or or ; + swore.v-d: VERB_SP_T(); -sworn.v: VERB_PP() or ; +sworn.v: VERB_PP() or or ; + hoping.v agreeing.v pretending.v swearing.v praying.v vowing.v voting.v: ( & ) or ; +: {@E-} & S- & hPFt- & ; + % XXX Why is there a cost on Pv+ ?? "John appeared vindicated" % N+: "It appears not" +% : "so it seems", "so it appears" +% Ix- & PF- & : "so it would seem" : - {@MV+} & {(Pa+ & ) or or THi+ or AF- or N+ or [Pv+]}; -appear.v: VERB_Y_PLI(); -appears.v: VERB_Y_S(); -appeared.v-d: VERB_Y_SPPP(); + {@MV+} & {(Pa+ & ) + or + or THi+ + or AF- + or N+ + or [{Xc+} &Pv+]}; +appear.v: VERB_Y_PLI() or (Ix- & PF- & ); +appears.v: VERB_Y_S() or ; +appeared.v-d: VERB_Y_SPPP() or ; appearing.v: ( & ) or ; % XXX Why is there a cost on Pv+ ?? "John seemed vindicated" % N+: "It seems not" +% : "so seems it" : {@MV+} & ((Pa+ & ) or or LI+ or THi+ or AF- or N+ or [Pv+]); -seem.v: VERB_Y_PLI(); -seems.v: VERB_Y_S(); -seemed.v-d: VERB_Y_SPPP(); +seem.v: VERB_Y_PLI() or (Ix- & PF- & ); +seems.v: VERB_Y_S() or or ; +seemed.v-d: VERB_Y_SPPP() or or ; seeming.v: ( & ) or ; : {@MV+} & { or QI+}; @@ -3813,24 +3951,28 @@ : ({@MV+} & (TH+ or RSe+ or Z- or )); assert.v contend.v remark.v retort.v intimate.v exclaim.v conjecture.v allege.v surmise.v opine.v insinuate.v suppose.v: - VERB_PLI(); + VERB_PLI() or ; + asserts.v contends.v remarks.v retorts.v intimates.v exclaims.v conjectures.v alleges.v surmises.v opines.v insinuates.v supposes.v: - VERB_S_T(); + VERB_S_T() or ; + retorted.v intimated.v exclaimed.v conjectured.v surmised.v-d opined.v-d insinuated.v-d: VERB_SPPP_I(); asserted.v-d contended.v-d remarked.v-d: VERB_SPPP_T() or - ( & THi+); + ( & THi+) or + ; + alleged.v-d: VERB_SPPP_T() or ( & THi+) or ; + supposed.v-d: VERB_SPPP_T() or ( & { or THi+ or Z-}) - or - or ; + or ; asserting.v contending.v remarking.v retorting.v intimating.v exclaiming.v conjecturing.v alleging.v surmising.v opining.v insinuating.v @@ -3871,7 +4013,7 @@ ( & ) or ; -: 
{@MV+} & { or TH+ or RSe+ or (OF+ & {@MV+}) or BW-}; +: {@MV+} & { or TH+ or RSe+ or (OF+ & ) or BW-}; dream.v complain.v: VERB_PLI(); dreams.v complains.v: VERB_S_I(); dreamt.v-d dreamed.v-d complained.v-d: VERB_SPPP_I(); @@ -3884,7 +4026,7 @@ % O+ & O*n: "She will think it an act of kindness." % O+ & Pa**j: "She will think it true." : - ({@MV+} & { or TH+ or RSe+ or Z- or (OF+ & {@MV+}) or BW-}) + ({@MV+} & { or TH+ or RSe+ or Z- or (OF+ & ) or BW-}) or (O+ & {@MV+} & {O*n+ or Pa**j+}); think.v: VERB_PLI(); @@ -3932,7 +4074,7 @@ % go.w: {E-} & (Wi- or S-) & I+; % B-: "which way did it go?" -: {K+ or [[Pa+]] or [Pg+] or I*g+ or } & {@MV+}; +: {K+ or [[{Xc+} & Pa+]] or [Pg+] or I*g+ or } & ; go.v: VERB_PLI(); % SFs-: "There goes the cutest guy ever!", needs O*t to survive PP. @@ -3958,7 +4100,7 @@ % XXX TODO maybe need VJ and-able links for going etc. ??? % : "there is going to be a meeting" going.v goin'.v: - (( or ({K+ or [[Pa+]]} & {@MV+})) & ) or + (( or ({K+ or [[{Xc+} & Pa+]]} & )) & ) or or ; @@ -3971,31 +4113,31 @@ or ({@E-} & Sp*i- & WV- & I*t+); % transitive: "stay the prisoner's execution" -: {({@MV+} & (Pa+ or AF-)) or ({K+} & {@MV+}) or (O+ & {@MV+})}; +: {({@MV+} & (Pa+ or AF-)) or ({K+} & ) or (O+ & )}; stay.v: VERB_PLI(); stays.v: VERB_S_T() or ; stayed.v-d: VERB_SPPP_T() or ; staying.v: ( & ) or ; -: {({@MV+} & Pa+) or ({O+ or } & {K+} & {@MV+})}; +: {({@MV+} & Pa+) or ({O+ or } & {K+} & )}; stand.v sit.v: VERB_PLI(); stands.v sits.v: VERB_S_T() or ; stood.v-d sat.v-d: VERB_SPPP_T() or ; standing.v sitting.v: & ; standing.g sitting.g: ( & ) or or ; -: ({@MV+} & {LI+ or Pa+ or AF-}) or {O+ & K+ & {@MV+}}; +: ({@MV+} & {LI+ or Pa+ or AF-}) or {O+ & K+ & }; sound.v: VERB_PLI(); sounds.v: VERB_S_T(); sounded.v-d: VERB_SPPP_T(); sounding.v: ( & ) or ; % K: "He is acting up" -: {({@MV+} & (LI+ or Pa+)) or ({K+ or AF-} & {@MV+})}; +: {({@MV+} & (LI+ or Pa+)) or ({K+ or AF-} & )}; act.v: VERB_PLI(); acts.v: VERB_S_I(); % "be acted upon quikly" -acted.v-d: 
VERB_SPPP_I() or ( & {K+} & {@MV+}); +acted.v-d: VERB_SPPP_I() or ( & {K+} & ); acting.v: ( & ) or ; % Pa: The team reigns undefeated @@ -4013,9 +4155,9 @@ % K+ connects to particles. % [Pa+]0.1: prefer MVa to Pa whenever possible: "She look right" % [K+]0.2: prefer Pa+ to K+ whenever possible: "She looked up" -: {({@MV+} & (LI+ or [Pa+]0.1)) - or ({[K+]0.2 or AF-} & {@MV+}) - or ((O+ & K+) & {@MV+})}; +: {({@MV+} & (LI+ or [{Xc+} & Pa+]0.1)) + or ({[K+]0.2 or AF-} & ) + or ((O+ & K+) & )}; look.v: VERB_PLI(); looks.v: VERB_S_T(); looked.v-d: VERB_SPPP_T(); @@ -4026,7 +4168,7 @@ % MV+: "who did you wave to?" : {@MV+} - or ((O+ & K+) & {@MV+}) + or ((O+ & K+) & ) or ; wave.v: VERB_PLI(); waves.v: VERB_S_I(); @@ -4051,13 +4193,13 @@ consisted.v-d: VERB_SPPP_I(); consisting.v: ( & ) or ; -: {K+ or OF+} & {@MV+}; +: {K+ or OF+} & ; die.v: VERB_PLI(); dies.v: VERB_S_I(); died.v-d: VERB_SPPP_I(); dying.v: ( & ) or ; -: {({[[@MV+]]} & OT+) or BT-} & {@MV+}; +: {({[[@MV+]]} & OT+) or BT-} & ; last.v wait.v: VERB_PLI(); lasts.v waits.v: VERB_S_I(); lasted.v-d waited.v-d: VERB_SPPP_I(); @@ -4067,34 +4209,60 @@ % TRANSITIVE COMPLEX VERBS (Those that take O+) : or ({@MV+} & ); + attempt.v undertake.v deserve.v manage.v plot.v prefer.v neglect.v -afford.v commit.v profess.v desire.v: +afford.v commit.v profess.v desire.v please.v: VERB_PLI(); -attempts.v undertakes.v deserves.v manages.v plots.v prefers.v -neglects.v affords.v commits.v professes.v desires.v: + +attempts.v undertakes.v manages.v plots.v prefers.v +neglects.v affords.v commits.v professes.v: VERB_S_T(); -attempted.v deserved.v managed.v plotted.v preferred.v neglected.v afforded.v committed.v -professed.v-d desired.v-d: + +attempted.v managed.v plotted.v preferred.v neglected.v +afforded.v committed.v professed.v-d: VERB_SPPP_T() or or or ; + undertook.v-d: VERB_SP_T(); undertaken.v: VERB_PP() or ; -attempting.g undertaking.g deserving.g -plotting.g preferring.g neglecting.g affording.g committing.g professing.g 
-desiring.g: -( & ) or ; -managing.g: ( & ) or or ; -attempting.v undertaking.v deserving.v managing.v -plotting.v preferring.v neglecting.v affording.v committing.v professing.v -desiring.v: - & ; + +attempting.g undertaking.g deserving.g preferring.g +neglecting.g affording.g committing.g desiring.g +pleasing.g: + ( & ); + +% : "The constant plotting annoyed Jim" +plotting.g professing.g: + ( & ) or ; + +managing.g: + ( & ) or or ; + +attempting.v undertaking.v deserving.v managing.v plotting.v +preferring.v neglecting.v affording.v committing.v professing.v +desiring.v pleasing.v: + & ; + +% : "he gets as he deserves." +deserves.v desires.v pleases.v: + VERB_S_T() or ; + +deserved.v-d desired.v-d pleased.v-d: + VERB_SPPP_T() or + or + or + or + ; % like but with particle -: ({@MV+} & ) or -((((O+ or ) & {K+}) or (K+ & {[[@MV+]]} & O*n+) or -([[@MV+ & O*n+]])) & {@MV+}); +: + ({@MV+} & ) + or ((((O+ or ) & {K+}) + or (K+ & {[[@MV+]]} & O*n+) + or ([[@MV+ & O*n+]])) & ); + seek.v: VERB_PLI(); seeks.v: VERB_S_T(); sought.v-d: VERB_SPPP_T() or or ; @@ -4102,9 +4270,13 @@ seeking.v: & ; % Naked @MV+: "She volunteered as a nurse." 
-: {} or ({@MV+} & ) or @MV+; -decline.v fail.v hasten.v volunteer.v aim.v: VERB_PLI(); -declines.v fails.v hastens.v volunteers.v aims.v: VERB_S_T(); +: {} or ({@MV+} & ) or ; + +decline.v fail.v hasten.v volunteer.v aim.v: + VERB_PLI(); + +declines.v fails.v hastens.v volunteers.v aims.v: + VERB_S_T(); declined.v-d hastened.v-d volunteered.v-d aimed.v-d: VERB_SPPP_T() or @@ -4128,10 +4300,10 @@ ({@MV+} & ) or ({({O+ or } & {K+}) or (K+ & {[[@MV+]]} & O*n+) or - [[@MV+ & O*n+]]} & {@MV+}); + [[@MV+ & O*n+]]} & ); fight.v: VERB_PLI(); fights.v: VERB_S_T(); -fought.v-d: VERB_SPPP_T() or ( & {K+} & {@MV+}) or +fought.v-d: VERB_SPPP_T() or ( & {K+} & ) or ({K+} & ); fighting.g: ( & ) or ; fighting.v: & ; @@ -4143,13 +4315,13 @@ threaten.v mean.v arrange.v pledge.v: VERB_PLI(); threatens.v means.v arranges.v pledges.v: - VERB_S_T(); + VERB_S_T() or ; threatened.v-d meant.v-d arranged.v-d pledged.v-d: VERB_SPPP_T() or + or or or - or ; meaning.g arranging.g threatening.g pledging.g: @@ -4203,7 +4375,7 @@ : {} or - ({@MV+} & (TH+ or or or RSe+ or Zs- or QI+ or (OF+ & {@MV+}))); + ({@MV+} & (TH+ or or or RSe+ or Zs- or QI+ or (OF+ & ))); learn.v: VERB_PLI(); learns.v: VERB_S_T(); learned.v-d: VERB_SPPP_T() or ( & {THi+}) or ; @@ -4218,7 +4390,6 @@ VERB_SPPP_T() or ( & {THi+ or TSi+ or Z-}) or - or or ; proposing.g: ( & ) or ; proposing.v: & ; @@ -4300,14 +4471,14 @@ : ((({O+ or } & {K+}) or (K+ & {[[@MV+]]} & O*n+) or - [[@MV+ & O*n+]]) & {@MV+}) or + [[@MV+ & O*n+]]) & ) or ({@MV+} & ( or Pg+)); start.v stop.v try.v: VERB_PLI(); starts.v stops.v tries.v: VERB_S_T(); started.v-d stopped.v-d tried.v-d: VERB_SPPP_T() or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or ({K+} & ); starting.g stopping.g trying.g @@ -4337,7 +4508,6 @@ VERB_SPPP_T() or ( & {THi+}) or - or or ; recognizing.g disputing.g accepting.g calculating.g deducing.g recording.g envisioning.g recounting.g signifying.g clarifying.g disclosing.g @@ -4350,15 +4520,15 @@ undisputed.v: ( & {THi+}); -: {} or ({@MV+} & 
TH+); +% MVp+: "it repeated for ..." "She provided for ..." +: {} or ({@MV+} & TH+) or MVp+; repeat.v reflect.v provide.v counter.v signal.v: VERB_PLI(); repeats.v reflects.v provides.v counters.v signals.v: VERB_S_T(); repeated.v-d reflected.v-d countered.v-d signaled.v-d signalled.v-d: VERB_SPPP_T() or or - or or - ; + ; provided.v-d: VERB_SPPP_T() or or @@ -4392,7 +4562,6 @@ VERB_SPPP_T() or ( & {THi+}) or - or or ; sensing.v doubting.v reckoning.v reaffirming.v stipulating.v @@ -4443,33 +4612,36 @@ VERB_SPPP_T() or ( & {THi+}) or - or or ; imagining.g: ( & ) or ; imagining.v: & ; -% Pa**j link: The doctor declared him insane. +% Pa**j+: "The doctor declared him insane." +% MVa+: "he will suspect soon", "he suspects already" : - or - ({@MV+} & ( or TH+ or RSe+ or Pg+ or Z-)) or - ((O+ or ) & ({@MV+} & Pa**j+)); + + or ({@MV+} & ( or TH+ or RSe+ or Pg+ or Z-)) + or ((O+ or ) & ({@MV+} & Pa**j+)) + or MVa+; declare.v fear.v conclude.v suspect.v concede.v presume.v foresee.v emphasize.v maintain.v acknowledge.v note.v confirm.v stress.v assume.v: - VERB_PLI(); + VERB_PLI() or ; + declares.v fears.v concludes.v suspects.v concedes.v presumes.v foresees.v emphasizes.v maintains.v acknowledges.v notes.v confirms.v stresses.v assumes.v: - VERB_S_T(); + VERB_S_T() or ; + declared.v feared.v concluded.v suspected.v conceded.v presumed.v emphasized.v maintained.v acknowledged.v noted.v confirmed.v-d stressed.v-d assumed.v-d: VERB_SPPP_T() or ( & {THi+ or ({@MV+} & Pa+) }) or - or or ; -foresaw.v-d: VERB_SP_T(); + +foresaw.v-d: VERB_SP_T() or ; foreseen.v: VERB_PP() or ( & {@MV+ or THi+}) or @@ -4502,12 +4674,12 @@ ({@MV+} & ( or TH+ or RSe+)) or ((({O+ or } & {K+}) or (K+ & {[[@MV+]]} & O*n+) or - [[@MV+ & O*n+]]) & {@MV+}); + [[@MV+ & O*n+]]) & ); rule.v add.v: VERB_PLI(); rules.v adds.v: VERB_S_T(); ruled.v-d added.v-d: VERB_SPPP_T() or - ( & {({@MV+} & (THi+ or )) or ({K+} & {@MV+})}) or + ( & {({@MV+} & (THi+ or )) or ({K+} & )}) or ({K+} & ) or ; ruling.g adding.g: ( 
& ) or ; @@ -4516,13 +4688,13 @@ % with particle : ({@MV+} & (TH+ or Zs- or )) or - ((((O+ or ) & {K+}) or (K+ & {[[@MV+]]} & O*n+)) & {@MV+}) or + ((((O+ or ) & {K+}) or (K+ & {[[@MV+]]} & O*n+)) & ) or ([[@MV+ & O*n+]]); figure.v: VERB_PLI(); figures.v: VERB_S_T(); figured.v-d: VERB_SPPP_T() or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or ({K+} & ); figuring.g: ( & ) or ; figuring.v: & ; @@ -4535,28 +4707,29 @@ % I- & B- & : "What did John say you should do?" predict.v realize.v discover.v determine.v announce.v say.v mention.v admit.v -recall.v reveal.v state.v observe.v indicate.v +recall.v reveal.v divulge.v state.v observe.v indicate.v analyse.v analyze.v assess.v establish.v evaluate.v examine.v question.v test.v hypothesize.v hypothesise.v document.v envisage.v: VERB_PLI() or (I- & & ); predicts.v realizes.v discovers.v determines.v announces.v says.v -mentions.v admits.v recalls.v reveals.v states.v observes.v indicates.v +mentions.v admits.v recalls.v reveals.v divulges.v states.v observes.v +indicates.v analyses.v analyzes.v assesses.v establishes.v evaluates.v examines.v -questions.v tests.v hypothesizes.v hypothesises.v envisages.v documents.v: +questions.v tests.v hypothesizes.v hypothesises.v envisages.v +documents.v: VERB_S_T(); -predicted.v realized.v discovered.v determined.v announced.v mentioned.v -admitted.v recalled.v revealed.v stated.v observed.v indicated.v -analysed.v analyzed.v assessed.v established.v evaluated.v examined.v -questioned.v tested.v -hypothesized.v-d hypothesised.v-d well-established.v-d documented.v-d -envisaged.v-d: +predicted.v-d realized.v-d discovered.v-d determined.v-d announced.v-d +mentioned.v-d admitted.v-d recalled.v-d revealed.v-d divulged.v-d +stated.v-d observed.v-d indicated.v-d analysed.v-d analyzed.v-d +assessed.v-d established.v-d evaluated.v-d examined.v-d questioned.v-d +tested.v-d hypothesized.v-d hypothesised.v-d well-established.v-d +envisaged.v-d documented.v-d: VERB_SPPP_T() or ( & {THi+}) or - or or ; 
% the second line is almost, not quite, @@ -4568,13 +4741,13 @@ predicting.g realizing.g discovering.g determining.g announcing.g saying.g mentioning.g admitting.g recalling.g revealing.g -stating.g observing.g indicating.g +divulging.g stating.g observing.g indicating.g analysing.g analyzing.g assessing.g establishing.g evaluating.g examining.g questioning.g testing.g hypothesizing.g hypothesising.g documenting.g envisaging.g: ( & ) or ; predicting.v realizing.v discovering.v determining.v announcing.v saying.v mentioning.v admitting.v recalling.v revealing.v -stating.v observing.v indicating.v +divulging.v stating.v observing.v indicating.v analysing.v analyzing.v assessing.v establishing.v evaluating.v examining.v questioning.v testing.v hypothesizing.v hypothesising.v documenting.v envisaging.v: @@ -4586,37 +4759,40 @@ % esplain: basilect of explain guess.v estimate.v understand.v notice.v explain.v esplain.v demonstrate.v: VERB_PLI(); -guesses.v estimates.v understands.v notices.v explains.v demonstrates.v: +guesses.v estimates.v understands.v notices.v explains.v esplains.v +demonstrates.v: VERB_S_T(); -guessed.v-d understood.v-d noticed.v-d explained.v-d demonstrated.v-d: +guessed.v-d understood.v-d noticed.v-d explained.v-d esplained.v-d +demonstrated.v-d: VERB_SPPP_T() or ( & {THi+}) - or + or or ; estimated.v-d: VERB_SPPP_T() or ( & {THi+}) or - or or - ; + ; guessing.g estimating.g understanding.g noticing.g explaining.g demonstrating.g: ( & ) or ; guessing.v estimating.v understanding.v noticing.v explaining.v demonstrating.v: & ; % (QI+ & {MV+}): "I did not know why until recently" +% MVa+: "He knows already" % MVb+: "He should know better" : {} - or ({@MV+} & (((OF+ or QI+)& {@MV+}) or or TH+ or RSe+ or Zs-)) - or ({Xc+} & MVb+); -know.v: VERB_PLI(); -knows.v: VERB_S_T(); + or ({@MV+} & (((OF+ or QI+) & ) or or TH+ or RSe+ or Zs-)) + or ({Xc+} & (MVa+ or MVb+)); +know.v: VERB_PLI( & {dCPu-}) or ; +knows.v: VERB_S_T() or ; knew.v-d: VERB_SP_T(); 
-known.v: +% dCPu-: "... as is well known" +known.v well-known.v: VERB_PP() or - ( & {THi+ or or QIi+}) or + ( & {THi+ or or QIi+} & {dCPu-}) or or ; knowing.g: ( & ) or ; @@ -4630,14 +4806,13 @@ VERB_SPPP_T() or ( & {THi+ or TSi+}) or or - or ; requesting.g: ( & ) or ; requesting.v: & ; % XXX why is there a cost on Pv ?? : or - ({@MV+} & (Pa+ or TH+ or or RSe+ or AF- or Vf+ or (LI+ or {@MV+}) or [Pv+])); + ({@MV+} & (Pa+ or TH+ or or RSe+ or AF- or Vf+ or (LI+ or ) or [Pv+])); feel.v: VERB_PLI(); feels.v: VERB_S_T(); felt.v-d: VERB_SPPP_T() or ( & {THi+}) or ; @@ -4659,7 +4834,6 @@ VERB_SPPP_T() or or or - or ; studying.g: ( & ) or ; studying.v: & ; @@ -4672,7 +4846,6 @@ VERB_SPPP_T() or or - or or ; discussing.g: ( & ) or ; discussing.v: @@ -4691,7 +4864,6 @@ VERB_SPPP_T() or or or - or ; opposing.g enjoying.g advocating.g contemplating.g @@ -4703,22 +4875,29 @@ favoring.v: & ; -: {} or ({@MV+} & Pg+); +% MVp+: "he finished at last" +: {} or ({@MV+} & Pg+) or (MVp+ & {VC+}); finish.v practice.v resist.v: VERB_PLI(); finishes.v practices.v resists.v quits.v: VERB_S_T(); % : "I want it finished" finished.v-d practiced.v-d resisted.v-d quitted.v-d: - VERB_SPPP_T() or or or ; + VERB_SPPP_T( or ({Xc+} & Pa+)) + or + or + or ; quit.v-d: VERB_PLI() or - VERB_SPPP_T() or - or - or - ; -finishing.g practicing.g resisting.g -quitting.g: ( & ) or ; -finishing.v practicing.v resisting.v quitting.v: & ; + VERB_SPPP_T( or ({Xc+} & Pa+)) + or + or + or ; + +finishing.g practicing.g resisting.g quitting.g: + ( & ) or ; + +finishing.v practicing.v resisting.v quitting.v: + & ; % Pv-: "I want it over with" over_with: ; @@ -4730,21 +4909,21 @@ or (K+ & {[[@MV+]]} & O*n+) or ({O+ or } & {K+}) or ((O+ or ) & Pa**j+) - or [[@MV+ & O*n+]]) & {@MV+}) + or [[@MV+ & O*n+]]) & ) or ({@MV+} & (Pa+ or AF-)); turn.v: VERB_PLI(); turns.v: VERB_S_T(); turned.v-d: VERB_SPPP_T() - or ( & {K+} & {@MV+}) + or ( & {K+} & ) or ({K+} & ); turning.v: & ; turning.g: ( & ) or ; % plus TI : - ((O+ or or 
TI+ or [[@MV+ & (O*n+ or TI+)]] or Pv+) & {@MV+}) + ((O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]] or Pv+) & ) or ({@MV+} & (AF- or Pa+)); become.v: VERB_S_PLI() or ( & ) or ; becomes.v: VERB_S_S(); @@ -4759,7 +4938,7 @@ or ({O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]] - or [Pv+]} & {@MV+}); + or [Pv+]} & ); remain.v: VERB_PLI(); remains.v: VERB_S_T(); remained.v-d: VERB_SPPP_T(); @@ -4771,21 +4950,21 @@ ({@MV+} & (AF- or Pa+)) or ((({O+ or } & {K+}) or (K+ & {[[@MV+]]} & O*n+) - or [[@MV+ & O*n+]]) & {@MV+}); + or [[@MV+ & O*n+]]) & ); grow.v: VERB_PLI() or ; grows.v: VERB_S_T() or ; grew.v-d: VERB_SP_T() or ; grown.v: VERB_PP() or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or or ({K+} & ); growing.g: ( & ) or or ; growing.v: & ; % plus OF -: {O+ or or [[@MV+ & O*n+]] or ({@MV+} & OF+)} & {@MV+}; +: {O+ or or [[@MV+ & O*n+]] or ({@MV+} & OF+)} & ; approve.v: VERB_PLI(); approves.v: VERB_S_T(); approved.v-d: VERB_SPPP_T() or or @@ -4794,7 +4973,7 @@ approving.v: & ; % plus OF -: (O+ or or [[@MV+ & O*n+]] or ({@MV+} & OF+)) & {@MV+}; +: (O+ or or [[@MV+ & O*n+]] or ({@MV+} & OF+)) & ; dispose.v conceive.v: VERB_PLI(); disposes.v conceives.v: VERB_S_T(); disposed.v-d conceived.v-d: VERB_SPPP_T() or or @@ -4807,13 +4986,13 @@ ((K+ & {[[@MV+]]} & O*n+) or ({O+ or } & {K+}) or [[@MV+ & O*n+]] - or OF+) & {@MV+}; + or OF+) & ; speak.v: VERB_PLI(); speaks.v: VERB_S_T(); spoke.v-d: VERB_SP_T(); spoken.v: VERB_PP() or - ( & {K+} & {@MV+}) or + ( & {K+} & ) or ({K+} & ) or ; speaking.v: & ; @@ -4822,7 +5001,7 @@ % @MV+: "The coffee tastes (the same) as it did last year." 
(do not want O for "the same") : - or ({@MV+} & ((LI+ & {@MV+}) or AF- or Pa+ or OF+)) + or ({@MV+} & ((LI+ & ) or AF- or Pa+ or OF+)) or @MV+; taste.v: VERB_PLI(); tastes.v: VERB_S_T(); @@ -4832,7 +5011,7 @@ : {} - or ({@MV+} & ((LI+ & {@MV+}) or AF- or Pa+ or OF+)) + or ({@MV+} & ((LI+ & ) or AF- or Pa+ or OF+)) or @MV+; reek.v smell.v: VERB_PLI(); reeks.v smells.v: VERB_S_T(); @@ -4840,9 +5019,9 @@ reeking.g smelling.g: ( & ) or ; reeking.v smelling.v: & ; -% plus partcle and Vt +% plus particle and Vt : - (((K+ & {[[@MV+]]} & O*n+) or ((O+ or ) & {K+ or Vt+}) or [[@MV+ & O*n+]]) & {@MV+}) or + (((K+ & {[[@MV+]]} & O*n+) or ((O+ or ) & {K+ or Vt+}) or [[@MV+ & O*n+]]) & ) or ({O+} & (OT+ or BT-) & {@MV+} & { or }) or (OXii+ & Vtg+ & {@MV+} & TH+) or @MV+; @@ -4852,7 +5031,7 @@ took.v-d: VERB_S_SP(); taken.v: ( & ) - or ( & {K+} & {@MV+}) + or ( & {K+} & ) or or ({K+} & ) or (Pvf- & & Vtg+ & THi+); @@ -4867,16 +5046,17 @@ : ((K+ & {[[@MV+]]} & O*n+) or ((O+ or ) & (K+ or Pp+ or WR-)) or - (Vp+ & (Zs- or MVa+))) & {@MV+}; + (Vp+ & (Zs- or MVa+))) & ; % I- & WR- & & O+: "where did you put it?" +% bare : "The problem, put simply, is money." 
put.v-d: VERB_SPPP_T() or ( & ) or ({@E-} & I- & WR- & & O+) or - ( & (K+ or Pp+ or WR-) & {@MV+}) or - ((K+ or Pp+) & ); + ( & (K+ or Pp+ or WR-) & ) or + ({K+ or Pp+} & ); puts.v: VERB_S_T(); putting.v: & ; putting.g: ( & ) or ; @@ -4885,12 +5065,12 @@ : (( or (K+ & O*n+) or - ( & {O+})) & {@MV+} & {}) or + ( & {O+})) & & {}) or ([[@MV+ & O*n+]]); cost.v-d: VERB_S_PLI() or VERB_S_SPPP(); costed.v-d: VERB_SPPP_T() or - ( & (({K+} & {@MV+}) or Pa+ or Pg+)) or + ( & (({K+} & ) or Pa+ or Pg+)) or ({K+ or Pa+ or Pg+} & ); costs.v: VERB_S_S(); costing.v: & ; @@ -4898,30 +5078,30 @@ % ditransitive : - ( & {@MV+}) or + ( & ) or (K+ & {[[@MV+]]} & O*n+) or ( & O+) or - ((O+ or ) & (({@MV+} & (Pa+ or AF- or Pg+)) or ({K+} & {@MV+}))) or + ((O+ or ) & (({@MV+} & (Pa+ or AF- or Pg+)) or ({K+} & ))) or ([[@MV+ & O*n+]]) or ({@MV+} & (TH+ or or RSe+)); find.v: VERB_PLI(); finds.v: VERB_S_T(); found.v-d: VERB_SPPP_T() or - ( & (({K+ or AF-} & {@MV+}) or Pa+ or Pg+)) or + ( & (({K+ or AF-} & ) or Pa+ or Pg+)) or ({K+ or Pa+ or Pg+} & ); finding.v: & ; finding.g: ( & ) or ; -% ditranstive +% ditransitive : - ((O+ or ) & (({K+} & {@MV+}) or ({@MV+} & (Pa+ or AF- or Pv+)))) + ((O+ or ) & (({K+} & ) or ({@MV+} & (Pa+ or AF- or Pv+)))) or (( or (K+ & {[[@MV+]]} & O*n+) or K+ or ( & O+) - ) & {@MV+}) - or ({@MV+} & (Pa+ or AF- or Pp+ or )); + ) & ) + or ({@MV+} & (Pa+ or AF- or Pp+ or or VC+)); get.v: VERB_PLI(); gets.v: VERB_S_T(); @@ -4936,34 +5116,36 @@ gotten.v: VERB_PP() or - ( & {K+ or Pp+} & {@MV+}) or + ( & {K+ or Pp+} & ) or ({K+ or Pp+} & ); getting.v gettin'.v gettin.v: & & ; getting.g: ( & ) or ; % Pa+: "He left unarmed" : - ((O+ or ) & (({K+} & {@MV+}) or ({@MV+} & {Pa+ or AF- or Pv+ or Pg+}))) - or ({@MV+} & (Pa+ or Pv+)) - or ({(K+ & {[[@MV+]]} & O*n+) or ([[@MV+ & O*n+]])} & {@MV+}); + ((O+ or ) & + (({K+} & ) + or ({@MV+} & {Pa+ or AF- or Pv+ or Pg+}))) + or ({@MV+} & {Xc+} & (Pa+ or Pv+)) + or ({(K+ & {[[@MV+]]} & O*n+) or ([[@MV+ & O*n+]])} & ); leave.v: VERB_PLI(); 
leaves.v: VERB_S_T(); left.v-d: VERB_SPPP_T() - or ( & (({K+ or AF-} & {@MV+}) or Pv+ or Pa+ or Pg+)) + or ( & (({K+ or AF-} & ) or Pv+ or Pa+ or Pg+)) or ({K+ or ({@MV+} & (Pv+ or Pa+ or Pg+))} & ); leaving.v leavin'.v: & ; leaving.g leavin'.g: ( & ) or ; : - ((O+ or (K+ & {[[@MV+]]} & O*n+) or [[@MV+ & O*n+]] or Vk+) & {@MV+}) - or ({O+ or } & (([K+] & {@MV+}) or ({@MV+} & (Pa+ or AF- or Pg+ or Pv+)))); + ((O+ or (K+ & {[[@MV+]]} & O*n+) or [[@MV+ & O*n+]] or Vk+) & ) + or ({O+ or } & (([K+] & ) or ({@MV+} & (Pa+ or AF- or Pg+ or Pv+)))); keep.v: VERB_PLI(); keeps.v: VERB_S_T(); kept.v-d: VERB_SPPP_T() or -( & (({K+ or AF-} & {@MV+}) or Pa+ or Pg+ or Pv+)) or +( & (({K+ or AF-} & ) or Pa+ or Pg+ or Pv+)) or ({K+ or ({@MV+} & (Pa+ or Pg+ or Pv+))} & ); keeping.v: & ; keeping.g: ( & ) or ; @@ -4972,13 +5154,13 @@ track.i: Vk- & {OF+}; : - ((K+ & {[[@MV+]]} & O*n+) or ({O+ or } & {K+ or Vs+}) or - [[@MV+ & O*n+]]) & {@MV+}; + ((K+ & {[[@MV+]]} & O*n+) or ({O+ or } & {K+ or Vs+}) + or [[@MV+ & O*n+]]) & ; set.v-d: VERB_SPPP_T() or ( & ) or - ( & {K+ or Vs+} & {@MV+}) or + ( & {K+ or Vs+} & ) or or ({K+ or Vs+} & ); sets.v: VERB_S_T(); @@ -4987,12 +5169,16 @@ free.i straight.i loose.i: Vs- & {MV+}; +% TH+: "The court held that ..." 
: - ((K+ & {[[@MV+]]} & O*n+) or ({O+ or } & {K+ or Vh+}) or [[@MV+ & O*n+]]) & {@MV+}; + ((K+ & {[[@MV+]]} & O*n+) + or TH+ + or ({O+ or } & {K+ or Vh+}) + or [[@MV+ & O*n+]]) & ; hold.v: VERB_PLI(); holds.v: VERB_S_T(); -held.v-d: VERB_SPPP_T() or ( & {K+ or Vh+} & {@MV+}) or +held.v-d: VERB_SPPP_T() or ( & {K+ or Vh+} & ) or or ({K+ or Vh+} & ); holding.v: & ; holding.g: ( & ) or ; @@ -5001,7 +5187,7 @@ : ({@MV+} & ( or TH+ or RSe+ or Z- or )) - or ((O+ or or OX+) & {@MV+} & {}) + or ((O+ or or OX+) & & {}) or ([[@MV+ & O*n+]]); expect.v claim.v: VERB_PLI(); @@ -5010,7 +5196,6 @@ VERB_SPPP_T() or ( & { or THi+ or Z-}) or - or or ({@MV+} & {} & ); expecting.g claiming.g: ( & ) or ; expecting.v claiming.v: & ; @@ -5024,19 +5209,18 @@ VERB_SPPP_T() or ( & { or Z- or @MV+}) or - or - or ({@MV+} & {} & ); + or ( & {} & ); intending.g: ( & ) or ; intending.v: & ; % O+ & TO+: "I dare you to!" % TO+ & Xc+: "try it if you dare to!" -% I+: auxilliary: "no one dared say a word" +% I+: auxiliary: "no one dared say a word" % N+ & TO: "I dare not to say the truth" : - ({N+} & {@MV+} & { or (TO+ & Xc+)}) or + ({N+} & & { or (TO+ & Xc+)}) or ({N+} & I+) or - ((O+ or ) & {@MV+} & { or (TO+ & Xc+)}); + ((O+ or ) & & { or (TO+ & Xc+)}); % SI+ & & I+: "How dare you disobey orders" % & N+ & I+: "He dare not lie to me!" (singular subject) @@ -5055,7 +5239,7 @@ % [TO+]: allows null-infinitive: "Yes, I'd love to." 
: ({@MV+} & ( or [TO+] or Pg+)) - or ((O+ or or OX+) & {@MV+} & {}) + or ((O+ or or OX+) & & {}) or ([[@MV+ & O*n+]]); like.v: VERB_PLI(); @@ -5064,17 +5248,17 @@ liking.g: ( & ) or ; liking.v: & ; -% ditranstive +% ditransitive : (( or - ( & {O+})) & {@MV+}) or + ( & {O+})) & ) or ({@MV+} & ) or ([[@MV+ & O*n+]]); offer.v: VERB_PLI(); offers.v: VERB_S_T(); offered.v-d: VERB_SPPP_T() or ( & {O+ or -or [[@MV+ & O*n+]]} & {@MV+}) or ({O+ or [[@MV+ & O*n+]]} or ); +or [[@MV+ & O*n+]]} & ) or ({O+ or [[@MV+ & O*n+]]} or ); offering.g: ( & ) or ; offering.v: & ; @@ -5082,14 +5266,14 @@ % unlike vc-offer, "to" is optional. : (( or - ( & {O+})) & {@MV+}) or - ({@MV+} & {}) or + ( & {O+})) & ) or + ( & {}) or ([[@MV+ & O*n+]]); refuse.v: VERB_PLI(); refuses.v: VERB_S_T(); refused.v-d: VERB_SPPP_T() or ( & {O+ or or -[[@MV+ & O*n+]]} & {@MV+}) or ({O+ or [[@MV+ & O*n+]]} & ); +[[@MV+ & O*n+]]} & ) or ({O+ or [[@MV+ & O*n+]]} & ); refusing.g: ( & ) or ; refusing.v: & ; @@ -5097,26 +5281,26 @@ % TO+ & Xc+: allows null-infinitive: "Because I want to." 
% intransitive: "Try it if you want" : - ({@MV+} & ({} or (TO+ & Xc+))) or - ((O+ or or OX+) & {@MV+} & { or Pv+ or Pa**j+}) or + ( & ({} or (TO+ & Xc+))) or + ((O+ or or OX+) & & { or Pv+ or Pa**j+}) or ([[@MV+ & O*n+]]) or - [[CX- & {@MV+}]]; + [[CX- & ]]; want.v need.v: VERB_PLI(); need.i need'st: {@E-} & ((S- & ) or (RS- & B-)) & (N+ & I+); -wants.v needs.v: VERB_S_T(); +wants.v needs.v: VERB_S_T() or ; wanted.v-d needed.v-d: VERB_SPPP_T() or or - or + or or ; wanting.g needing.g: ( & ) or ; wanting.v needing.v: & ; : - ({@MV+} & {}) or - ((O+ or ) & {@MV+} & {}) or + ( & {}) or + ((O+ or ) & & {}) or ([[@MV+ & O*n+]]); choose.v: VERB_PLI(); @@ -5126,14 +5310,14 @@ VERB_PP() or ( & {}) or or - ({@MV+} & {} & ); + ( & {} & ); choosing.g: ( & ) or ; choosing.v: & ; % is identical to : - ({@MV+} & {}) or - ((O+ or ) & {@MV+} & {}) or + ( & {}) or + ((O+ or ) & & {}) or ([[@MV+ & O*n+]]); prepare.v press.v: VERB_PLI(); @@ -5142,13 +5326,12 @@ VERB_SPPP_T() or or - or or ; preparing.g pressing.g: ( & ) or ; preparing.v pressing.v: & ; : - ((O+ or ) & {@MV+} & {}) or + ((O+ or ) & & {}) or ({@MV+} & (TH+ or or TS+ or (SI*j+ & I*j+))) or Zs- or ([[@MV+ & O*n+]]); @@ -5159,14 +5342,13 @@ VERB_SPPP_T() or ( & { or TSi+}) or - or - or ({@MV+} & {} & ); + or ( & {} & ); requiring.g: ( & ) or ; requiring.v: & ; : ({@MV+} & (TH+ or Zs- or TS+ or )) or - ((O+ or ) & {@MV+} & {}) or + ((O+ or ) & & {}) or ([[@MV+ & {O*n+}]]); command.v order.v urge.v: VERB_PLI(); @@ -5182,12 +5364,12 @@ % ditransitive : ({@MV+} & (TH+ or Pg+)) or - ((O+ or or OX+) & {@MV+} & { or Pa+}) or - (((O+ & (B- or ({[[@MV+]]} & O*n+))) or ([[@MV+ & O*n+]])) & {@MV+}); + ((O+ or or OX+) & & { or Pa+}) or + (((O+ & (B- or ({[[@MV+]]} & O*n+))) or ([[@MV+ & O*n+]])) & ); consider.v: VERB_PLI(); considers.v: VERB_S_T(); considered.v-d: VERB_SPPP_T() or ( & -(({@MV+} & ( or Pa+)) or ({O+ or or [[@MV+ & O*n+]]} & {@MV+}))) +(({@MV+} & ( or Pa+)) or ({O+ or or [[@MV+ & O*n+]]} & ))) or ((({@MV+} & ( or Pa+)) 
or ({O+ or [[@MV+ & O*n+]]})) & ); considering.g: ( & ) or ; @@ -5195,19 +5377,19 @@ : ({@MV+} & (TH+ or )) or - ((O+ or or OX+) & {@MV+} & {}) or + ((O+ or or OX+) & & {}) or ([[@MV+ & O*n+]]); perceive.v: VERB_PLI(); perceives.v: VERB_S_T(); perceived.v-d: VERB_SPPP_T() or ( & {}) or - or ({@MV+} & {} & ); + or ( & {} & ); perceiving.g: ( & ) or ; perceiving.v: & ; : - ({@MV+} & {TH+ or Z- or }) or - ((O+ or ) & {@MV+} & {}) or + ( & {TH+ or Z- or }) or + ((O+ or ) & & {}) or ([[@MV+ & O*n+]]); report.v: VERB_PLI(); @@ -5216,43 +5398,42 @@ VERB_SPPP_T() or ( & { or Z-}) or - or - or ({@MV+} & {} & ); + or ( & {} & ); reporting.g: ( & ) or ; reporting.v: & ; : - ((O+ or ) & {@MV+} & {TH+ or or }) or - ({@MV+} & {TH+ or Zs-}) or + ((O+ or ) & & {TH+ or or }) or + ( & {TH+ or Zs-}) or ([[@MV+ & O*n+]]); caution.v: VERB_PLI(); cautions.v: VERB_S_T(); cautioned.v-d: VERB_SPPP_T() - or ( & ((O+ or ) & {@MV+} & {TH+ or or Zs- or })) - or ({@MV+} & {TH+ or or } & ); + or ( & ((O+ or ) & & {TH+ or or Zs- or })) + or ( & {TH+ or or } & ); cautioning.g: ( & ) or ; cautioning.v: & ; : - ((O+ or ) & {@MV+} & {TH+ or or or (OF+ & {@MV+})}) or - ({@MV+} & {TH+ or Zs- or (OF+ & {@MV+})}) or + ((O+ or ) & & {TH+ or or or (OF+ & )}) or + ( & {TH+ or Zs- or (OF+ & )}) or ([[@MV+ & O*n+]]); warn.v advise.v: VERB_PLI(); warns.v advises.v: VERB_S_T(); warned.v-d advised.v-d: VERB_SPPP_T() or - ( & {TH+ or or Zs- or or (OF+ & {@MV+})}) or - ({@MV+} & {TH+ or or or OF+} & ); + ( & {TH+ or or Zs- or or (OF+ & )}) or + ( & {TH+ or or or OF+} & ); warning.g advising.g: ( & ) or ; warning.v advising.v: & ; : - ((O+ or ) & {@MV+} & {I*j+ or Pg+}) or - ({@MV+} & {TH+ or Zs- or or (OF+ & {@MV+})}) or + ((O+ or ) & & {I*j+ or Pg+}) or + ( & {TH+ or Zs- or or (OF+ & )}) or ([[@MV+ & O*n+]]); hear.v: VERB_PLI(); @@ -5260,16 +5441,16 @@ heard.v-d: VERB_SPPP_T() or ( & {Pg+}) or - ({@MV+} & {Pg+} & ); + ( & {Pg+} & ); hearing.g: ( & ) or ; hearing.v: & ; % Xc+: poor comma choice: "The man that you 
saw laugh, jumped off a cliff". : - (( or O+) & {@MV+} & {I*j+ or Pg+ or AZ+ or Pv+}) + (( or O+) & & {I*j+ or Pg+ or AZ+ or Pv+}) or ( & {@MV+} & [I*j+ & Xc+]) - or ({@MV+} & {TH+ or Zs- or QI+ or }) + or ( & {TH+ or Zs- or QI+ or }) or ([[@MV+ & O*n+]]); see.v: VERB_PLI(); @@ -5279,42 +5460,57 @@ seen.v: VERB_PP() or ( & {Pg+ or AZ+}) or - ({@MV+} & {Pg+ or AZ+} & ); + ( & {Pg+ or AZ+} & ); seeing.g: ( & ) or ; seeing.v: & ; -% ditranstive verbs -- taking direct and indirect objects +% ditransitive verbs -- taking direct and indirect objects : - ( or - (B- & {O+}) or - ([[@MV+ & O*n+]])) & {@MV+}; + ( + or (B- & {O+}) + or ([[@MV+ & O*n+]]) + ) & ; -owe.v deliver.v accord.v award.v term.v grant.v begrudge.v assign.v rename.v repay.v -dub.v entitle.v fine.v: +owe.v deliver.v accord.v award.v term.v grant.v begrudge.v +assign.v rename.v repay.v dub.v entitle.v fine.v: VERB_PLI(); -owes.v delivers.v accords.v awards.v terms.v grants.v begrudges.v assigns.v renames.v -repays.v dubs.v entitles.v fines.v: +owes.v delivers.v accords.v awards.v terms.v grants.v begrudges.v +assigns.v renames.v repays.v dubs.v entitles.v fines.v: VERB_S_T(); -owed.v delivered.v accorded.v awarded.v + +owed.v-d delivered.v-d accorded.v-d awarded.v-d granted.v-d begrudged.v-d assigned.v-d repaid.v-d fined.v-d: - VERB_SPPP_T() or - ( & {O+ or or [[@MV+ & O*n+]]} & {@MV+}) or - ({O+ or [[@MV+ & O*n+]]} & ); + VERB_SPPP_T() + or ( & {O+ or or [[@MV+ & O*n+]]} & ) + or ({O+ or [[@MV+ & O*n+]]} & ); + owing.v delivering.v according.v awarding.v terming.v granting.v begrudging.v assigning.v renaming.v repaying.v dubbing.v entitling.v fining.v: - & ; + & ; + owing.g delivering.g according.g awarding.g terming.g granting.g begrudging.g assigning.g renaming.g repaying.g dubbing.g entitling.g fining.g: -( & ) or ; + ( & ) or ; % extended linking requirements based on the above termed.v-d dubbed.v-d entitled.v-d renamed.v-d: VERB_SPPP_T() or - ( & {O+ or or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or 
or [[@MV+ & O*n+]]} & ) or ({O+ or [[@MV+ & O*n+]]} & ) or ; +% intransitive: "we deliver!" +deliver.w: + VERB_PLI(); +delivers.w: + VERB_S_I(); + +% & Pa+: "It was delived broken" +delivered.w-d: + VERB_SPPP_I() + or ( & ({Xc+} & Pa+)); + % ditransitive % 'Give' requires both direct *and* indirect object: X gave Y a Z. % 'sent', 'poured': optional indirect object. @@ -5328,7 +5524,7 @@ or (O+ & K+) or (K+ & {[[@MV+]]} & O*n+) or ([[@MV+ & O*n+]]) - } & {@MV+}; + } & ; give.v send.v bring.v lend.v issue.v hand.v pour.v: VERB_PLI(); @@ -5339,20 +5535,20 @@ sent.v-d brought.v-d lent.v-d handed.v-d: VERB_SPPP_T() or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ); issued.v-d poured.v-d: VERB_SPPP_T() or or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ); gave.v-d: VERB_SP_T(); given.v: VERB_PP() or or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ); giving.g sending.g bringing.g @@ -5371,20 +5567,20 @@ or (O+ & K+) or (K+ & {{[[@MV+]]} & O*n+}) or - ([[@MV+ & O*n+]])} & {@MV+}; + ([[@MV+ & O*n+]])} & ; pass.v buy.v pay.v sell.v deal.v telegraph.v wire.v: VERB_PLI(); passes.v buys.v pays.v sells.v deals.v telegraphs.v wires.v: VERB_S_T(); % (S- & B-) : allows WV-less attach to "The dog which Chris bought is ugly" -% (({@E-} or {@EA-} or {@AN-}) & A+): "The telegraphed orders never arrived" +% : "The telegraphed orders never arrived" passed.v-d bought.v-d paid.v-d payed.v-d sold.v-d dealt.v-d telegraphed.v-d wired.v-d: VERB_SPPP_T() or (S- & ) or - (({@E-} or {@EA-} or {@AN-}) & A+) or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ); passing.g buying.g paying.g selling.g dealing.g telegraphing.g wiring.g: @@ -5403,7 +5599,7 @@ or 
or (O+ & (Pa+ or K+)) or (K+ & {{[[@MV+]]} & O*n+}) - or ([[@MV+ & O*n+]])} & {@MV+}) + or ([[@MV+ & O*n+]])} & ) or ; % This is not quite right: @@ -5412,13 +5608,13 @@ calls.v shouts.v: VERB_S_T(); called.v-d shouted.v-d: VERB_SPPP_T() - or ( & {O+ or or K+ or Pa+ or [[@MV+ & O*n+]]} & {@MV+}) + or ( & {O+ or or K+ or Pa+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or Pa+ or [[@MV+ & O*n+]]} & ); calling.g shouting.g: ( & ) or ; calling.v shouting.v: & ; -% Minimal ditransitive extenstion of words.v.6 +% Minimal ditransitive extension of words.v.6 % ditransitive: "Please paint it lime green" % (O+ & Pa+): "Please paint it green" : @@ -5431,7 +5627,7 @@ colors.v colours.v paints.v: VERB_S_T(); colored.v-d coloured.v-d painted.v-d: VERB_SPPP_T() - or ( & {O+ or K+ or Pa+} & {@MV+}) + or ( & {O+ or K+ or Pa+} & ) or ({K+} & ) or ; @@ -5445,7 +5641,7 @@ % ditransitive % Writing -- direct and indirect object are optional: % 'he wrote' 'he wrote a letter' 'he wrote me a letter' 'he wrote me' -% 'he wrote me that blah happend' but '*he drew me that blah happened' +% 'he wrote me that blah happened' but '*he drew me that blah happened' % % & TH+: "he wrote her that he loved her" : @@ -5454,7 +5650,7 @@ or (O+ & K+) or (K+ & {{[[@MV+]]} & O*n+}) or ([[@MV+ & O*n+]]) - } & {@MV+}) + } & ) or ({@MV+} & (TH+ or )); write.v charge.v draw.v: VERB_PLI(); @@ -5464,17 +5660,16 @@ read.v-d: VERB_SPPP_T() or ( & ) or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ); charged.v-d: VERB_SPPP_T() or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ); written.v-d drawn.v-d w/o.v-d: VERB_PP() or - ( & {O+ or or K+ or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or K+ or [[@MV+ & O*n+]]} & ) or ({O+ or K+ or [[@MV+ & O*n+]]} & ) or - or ; writing.v reading.v charging.v drawing.v: @@ -5491,7 +5686,7 @@ or (O+ & K+) or (K+ & 
{{[[@MV+]]} & O*n+}) or - ([[@MV+ & O*n+]])} & {@MV+}); + ([[@MV+ & O*n+]])} & ); sing.v dance.v cry.v: VERB_PLI(); sings.v dances.v cries.v: VERB_S_T(); sang.v-d danced.v-d cried.v-d: VERB_SP_T(); @@ -5509,30 +5704,29 @@ % ditransitive : (( or - ([[@MV+]] & O*n+)) & {@MV+}) or - ((O+ or ) & {@MV+} & {B- or }); + ([[@MV+]] & O*n+)) & ) or + ((O+ or ) & & {B- or }); allow.v: VERB_PLI(); allows.v: VERB_S_T(); allowed.v-d: VERB_SPPP_T() or - ( & (({O+ or or [[@MV+ & O*n+]]} & {@MV+}) or ({@MV+} & ))) + ( & (({O+ or or [[@MV+ & O*n+]]} & ) or ({@MV+} & ))) or ({O+ or [[@MV+ & O*n+]] or ({@MV+} & )} & ); allowing.g: ( & ) or ; allowing.v: & ; % ditransitive : - ({O+ or } & {@MV+} & {B- or or or TH+ or RSe+ or Zs-}) or + ({O+ or } & & {B- or or or TH+ or RSe+ or Zs-}) or (( or - ([[@MV+ & O*n+]])) & {@MV+}); + ([[@MV+ & O*n+]])) & ); promise.v: VERB_PLI(); promises.v: VERB_S_T(); promised.v-d: VERB_SPPP_T() - or ( & (({O+ or or [[@MV+ & O*n+]]} & {@MV+}) or ({@MV+} & ( or or TH+ or RSe+ or Zs-)))) + or ( & (({O+ or or [[@MV+ & O*n+]]} & ) or ({@MV+} & ( or or TH+ or RSe+ or Zs-)))) or - or or ({O+ or [[@MV+ & O*n+]] or ({{@MV+} & ( or or TH+)})} & ); promising.g: ( & ) or ; promising.v: & ; @@ -5545,16 +5739,16 @@ (O+ & K+) or (K+ & (B- or ({[[@MV+]]} & O*n+))) or ([[@MV+ & O*n+]]) or - [[()]]) & {@MV+}); + [[()]]) & ); -show.v: VERB_PLI(); -shows.v: VERB_S_T(); +show.v: VERB_PLI() or ; +shows.v: VERB_S_T() or ; showed.v-d: VERB_SP_T(); shown.v: VERB_PP() or or ( & - (({O+ or K+ or B- or [[@MV+ & O*n+]]} & {@MV+}) or + (({O+ or K+ or B- or [[@MV+ & O*n+]]} & ) or ({@MV+} & (QI+ or or TH+ or RSe+ or Zs-)))) or ({O+ or K+ or [[@MV+ & O*n+]] or ({@MV+} & (QI+ or or TH+))} & ); showing.g: ( & ) or ; @@ -5566,14 +5760,14 @@ or ({ or (B- & {[[@MV+]]} & O*n+) - or ([[@MV+ & O*n+]])} & {@MV+}); + or ([[@MV+ & O*n+]])} & ); teach.v: VERB_PLI(); teaches.v: VERB_S_T(); taught.v-d: VERB_SPPP_T() or ( & - (({O+ or or [[@MV+ & O*n+]]} & {@MV+}) or + (({O+ or or [[@MV+ & O*n+]]} & ) or 
({@MV+} & (QI+ or or TH+ or RSe+ or Zs- or )))) or ({O+ or [[@MV+ & O*n+]] or ({@MV+} & (QI+ or or TH+))} & ); teaching.g: ( & ) or ; @@ -5592,13 +5786,13 @@ : (((O+ or ) & (({@MV+} & ) or K+ or [()])) or (K+ & O*n+) or - ([[{K+} & @MV+ & O*n+]])) & {@MV+}; + ([[{K+} & @MV+ & O*n+]])) & ; force.v: VERB_PLI(); forces.v: VERB_S_T(); forced.v-d willed.v-d: VERB_SPPP_T() or - ( & ((K+ & {@MV+}) or + ( & ((K+ & ) or ({@MV+} & ))) or ((K+ or ({@MV+} & )) & ) or ; @@ -5608,24 +5802,30 @@ % ----------------------------------------- : - (B- & {@MV+} & {}) or - (O+ & {@MV+} & {}) or - ([[@MV+ & O*n+ & {@MV+}]]); + (B- & & {}) or + (O+ & & {}) or + ([[@MV+ & O*n+ & ]]); design.v permit.v authorize.v use.v cause.v enable.v pressure.v train.v sentence.v prompt.v spur.v disincline.v -invite.v reelect.v encourage.v draft.v hire.v entice.v inspire.v aid.v -forbid.v employ.v educate.v tempt.v condemn.v commission.v counsel.v induce.v instruct.v +invite.v reelect.v encourage.v draft.v hire.v entice.v inspire.v +aid.v forbid.v employ.v educate.v tempt.v condemn.v commission.v +counsel.v induce.v instruct.v license.v incite.v nominate.v destine.v provoke.v challenge.v exhort.v implore.v motivate.v impel.v: -VERB_PLI(); -designs.v permits.v pressures.v trains.v sentences.v causes.v enables.v -authorizes.v uses.v prompts.v spurs.v disinclines.v -invites.v reelects.v encourages.v drafts.v hires.v entices.v inspires.v aids.v -forbids.v employs.v educates.v tempts.v condemns.v commissions.v counsels.v induces.v + VERB_PLI() + or ( & O+ & Xc+); + +designs.v permits.v pressures.v trains.v sentences.v causes.v +enables.v authorizes.v uses.v prompts.v spurs.v disinclines.v +invites.v reelects.v encourages.v drafts.v hires.v entices.v +inspires.v aids.v forbids.v employs.v educates.v tempts.v +condemns.v commissions.v counsels.v induces.v instructs.v licenses.v incites.v nominates.v destines.v provokes.v challenges.v exhorts.v implores.v motivates.v impels.v: -VERB_S_T(); + VERB_S_T() + or ( & 
O+ & Xc+); + designed.v-d permitted.v-d pressured.v-d trained.v-d sentenced.v-d caused.v-d enabled.v-d authorized.v-d prompted.v-d spurred.v-d invited.v-d disinclined.v-d @@ -5636,11 +5836,12 @@ licensed.v-d incited.v-d nominated.v-d destined.v-d provoked.v-d challenged.v-d exhorted.v-d implored.v-d motivated.v-d impelled.v-d: - VERB_SPPP_T() or - ( & {}) or - or - or - ({{@MV+} & } & ); + VERB_SPPP_T() + or ( & {}) + or + or ( & O+ & Xc+) + or ({{@MV+} & } & ); + forbade.v-d: VERB_SP_T(); forbidden.v: VERB_PP() or @@ -5668,26 +5869,27 @@ & ; used.v-d: - VERB_SPPP_T() or - ( & {}) or - ( & ) or - ({@MV+} & {} & ) or - or - ; + VERB_SPPP_T() + or ( & {}) + or ( & ) + or + or ( & O+ & Xc+) + or ( & {} & ); + using.g: ( & ( or MVs-)) or ; % -------------------------------------------------- : - ((O+ or ) & (({@MV+} & {}) or + ((O+ or ) & (( & {}) or ({[[@MV+]]} & (O*n+ or TI+)))) or - ([[@MV+ & O*n+ & {@MV+}]]); + ([[@MV+ & O*n+ & ]]); elect.v appoint.v: VERB_PLI(); elects.v appoints.v: VERB_S_T(); elected.v-d appointed.v-d: VERB_SPPP_T() - or ( & (({O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]]} & {@MV+}) or ({@MV+} & ))) + or ( & (({O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]]} & ) or ({@MV+} & ))) or (({O+ or TI+ or [[@MV+ & (O*n+ or TI+)]]} or ({@MV+} & )) & ) or ; @@ -5700,14 +5902,14 @@ (( or (O+ & {[[@MV+]]} & TI+) or (B- & {O+ or TI+}) - or ([[@MV+ & O*n+]])) & {@MV+}) + or ([[@MV+ & O*n+]])) & ) or ; name.v designate.v label.v: VERB_PLI(); names.v designates.v labels.v: VERB_S_T(); named.v-d designated.v-d labelled.v-d labeled.v-d: VERB_SPPP_T() or - ( & ({O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]]}) & {@MV+}) or + ( & ({O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]]}) & ) or ({O+ or TI+ or [[@MV+ & (O*n+ or TI+)]]} & ) or ; naming.g designating.g labelling.g labeling.g: ( & ) or ; @@ -5719,7 +5921,7 @@ tags.v: VERB_S_T(); tagged.v-d: VERB_SPPP_T() or - (( & ({O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]]}) & {@MV+}) or ) or + (( & ({O+ or or TI+ or [[@MV+ & (O*n+ or TI+)]]}) & ) 
or ) or ({O+ or TI+ or [[@MV+ & (O*n+ or TI+)]]} & ) or ; tagging.g: @@ -5729,9 +5931,9 @@ tagging.v: & ; : - {((O+ or ) & {@MV+} & {}) or + {((O+ or ) & & {}) or @MV+ or - ([[@MV+ & O*n+ & {@MV+}]])}; + ([[@MV+ & O*n+ & ]])}; program.v oblige.v: VERB_PLI(); programs.v obliges.v: VERB_S_T(); @@ -5744,14 +5946,14 @@ programing.v programming.v obliging.v: & ; : - ((O+ or ) & {@MV+} & { or TH+ or }) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & & { or TH+ or }) or + ([[@MV+ & O*n+ & ]]); convince.v persuade.v: VERB_PLI(); convinces.v persuades.v: VERB_S_T(); convinced.v-d persuaded.v-d: VERB_SPPP_T() or ( & { or TH+ or }) or - ({{@MV+} & ( or TH+ or )} & ); + ({ & ( or TH+ or )} & ); convincing.g persuading.g: ( & ) or ; convincing.v persuading.v: & ; @@ -5764,11 +5966,11 @@ % : (((O+ & {O*n+ or K+}) or ) - & {@MV+} & {TH+ or RSe+ or Zs- or or QI+ or BW-}) - or ({O+ & {@MV+}} & ) + & & {TH+ or RSe+ or Zs- or or QI+ or BW-}) + or ({O+ & } & ) or OF+ or (QI+ & {MV+}) - or ([[@MV+ & {O*n+} & {@MV+}]]); + or ([[@MV+ & {O*n+} & ]]); tell.v: VERB_PLI(); tell.w: {@E-} & I- & {@MV+} & (QI+ or TH+ or or RSe+ or Zs-) & ; @@ -5776,7 +5978,7 @@ told.v-d: VERB_SPPP_T() or ( & {TH+ or or RSe+ or Zs- or or QI+ or BW-}) - or ( & O+ & {@MV+}) + or ( & O+ & ) or ({{@MV+} & ( or or QI+ or TH+)} & ); telling.g: ( & ) or ; @@ -5787,47 +5989,50 @@ % (QI+ & {MV+}): "I did not ask why until recently" : - ({(O+ & {O*n+}) or } & {@MV+} + ({(O+ & {O*n+}) or } & & {TS+ or or (QI+ & {MV+}) or BW-}) - or ([[@MV+ & O*n+ & {@MV+}]]); + or ([[@MV+ & O*n+ & ]]); ask.v: VERB_PLI(); asks.v: VERB_S_T(); asked.v-d: VERB_SPPP_T() or ( & { or QI+ or BW- or TH+ or TS+}) - or ( & O+ & {@MV+}) + or ( & O+ & ) or ({{@MV+} & ( or QI+ or TH+ or TS+)} & ); asking.g: ( & ) or ; asking.v: & ; +% TH+: "it helps that you know already" : - ({O+ or } & {@MV+} & { or I+}) or - ([[@MV+ & O*n+ & {@MV+}]]); + ({O+ or } & {@MV+} & { or I+}) + or [[@MV+ & O*n+ & ]]; help.v: VERB_PLI(); -helps.v: VERB_S_T(); -helped.v-d: 
VERB_SPPP_T() or ( & {}) or -({{@MV+} & } & ); +helps.v: VERB_S_T( or TH+); +helped.v-d: + VERB_SPPP_T( or TH+) + or ( & {}) + or ({{@MV+} & } & ); helping.g: ( & ) or ; helping.v: & ; : - ((O+ or ) & {@MV+} & ( or TH+ or or (OF+ & {@MV+}))) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & {@MV+} & ( or TH+ or or (OF+ & ))) or + ([[@MV+ & O*n+ & ]]); remind.v: VERB_PLI(); reminds.v: VERB_S_T(); reminded.v-d: VERB_SPPP_T() or ( & { or TH+ or or -(OF+ & {@MV+})}) or ({{@MV+} & ( or or TH+ or -(OF+ & {@MV+}))} & ); +(OF+ & )}) or ({{@MV+} & ( or or TH+ or +(OF+ & ))} & ); reminding.g: ( & ) or ; reminding.v: & ; : - ((O+ or ) & {@MV+} & {(OF+ & {@MV+}) or TH+ or Zs- or }) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & & {(OF+ & ) or TH+ or Zs- or }) or + ([[@MV+ & O*n+ & ]]); inform.v reassure.v alert.v guarantee.v notify.v forewarn.v: VERB_PLI(); @@ -5838,7 +6043,7 @@ informed.v-d reassured.v-d alerted.v-d guaranteed.v-d notified.v-d forewarned.v-d: VERB_SPPP_T() - or ( & { or TH+ or Zs- or (OF+ & {@MV+})}) + or ( & { or TH+ or Zs- or (OF+ & )}) or ({{@MV+} & ( or TH+ or OF+)} & ) or ; @@ -5849,41 +6054,41 @@ & ; : - ((O+ or ) & {@MV+} & {(OF+ & {@MV+}) or TH+ or Zs- or }) or - ([[@MV+ & O*n+ & {@MV+}]]) or + ((O+ or ) & & {(OF+ & ) or TH+ or Zs- or }) or + ([[@MV+ & O*n+ & ]]) or ({@MV+} & (TH+ or )); assure.v: VERB_PLI(); assures.v: VERB_S_T(); assured.v-d: VERB_SPPP_T() - or ( & {(OF+ & {@MV+}) or or TH+ or Zs-}) + or ( & {(OF+ & ) or or TH+ or Zs-}) or ({{@MV+} & ( or TH+ or OF+)} & ); assuring.g: ( & ) or ; assuring.v: & ; : - ((O+ or ) & {@MV+} & {I+ or ((K+ or Pp+) & {@MV+})}) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & & {I+ or ((K+ or Pp+) & )}) or + ([[@MV+ & O*n+ & ]]); let.v-d: VERB_SPPP_T() or ( & ) or - ( & ((K+ or Pp+) & {@MV+})) or + ( & ((K+ or Pp+) & )) or ((K+ or Pp+) & ); lets.v: VERB_S_T(); letting.g: ( & ) or ; letting.v: & ; % Abbreviation for "let us" -% Is there any reason to create a defintion such as 's.n: Ox-? 
+% Is there any reason to create a definition such as 's.n: Ox-? let's let’s: ({Ic-} & Wi- & {N+} & I+) or ({Ic-} & Wi- & N+); : - ((O+ or ) & {@MV+} & {I*j+ or Pg+}) or - ([[@MV+ & O*n+ & {@MV+}]]) or - {@MV+}; + ((O+ or ) & & {I*j+ or Pg+}) or + ([[@MV+ & O*n+ & ]]) or + ; watch.v: VERB_PLI(); watches.v: VERB_S_T(); @@ -5896,8 +6101,8 @@ watching.v: & ; : - ((O+ or ) & {@MV+} & {Pg+}) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & & {Pg+}) or + ([[@MV+ & O*n+ & ]]); appreciate.v spend.v: VERB_PLI(); appreciates.v spends.v: VERB_S_T(); @@ -5912,34 +6117,34 @@ % ditransitive : ((O+ or or OX+) & { - ({@MV+} & {I*j+ or Pa**j+ or B-}) - or ((K+ or AF-) & {@MV+})}) + ( & {I*j+ or Pa**j+ or B-}) + or ((K+ or AF-) & )}) or (( or (K+ & {[[@MV+]]} & O*n+) or K+ or Vm+ or ([[{K+} & @MV+ & O*n+]]) - ) & {@MV+}) + ) & ) or [[()]]; make.v: VERB_PLI(); makes.v: VERB_S_T(); made.v-d: VERB_SPPP_T() - or ( & (({@MV+} & Pa+) or ({O+ or K+} & {@MV+}))) + or ( & (( & Pa+) or ({O+ or K+} & ))) or ({({@MV+} & Pa+) or K+} & ); built_of built_up_of composed_of constructed_of formed_of made_of made_up_of: - ( & (O+ or ) & {@MV+}) or (O+ & ); + ( & (O+ or ) & ) or (O+ & ); making.g: ( & ) or ; making.v: & ; : (((O+ or ) & {({@MV+} & Pa+) or AF-}) or - ([[@MV+ & O*n+]])) & {@MV+}; + ([[@MV+ & O*n+]])) & ; render.v deem.v: VERB_PLI(); renders.v deems.v: VERB_S_T(); @@ -5950,7 +6155,7 @@ : (((O+ or ) & {{@MV+} & OF+}) or - ([[@MV+ & O*n+]])) & {@MV+}; + ([[@MV+ & O*n+]])) & ; deprive.v accuse.v acquit.v purge.v disabuse.v exonerate.v absolve.v rob.v convict.v: VERB_PLI(); @@ -5959,7 +6164,7 @@ deprived.v accused.v acquitted.v purged.v disabused.v exonerated.v absolved.v robbed.v convicted.v-d: VERB_SPPP_T() or - ( & {OF+} & {@MV+}) or + ( & {OF+} & ) or or ({{@MV+} & OF+} & ); depriving.g accusing.g acquitting.g purging.g disabusing.g exonerating.g @@ -5972,38 +6177,38 @@ (((O+ or ) & {({@MV+} & OF+) or K+}) or ({K+} & O*n+) or K+ or - ([[{K+} & @MV+ & O*n+]])) & {@MV+}; + ([[{K+} & @MV+ & 
O*n+]])) & ; clear.v: VERB_PLI(); clears.v: VERB_S_T(); cleared.v-d: VERB_SPPP_T() or - ( & {({@MV+} & OF+) or K+} & {@MV+}) or + ( & {({@MV+} & OF+) or K+} & ) or ({K+ or ({@MV+} & OF+)} & ); clearing.g: ( & ) or ; clearing.v: & ; : - ({(O+ & {O*n+}) or ( & {O+})} & {@MV+} & {TH+ or or RSe+}) or - ([[@MV+ & O*n+ & {@MV+}]]); + ({(O+ & {O*n+}) or ( & {O+})} & & {TH+ or or RSe+}) or + ([[@MV+ & O*n+ & ]]); bet.v-d: VERB_SPPP_T() or ( & ) or or - ( & {O+ or } & {@MV+} & {TH+ or or RSe+ or @MV+}); + ( & {O+ or } & & {TH+ or or RSe+ or @MV+}); bets.v: VERB_S_T(); betted.v-d: VERB_SPPP_T() or - ( & {O+ or } & {@MV+} & {TH+ or or RSe+ or @MV+}) or + ( & {O+ or } & & {TH+ or or RSe+ or @MV+}) or ({O- or [[@MV+ & O*n+]] or TH+ or } & ); betting.g: ( & ) or ; betting.v: & ; : ({@MV+} & ) or - ((O+ or ) & {@MV+} & {THi+}) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & & {THi+}) or + ([[@MV+ & O*n+ & ]]); bother.v: VERB_S_PLI(); bothers.v: VERB_S_S(); @@ -6012,7 +6217,7 @@ bothering.g: ( & ) or ; : - ((O+ or ) & {@MV+} & {THi+}) or + ((O+ or ) & & {THi+}) or ([[@MV+ & O*n+]]); surprise.v alarm.v amaze.v amuse.v annoy.v @@ -6041,8 +6246,8 @@ ( & ) or ; : - ((O+ or or [[@MV+ & O*n+]]) & {@MV+}) or - ((O+ or or OX+) & {@MV+} & ( or [[Pa+]])) or + ((O+ or or [[@MV+ & O*n+]]) & ) or + ((O+ or or OX+) & {@MV+} & ( or [[{Xc+} & Pa+]])) or ({@MV+} & ( or TH+ or or RSe+ or Zs- or (Pa+ & ))); prove.v: VERB_Y_PLI(); @@ -6050,18 +6255,18 @@ proved.v-d: VERB_Y_SPPP() or ( & {THi+ or }) - or + or ( & {dCPu-} & {MV+}) or ({{@MV+} & Pa+} & ); proven.v: ( & ) or - ( & {THi+ or or Pa+}) or + ( & {THi+ or or Pa+ or dCPu-}) or or ({{@MV+} & Pa+} & ); proving.g: ( & ) or ; proving.v: & ; : - ((O+ or or [[@MV+ & O*n+]]) & {@MV+}) or + ((O+ or or [[@MV+ & O*n+]]) & ) or ({@MV+} & (Pg+ or TH+ or or RSe+ or Zs- or TS+ or ((SI*j+ or SFI**j+) & I*j+))); suggest.v anticipate.v recommend.v: VERB_PLI(); @@ -6070,7 +6275,6 @@ VERB_SPPP_T() or ( & {THi+ or TSi+ or Z-}) or - or or ; suggesting.g 
anticipating.g recommending.g: ( & ) or ; suggesting.v anticipating.v recommending.v: & ; @@ -6079,22 +6283,22 @@ : (( or (B- & {O+}) or - [[@MV+ & O*n+]]) & {@MV+}) or + [[@MV+ & O*n+]]) & ) or ({@MV+} & (Pg+ or TH+ or or RSe+)); deny.v: VERB_PLI(); denies.v: VERB_S_T(); denied.v-d: VERB_SPPP_T() or - ( & {O+ or or [[@MV+ & O*n+]]} & {@MV+}) or + ( & {O+ or or [[@MV+ & O*n+]]} & ) or ({O+ or ([[@MV+ & O*n+]])} & ); denying.g: ( & ) or ; denying.v: & ; : - ((O+ or ) & {@MV+} & {AZ+}) or + ((O+ or ) & & {AZ+}) or ({@MV+} & (QI+ or Z-)) or - ([[@MV+ & O*n+ & {@MV+}]]); + ([[@MV+ & O*n+ & ]]); describe.v: VERB_PLI(); describes.v: VERB_S_T(); @@ -6102,14 +6306,13 @@ VERB_SPPP_T() or ( & {AZ+ or Z-}) or or - or - ({@MV+} & {AZ+} & ); + ( & {AZ+} & ); describing.g: ( & ) or ; describing.v: & ; : - ((O+ or ) & {@MV+} & {AZ+}) or - ([[@MV+ & O*n+ & {@MV+}]]); + ((O+ or ) & & {AZ+}) or + ([[@MV+ & O*n+ & ]]); portray.v depict.v regard.v view.v characterize.v: VERB_PLI(); portrays.v depicts.v regards.v views.v characterizes.v: VERB_S_T(); @@ -6117,8 +6320,7 @@ VERB_SPPP_T() or ( & {AZ+}) or or - or - ({@MV+} & {AZ+} & ); + ( & {AZ+} & ); portraying.g depicting.g regarding.g viewing.g characterizing.g: ( & ) or ; portraying.v depicting.v regarding.v viewing.v characterizing.v: @@ -6170,7 +6372,7 @@ seeing_fit letting_go taking_note coming_true coming_clean coming_of_age: ( & ) or ; -: (O+ or or [[@MV+ & O*n+]]) & {@MV+}; +: (O+ or or [[@MV+ & O*n+]]) & ; allow_for bring_about get_rid_of let_go_of take_note_of: VERB_PLI(); puts_up_with allows_for brings_about gets_rid_of lets_go_of @@ -6220,7 +6422,7 @@ given_up gone_around: & ; keeping_on giving_up going_around: ( & ) or ; -: Pg+ or Pa+ or ({AF-} & {@MV+}); +: Pg+ or Pa+ or ({AF-} & ); end_up: VERB_S_PLI(); ends_up: VERB_S_S(); ended_up: VERB_S_SPPP(); @@ -6231,6 +6433,7 @@ /en/words/words.v.1.p: or + or or ; % ----------------------------------------------------------------- @@ -6240,11 +6443,11 @@ % The naked Qd+, without a 
WV+, links to subj-verb-inverts: "are you % insane?", "Are you the one?" % XXX everywhere where Ws+ is used, should probably be !? -: Wa+ or Wi+ or Wn+ or Ww+ or Qd+; -: (Wd+ or Wp+ or Wr+ or Wq+ or Ws+ or Wj+ or Wc+ or We+ or Wt+ or Wo+) & ; +: hWa+ or hWi+ or hWn+ or hWw+ or hQd+; +: (hWd+ or hWp+ or hWr+ or hWq+ or hWs+ or hWj+ or hWc+ or hWe+ or hWt+ or hWo+) & ; % Paraphrasing, quotational complements: -: [()]; +: [()]0.1; % Quote with or without quotation marks. % "This is a test," she said. @@ -6257,7 +6460,7 @@ % QU+ & & QU+: He said, "This is it." % Xc+ or Xe+ or [[()]]: punctuation is commonly missing. : - ({@MV+} & (Xc+ or Xp+) & CP-) + ({@MV+} & (Xc+ or Xp+ or ) & CP-) or ({@MV+} & ((Xd- or Xq-) & (Xc+ or Xp+ or ) & (COq+ or CP- or Eq+ or ))) or [{@MV+} & (Xc+ or Xe+ or [[()]]) & ] @@ -6265,9 +6468,11 @@ & QUd+ & ( or ) & {X+} & QUc+); % Xd- & Xc+: "If I'm right, he thought, this will work." +% CPa- & Xc+: "So thinks everyone" : {@MV+} & (((Xd- or Xq-) & (Xc+ or Xp+ or ) & (COq+ or CPx- or Eq+ or )) + or (CPa- & Xc+) or [(Xc+ or Xe+) & ]); % filler-it: "The President is busy, it seems." @@ -6282,14 +6487,16 @@ [[{@E-} & (((Sp- or I-) & & ) or (SIpj+ & ))]]; /en/words/words.v.10.2: - [[{@E-} & ((Ss- & & ) or (SIsj+ & ))]]; + [[{@E-} & Ss- & & ]] + or [[{@E-} & SIsj+ & ]]; % XXX Why is there a cost on Pvf- ??? 
/en/words/words.v.10.3: {@E-} & ( ((S- or PP-) & & ) or (SI*j+ & ) - or [Pvf- & ]); + or [Pvf- & ] + or (Pv- & CV- & dCPu-)); read.q-d: {@E-} & (((S- or I- or PP-) & & ) or (SI*j+ & )); @@ -6297,12 +6504,15 @@ wrote.q-d: {@E-} & ((S- & & ) or (SI*j+ & )); -written.q: {@E-} & PP- & ; +written.q: + {@E-} & ((PP- & ) or (Pv- & CV- & dCPu-)); /en/words/words.v.10.4: [[{@E-} & Pg- & ]]; seem.q appear.q: [[{@E-} & (SFp- or If-) & ]]; -seems.q appears.q: [[{@E-} & SFs- & ]]; +seems.q appears.q: + [[{@E-} & SFs- & ]] + or ({@E-} & Pv- & CV- & dCPu-); seemed.q-d appeared.q-d: {@E-} & (SF- or PPf-) & ; seeming.q appearing.q: [[{@E-} & Pgf- & ]]; @@ -6313,7 +6523,10 @@ {@E-} & ((Ss- & & ) or (SIsj+ & )); said.q-d: - {@E-} & (((S- or PP-) & & ) or (SI*j+ & )); + {@E-} & ( + ((S- or PP-) & & ) + or (SI*j+ & ) + or (Pv- & CV- & dCPu-)); saying.q: {@E-} & Pg- & ; @@ -6341,7 +6554,7 @@ % idiomatic "voted yes/no" expressions using the V link. % "he answered yes", "say yes!", "Just say no!" -: {Xc+} & Vv+ & {@MV+}; +: {Xc+} & Vv+ & ; answer.w reply.w say.w vote.w: VERB_PLI(); answers.w replies.w says.w votes.w: VERB_S_I(); answered.w-d replied.w said.w-d voted.w-d: VERB_SPPP_I(); @@ -6400,9 +6613,13 @@ % (using MVp-) is generally preferred. The cost is small, though, % to allow modifiers on conjoined nouns to work well. % e.g. "...went to hell yesterday and heaven on Tuesday" +% +% [Mp- & MVp-]-0.61 prefers a connection to both the noun and the +% verb, helping disambiguate. The weight 0.61 plus 0.4 is greater +% than one, to overcome cost on @MV+. : - or [Mp-]0.4 or Pp- or MVp- + or [Mp-]0.4 or Pp- or MVp- or [Mp- & MVp-]-0.61 or [({Xc+ & {Xd-}} & CO+)] or (Xd- & Xc+ & (MX*x- or MVx-)); @@ -6446,7 +6663,7 @@ or [MVp- & B-]; % -% XXX fixme: MVp- & J+ is wrong: "*I saw John except Fred" +% XXX FIXME: MVp- & J+ is wrong: "*I saw John except Fred" % XXX (The MVp- coming from prep-main-a) % So give J+ a cost, to minimize this, for now ... % Could this be fixable in postprocessing? 
@@ -6495,6 +6712,11 @@ into: ({JQ+} & (J+ or Mgp+ or QI+) & ) or [MVp- & B-]; +% re: "with regard to" +% Xe+: optional colon +re: + {Xe+} & J+ & ; + % cost on MVa- to give preference to MVl- about: ({JQ+} & (J+ or Mgp+ or QI+) & ) @@ -6506,6 +6728,7 @@ or [MVa-] or (MVl- & (MVp+ or MVa+ or MVs+)); + % Grep also for "just_about", used as syonym for "nearly", "almost" % XXX is this really needed ?? Seems to duplicate other stuff .. just_about nearly_about almost_about right_about: @@ -6611,21 +6834,30 @@ or [({Xc+ & {Xd-}} & CO+)] or (Xd- & Xc+ & (E+ or MVx-))); +% o': silent, un-pronouned f. % (OFd- & Jd+): "I have a lot of cookies", forces "lot" to be determiner; % The Jd+ forces a link to the object as well. % Wj- & JQ+ & J+ & Qd+: "Of which person were you speaking?" % QI+ & CV+: "She kept an organized record of which employees took their vacations" % The QI makes it interrogative, the CV links head word. % (Js+ or Jp+ or Ju+): we explicitly exclude Jw+ as that leads to bad parses. -of: +% +% Mf-: allows "from the Abbey of Stratford Langthorne" so that "of" +% links to "Abbey" instead of something more distant. +% XXX The Mp- below should be removed, and all occurrences of +% Mp+ elsewhere should be replaced by (Mp+ or Mf+) +% Mf- & MVp+: "She was a girl of about John's age" +of o': ({JQ+} & (Js+ or Jp+ or Ju+ or Mgp+ or (QI+ & {CV+})) & (Mp- + or Mf- or OFj- or OFw- or (Xd- & Xc+ & MX*x-) or ( & (Mj- or (Xd- & Xc+ & MX*j-))) or [[({Xc+ & {Xd-}} & CO+)]])) + or (Mf- & MVp+) or (Wj- & JQ+ & J+ & Qd+) or (OFd- & Jd+) or ((OFj- or Mp-) & B-) @@ -6637,7 +6869,7 @@ % MX-PHRASE: The blah, to be blahed, will be blah. % TO- & Xc+: "I'd like to, I want to." (null infinitive) -% give [J+] a cost, so that numeric intervals are peferred +% give [J+] a cost, so that numeric intervals are preferred % I*t+ & TO-: passes on the TO constraint down the line % I+ & MVi-: allows "What is there to do?" 
% but also incorrectly allows: "He is going to do" @@ -6670,8 +6902,7 @@ besides: {J+ or Mgp+} & ([({Xc+ & {Xd-}} & CO+)] or MVp- or ); throughout: {J+} & ([({Xc+ & {Xd-}} & CO+)] or MVp- or ); -versus: (J+ & Mp-) or (G- & G+); -vs: {Xi+} & G- & G+; +versus v. vs. vs: (J+ & Mp-) or (G- & G+); worth.p: (Mp- & (J+ or OF+)) or (Paf- & Mgp+) or (Pa- & (J+ or B-)); opposite.p: J+ & ; @@ -6708,15 +6939,17 @@ regardless_of as_to irrespective_of: (J+ or QI+) & (MVp- or [({Xc+ & {Xd-}} & CO+)] or (Xd- & Xc+ & (E+ or MVx-))); -as_usual to_date on_average in_turn so_far -in_particular in_response in_general thus_far in_reply: +as_yet to_date so_far thus_far as_usual on_average +in_general in_particular in_response in_reply in_turn: ; +% Mp- & QI+: "decisions such as when to go are taken by the instructor." such_as: - J+ & ( + (J+ & ( MVa- or Mp- - or (Xc+ & Xd- & (MVx- or MX*x-))); + or (Xc+ & Xd- & (MVx- or MX*x-)))) + or (Mp- & QI+); lest: ( or Mgp+ or Mv+) & ( @@ -6733,10 +6966,16 @@ % -------------------------------------------------------- % Preps that specify time-like relations +% +% ({[Mp-]-0.09} & MVp-): If we can identify both the head verb, +% and the head noun, then do so. It is tempting to think that these +% should modify the verb, only, but conjunctions prove otherwise: +% "... went to hell recently, and heaven before that." shows that +% "recently" has to modify "hell", and not "went". recently: {EE- or EF+} & ( - ({Xd- & Xc+} & MVp-) + ({Xd- & Xc+} & {[Mp-]-0.09} & MVp-) or Pp- or E+ or ({Xc+ & {Xd-}} & CO+) @@ -6747,33 +6986,41 @@ or [[Mp-]]); % Wc- & Qd+: "Now, am I right?" +% MJr-: "when, if not now, do you want to do it?" now.r: - ({Xd- & Xc+} & MVp-) + ({Xd- & Xc+} & {[Mp-]-0.09} & MVp-) or Pp- or E+ or ({Xc+ & {Xd-}} & CO+) or (Wc- & (Xc+ or [()]) & Qd+) or EB- + or MJr- or [[Mp-]]; % Wc- & Qd+: "Then, am I right?" 
+% {Xd-} & MVs- & Xs- & : "I eat, then I sleep" +% JT+ & CO+: "then last week, I changed my mind" +% JT+: "if not next Tuesday, then when do you want to do it?" then.r: - ({Xd- & Xc+} & MVp-) + ({Xd- & Xc+} & {[Mp-]-0.09} & MVp-) or Pp- or E+ - or ({Xc+ & {Xd-}} & CO+) + or ({JT+} & {Xc+ & {Xd-}} & CO+) + or JT+ or (Wc- & (Xc+ or [()]) & Qd+) or EB- or (S+ & Xd- & Xc+ & MVs-) + or ({Xd-} & MVs- & Xs- & ) or [[Mp-]]; % Wt-: "Later." (all by itself) but also: "Later, he left" +% [Mp-]0.4: see notes above ; prefer MVb- when possible. later earlier: ({ECa- or Yt-} & (E+ or - Mp- or + [Mp-]0.4 or Pp- or - MVb- or + ({[Mp-]-0.09} & MVb-) or (Wt- & {Xc+}) or [({Xc+ & {Xd-}} & CO+)] or (Xd- & Xc+ & (MX*x- or MVx-)) or @@ -6823,7 +7070,7 @@ % SFpp+: "nearby are more ruins" % {EE-} & {Xc+} & COp+: "{somewhere} nearby, a mouse scratched" % {EE-} & FM-: "The ticking came from {somewhere} nearby." -nearby close_by: +nearby close_by handy.r: A+ or MVp- or Pp- @@ -6836,9 +7083,12 @@ {J+} & (Mp- or Pp- or MVp- or [({Xc+ & {Xd-}} & CO+)] or FM-); % Consider "Here's the ball." We have two choices: SFst+ as a filler-it, -% or the more questionable . Maybe PF is reasonable.. but -% SFst seems better at the moment. -here: J- or or (SFst+ & ) or ; +% or with subject-verb inversion. Both seem reasonable. +here: + J- + or + or [dSFst+ & ]0.15 + or ; % Wi-: [come] Over here! over_here: Wi-; @@ -6846,15 +7096,16 @@ % EN- & Pp-: "you are halfway there" % EN- & J-: "we stopped about halway there" % Wi-: "There!" -% Wd- & PFt+: "there lay the ball"; the PFt+ prevents connections to +% Wp- & PFt+: "there lay the ball"; the PFt+ prevents connections to % the PFb- on . 
-there.r: +% "there the remains can be found" +there.r thither: J- or - or ((SFst+ or SFp+ or SFut+) & ) - or SFIst- - or SFIp- - or (Wp- & PFt+) + or [(dSFst+ or dSFp+ or dSFut+) & ].15 + or [dSFIst-].15 + or [dSFIp-].15 + or or OXt- or (EN- & (Pp- or J-)) or Wi-; @@ -6882,8 +7133,8 @@ or (J+ & ( or FM- or )); -ahead_of by_way_of akin_to betwixt vis-a-vis in_lieu_of on_account_of -in_place_of in_search_of: +ahead_of by_way_of akin_to betwixt vis-a-vis vis-à-vis cf. +in_lieu_of on_account_of in_place_of in_search_of: or (J+ & ( or )); @@ -7045,7 +7296,7 @@ or AN+ or Wa-; -% The naked ND- can occur with tiem intervals: +% The naked ND- can occur with time intervals: % "I can't decide between 7:30AM and 9:30AM" % AM.ti PM.ti am.ti pm.ti a.m. p.m. o'clock: /en/words/units.5: @@ -7144,23 +7395,29 @@ % {Dmc- or @M+}: avoid used Dmc together with M when parsing sentences like % "She is two years older than me" -days.n weeks.n moments.n hours.n minutes.n years.n instants.n periods.n months.n -nights.n seconds.n decades.n centuries.n: - {NM+} & (( & - (({Dmc- or @M+} & {WN+ or TH+ or or (R+ & Bp+)} & {@MXp+} & - ( or - or - )) or - Up- or - (YP+ & {Dmc-}) or - (GN+ & (DD- or [()])))) or - [[AN+]]); +% ND- & A- & D- & Jp-: "we walked for a further three hours" +days.n weeks.n moments.n hours.n minutes.n years.n instants.n +periods.n months.n nights.n seconds.n decades.n centuries.n: + ({NM+} & (( & + (({Dmc- or @M+} & {WN+ or TH+ or or (R+ & Bp+)} & {@MXp+} & + ( or + or + )) or + Up- or + (YP+ & {Dmc-}) or + (GN+ & (DD- or [()])))) or + [[AN+]])) or + (ND- & A- & D- & Jp-); % XXX A major problem here is that the dict entries for miles.n, feet.n -% create a mass of parses that are wrong & interfere with the below. +% create a bunch of parses that are wrong & interfere with the below. 
+% Jp-: "we walked for three kilometers" +% ND- & A- & D- & Jp-: "we walked for a further three kilometers" : -((ND- or [()] or [[EN-]]) & (Yd+ or Ya+ or EC+ or [[MVp-]] or OD-)) or -(ND- & (NIfu+ or NItu- or EQt+ or EQt-)); + ((ND- or [()] or [[EN-]]) & (Yd+ or Ya+ or EC+ or [[MVp-]] or OD-)) + or ((ND- or [()]) & Jp-) + or (ND- & A- & D- & Jp-) + or (ND- & (NIfu+ or NItu- or EQt+ or EQt-)); % AU is abbreviation for "astronomical units" blocks.i feet.i miles.i yards.i inches.i @@ -7195,26 +7452,29 @@ % Add cost to Op-, try to use any other linkage before making % a unit be a plain-old object. +% A- & ND-: "200 square ft of plywood" : -((ND- or NS- or NIe-) & (NIfu+ or NItu-)) or -((ND- or NS- or NIe-) & (AN+ or EQt+ or EQt-)) or -((ND- or NS- or NIe-) & (DD- or EN-) & {Wd-} & ({Mp+} & Sp+ )) or -((ND- or NS- or NIe-) & ([[{DD-} & Op-]] or Jp-) & {Mp+}) or -((ND- or NS- or NIe-) & Xd- & MX- & Xc+) or -((ND- or NS-) & {NJ-} & (EC+ or Y+ or OD- or (Us- & {Mp+}))) or Us-; + ({A-} & (ND- or NS- or NIe-) & (NIfu+ or NItu-)) or + ({A-} & (ND- or NS- or NIe-) & (AN+ or EQt+ or EQt-)) or + ({A-} & (ND- or NS- or NIe-) & {DD- or EN-} & {Wd-} & {Mp+} & Sp+) or + ({A-} & (ND- or NS- or NIe-) & ([{DD-} & Op-] or Jp-) & {Mp+}) or + ({A-} & (ND- or NS- or NIe-) & Xd- & MX- & Xc+) or + ((ND- or NS-) & {NJ-} & (EC+ or Y+ or OD- or (Us- & {Mp+}))) or + Us-; % Abbreviations of scientific units that follow numbers % km².u mi².u in².u ft².u m².u cm².u /en/words/units.1: ; -UNITS: ; +/en/words/units.3: ; +: ; + +% Allows "200 sq. ft. of plywood", "200 cu yds of concrete" +/en/words/units.a: A+; % Units abbreviations that can be followed by a period: % ft. tbsp. yds. /en/words/units.1.dot: {Xi+} & ; -% Abbreviations that form noun-like things "degrees_Fahrenheit" etc. 
-/en/words/units.3: ND- & ; - % Time unit abbreviations: : or ((ND- or NS-) & {NJ-} & OT-); /en/words/units.4: ; @@ -7257,6 +7517,7 @@ who: (R- & (({MVp+ or MVx+} & RS+) or )) or [QI-] + or SJl+ or SJr- or Jw- or ({EL+} & ((S**w+ & {Bsw+}) or (R+ & B*w+)) & {EW-} & (Ws- or Wq- or QI*d- or BIqd-)) or ({EL+ & {N+}} & Wd-) @@ -7285,6 +7546,7 @@ or (D+ & JQ-) or Jw- or [QI-] + or SJl+ or SJr- or (Xc+ & Ic+); % QI- & (): "I do not know which" @@ -7365,9 +7627,13 @@ or ({Xd- & Xc+} & MVs-))) or ({EW-} & Ww- & Qw+)); -although in_as_much_as whilst whereas whereof wherein: +% Comparative-opener: "although a good worker, he's not a very good manager" +: (O*c+ & {Xc+ & {Xd-}} & COc+); + +although in_as_much_as whereas whereof wherein: ( & (({Xc+ & {Xd-}} & CO*s+) or ({Xd- & Xc+} & MVs-))) - or ({Xd-} & & (Wd+ or Wp+ or Wr+)); + or ({Xd-} & & (Wd+ or Wp+ or Wr+)) + or ; % QI- & (): "I do not know when" % (Mv- & Cs+): "an examination when it happened revealed chicanery" @@ -7376,11 +7642,17 @@ % Perhaps a better solution might be some special case of WN+ on % examination? (would require 'examination' and all other 'operation' % nouns to be in their own class... +% SJ: "when, but not how, will be decided." +% MJ: "when, if not next Tuesday, do you want to do it?" +% JT- & MJr-: "if not next Tuesday, then when do you want to do it?" 
when: ((WN- or BIh-) & ) or (( or ) & (BIq- or QI- or (SFsx+ & ))) or (Mv- & ) or [QI-] + or SJl+ or SJr- + or MJl+ + or ({JT-} & MJr- & Qw+) or ({EW-} & (QJ- or QJ+)) or ({EW-} & Ww- & {Qw+}) or (( or Mp+ or Mgp+ or Mv+) & @@ -7395,6 +7667,7 @@ or (QI- & ( or or [()])) or ( & ((SFsx+ & ) or WY- or BIq- or QJ+ or QJ-)) or COa+ + or SJl+ or SJr- or ({EW-} & (QJ- or QJ+)) ); @@ -7408,6 +7681,7 @@ & ( ({EW-} & Wq- & ((Rw+ & WR+) or (R+ & Bsw+) or Qw+)) or [QI-] + or SJl+ or SJr- or ({EW-} & (QJ- or QJ+)) or ( & Bsw+ & QI-) or ((WR+ or or ) & (BIq- or QI- or (SFsx+ & ))) @@ -7416,6 +7690,7 @@ whether: ((QI- or BIq-) & ( or )) or ( & SFsx+ & ) + or SJl+ or SJr- or [[( or MV+) & (({Xd- & Xc+} & MVs-) or ({Xc+ & {Xd-}} & CO*s+))]]; whether_or_not: @@ -7432,6 +7707,7 @@ or ({EW-} & Wq- & (({EL+} & Qw+) or AF+)) or [QI-] or ({EW-} & (QJ- or QJ+)) + or SJl+ or SJr- or (( or ) & (QI- or BIq- or (SFsx+ & ))) ; %%% or ((EAh+ or EEh+) & Ww-); @@ -7439,15 +7715,19 @@ % ---------------------------------------------------- % CONJUNCTIONS & ADVERBS +% "that" as subjunctive or paraphrasing % EBx+: He told me that even his mother likes me % (perhaps this should be changed to a different EB ??) that.j-c: - ({EBx+} & & ([SFsx+ & ] or TH- or [[MVh-]] or RJ*t+ or RJ*t-)) + ({EBx+} & & + ([SFsx+ & ] or TH- or [[MVh-]] or RJ*t+ or RJ*t-)) or (TS- & (SI*j+ or SFI**j+) & I*j+); +% "that" as determiner. % D*u+: "Give me that thing" that.j-d: [{AL-} & D*u+]; +% "that" as adverb. % Xd-: iffy punctuation, e.g "The man, that you saw laugh... that.j-r: ({[Xd-]} & R- & (({MVp+ or MVx+} & RS+) or )) @@ -7455,7 +7735,7 @@ or (EA+ & {HA+}) or DTn+; -% naked Osn-: "give me that" +% "that" as noun. naked Osn-: "give me that" that.j-p: ({[[M+]]} & ) or ; % "that" as topic. Wt is a topic link to LEFT-WALL. "That I did not know". 
@@ -7483,8 +7763,11 @@ (TH+ or TS+) & (({Xc+ & {Xd-}} & CO*s+) or ({Xd- & Xc+} & MVs-)); unless though.c even_though: - ( or Mgp+ or Mv+) - & (({Xc+ & {Xd-}} & CO*s+) or ({Xd- & Xc+} & MVs-) or (Xd- & Xc+ & E+)); + (( or Mgp+ or Mv+) + & (({Xc+ & {Xd-}} & CO*s+) + or ({Xd- & Xc+} & MVs-) + or (Xd- & Xc+ & E+))) + or ; as_if as_though: (( or Mgp+ or Mv+ or Mp+) @@ -7525,7 +7808,7 @@ or ( & (({Xc+ & {Xd-}} & CO*s+) or ({Xd- & Xc+} & MVs-))))) or (Yt- & ( or Qe+)); -if only_if if_and_only_if iff: +if.r only_if if_and_only_if iff: ( & {Xc+ & {Xd-}} & (Wd- & (Qd+ or Ws+ or Wq+))) or (( or [Mgp+] or [Mv+]) & (({Xd- & Xc+} & MVs-) or ({Xc+ & {Xd-}} & CO*s+))) @@ -7547,9 +7830,12 @@ no_wonder: (Wd+ or Wp+ or Wr+) & Wc-; % Pa+: "it can be pressed into shape while cold" -while: - (( or Mgp+ or Mp+ or Pa+) - & (({Xc+ & {Xd-}} & CO*s+) or ({Xd- & Xc+} & MVs-) or (Xd- & Xc+ & E+))); +while whilst: + (( or Mgp+ or Mp+ or Pa+) & + (({Xc+ & {Xd-}} & CO*s+) or + ({Xd- & Xc+} & MVs-) or + (Xd- & Xc+ & E+))) or + ; : RJlv+ or RJrv-; @@ -7613,9 +7899,11 @@ % comparatives: % he is bigger, and badder, than the pope. % If one comma is there, then the other must be also. +% SJl- & AJrc+: "She was John's height, or taller" and.j-c or.j-c but.j-c yet.j-c: ((AJlc- & AJrc+) or - (Xd- & AJlc- & AJrc+ & Xc+)) & + (Xd- & AJlc- & AJrc+ & Xc+) or + ({Xd-} & SJl- & {EB+} & AJrc+)) & (((Pam- or Mam- or AFm+) & {@MV+}) or ({[ECa-]} & MVb-) or Am+); @@ -7675,7 +7963,7 @@ % "Where is the sickle and hammer?" (SIs-) % Op- has a cost, so that "they verbed X and verbed Y" gets the VJ link % at zero cost, and the SJ link at higher cost (since a "verbed Y" can be -% understood as a modified noun). Acutally, should probably have some +% understood as a modified noun). Actually, should probably have some % post-processing rule to disallow this XXX to do fix above. 
Example of % bad SJ usage: "He bangs drums and played piano" i.e "he bangs a played piano" % @@ -7704,17 +7992,20 @@ % "I saw the (dog and pony) show" % % and_not ,_not: "I saw John, not Mary" -% We treat this here as an idiom, even though it's explicitly hanled for -% AJ nd RJ conjunctions. Its just easier, for now. +% We treat this here as an idiom, even though it's explicitly handled for +% AJ nd RJ conjunctions. Kind-of wrong, it's just easier, for now. % % {Jd- & Dm-}: "A number of recommendations and suggestions were made" % with "number of" modifying the and.j-n -% [[]] costs so that above is prefered: (huh????) +% [[]] costs so that above is preferred: (huh????) % "there was enough of the beer and sandwiches" % % XJa-: "Both June and Tom are coming" - -and.j-n but_not and_not ,_not: +% +% Xd- & SJl- & EBb+ & SJr+ & Xc+ & Wd- & Ssx+: EB+ forces a singular subject! +% +% and.j-n but_not but_just_not and_not ,_not just_not: +and.j-n 'n': ( & & {XJa-} & ( or (Spx+ & ) @@ -7735,19 +8026,28 @@ or SIu- or Wa- or [{{Jd-} & Dmu-} & Ou-] - or )); + or )) + or ((Xd- & SJl- & EB+ & SJr+ & Xc+) & (Wd- & Ssx+)) + or (({Xd-} & SJl- & EB+ & SJr+ & {Xc+}) & O-); + +% A zero-copula in a conjunction: +% "that is very nice, but not what I want" +but_not just_not: VJrsi- & O+; % {XJo-}: "I can use either this or that". +% EB+: "the problem, or rather, one of the problems, ..." % or.j-n: ( & & {XJo-} & ( or (S*x+ & ) or SIs- or [Os-] or Wa- or )) or ( & & {XJo-} & - ( or (Spx+ & ) or SIp- or [Op-] or Wa- or )); + ( or (Spx+ & ) or SIp- or [Op-] or Wa- or )) + or ((Xd- & SJl- & EB+ & SJr+ & Xc+) & (Wd- & Ssx+)) + or (({Xd-} & SJl- & EB+ & SJr+ & {Xc+}) & O-); % XJn- "neither this nor that" % XJn- is optional: "I don't like dogs nor cats" but not having it is given -% a cost only because its a kind of a strange construction ... +% a cost only because it's a kind of a strange construction ... % SJl- & SJr+ & SJl+: cascading nor's: "Neither snow nor rain nor heat..." 
% SI- can be singular or plural: for example: % "There is neither a dog nor a cat here" @@ -7766,16 +8066,28 @@ % Force use of commas: "Mary, but not Louise, is coming to the party" % Not John, but Mary led the way. % XJb-: "not only this but also that" -% XXX FIXME: the EBb and EBY below should probably be replaced by XJb -but.j-n: +% XXX FIXME: the EBb and EBy below should probably be replaced by XJb +but.j-n yet.j-n: ((Xd- & SJl- & EBb+ & SJr+ & Xc+) & (Wd- & Ssx+)) or ((Xd- & SJl- & EBy- & SJr+) & (Wd- & Ssx+)) or - (({Xd-} & SJl- & EBy- & SJr+) & Ou-) or - (({Xd-} & SJl- & XJb- & SJr+) & Ou-); + (({Xd-} & SJl- & (XJb- or EBy- or EBb+) & SJr+) & Ou-); but_also: {Xd-} & SJl- & XJb- & SJr+ & Ou-; +% SJ: "Who, if not Micheal, will provide for your care?" +% MJ: "when, if not tomorrow, do you want to do it?" +% MJr+ & MJR+: "if not next Tuesday, when do you want to do it?" +if.j-n: + ((Xd- & SJl- & EBb+ & SJr+ & Xc+) & (Ws- & S**w+)) or + ((Xd- & SJl- & EBb+ & SJr+ & Xc+) & (Ww- & Qw+)) or + ((Xd- & MJl- & EBb+ & MJr+ & Xc+) & (Ww- & Qw+)) or + ((EBb+ & MJr+ & Xc+ & MJr+) & Ww-); + +% Conditional: if ... then ... +if.j-c: Wd- & & XJc+; +then.j-c: {Xd-} & XJc- & VJr+; + % -------------------------------------------------------------------------- % Conjoined verbs/verb phrases % "Oscar Peterson played piano and wrote music." @@ -7784,17 +8096,34 @@ % "This is a problem Moscow created and failed to solve." % [I-]0.2, []0.2: avoid I links to conjoined non-infinitives. % XXX This is hacky, we should just prevent such infinitive links from -% occuring at all. +% occurring at all. +% {TO+}: "I aim to do something and to help." 
: - (({Xd-} & VJlsi- & VJrsi+) & (({@MV+} & Ss- & ) or (RS- & Bs-) or ([I-]0.2 & {@MV+} & ) or ({Xd-} & VJrsi-))) or - (({Xd-} & VJlpi- & VJrpi+) & (({@MV+} & Sp- & ) or (RS- & Bp-) or ([I-]0.2 & {@MV+} & ) or ({Xd-} & VJrpi-))) or - (({Xd-} & VJlst- & VJrst+) & ((({@MV+} & Ss- & ) or ([I-]0.2 & {@MV+} & )) & (O+ or (B- & {B+})))) or - (({Xd-} & VJlpt- & VJrpt+) & ((({@MV+} & Sp- & ) or ([I-]0.2 & {@MV+} & )) & (O+ or (B- & {B+})))) or (({Xd-} & VJlh- & VJrh+) & (PP- & {@MV+} & )) or + (({Xd-} & VJlsi- & VJrsi+) & + (({@MV+} & Ss- & ) or + (RS- & Bs-) or + ([I-]0.2 & {@MV+} & ) or + ({Xd-} & VJrsi-))) or + (({Xd-} & VJlpi- & {TO+} & VJrpi+) & + (({@MV+} & Sp- & ) or + (RS- & Bp-) or + ([I-]0.2 & {@MV+} & ) or + ({Xd-} & VJrpi-))) or + (({Xd-} & VJlst- & VJrst+) & + ((({@MV+} & Ss- & ) or + ([I-]0.2 & {@MV+} & )) & + (O+ or (B- & {B+})))) or + (({Xd-} & VJlpt- & VJrpt+) & + ((({@MV+} & Sp- & ) or + ([I-]0.2 & {@MV+} & )) & + (O+ or (B- & {B+})))) or + (({Xd-} & VJlh- & VJrh+) & (PP- & {@MV+} & )) or ((VJlg- & VJrg+) & (J-)) or ((VJlp- & VJrp+) & []0.2) or ((VJls- & VJrs+) & []0.2); and.j-v or.j-v: ; +then.j-v: ; % ditransitive conjunction: "I gave Bob a doll and Mary a gun" % Actually, optionally ditransitive, to cover more cases. @@ -7804,11 +8133,14 @@ {Xd-} & VJd- & O+ & {O*n+}; and.j-o or.j-o: ; +then.j-o: ; % XJn-: neither ... nor ... +% I-: "I don't want that, nor do you" nor.j-v: (VJl*i- & XJn- & VJr*i+ & ({@MV+} & S- & )) or - (VJl*t- & XJn- & VJr*t+ & ({@MV+} & S- & O+ & )); + (VJl*t- & XJn- & VJr*t+ & ({@MV+} & S- & O+ & )) or + ({Xd-} & VJl*i- & VJr*i+ & ({@MV+} & I- & )); % Similar to and, but allows optional comma before "but" % "blah blah, but blah" @@ -7841,7 +8173,7 @@ % The costly [[]] is quite ugly and unappealing, but is % needed to parse "he is either in the 105th nor the 106th battalion". % The problem here is that "either in" seems to be order-reversed from -% "in either", and doing it right would require link-corssing. 
+% "in either", and doing it right would require link-crossing. either.r: Ds+ or XJo+ @@ -7864,8 +8196,15 @@ for.r: [[(({Xd-} & ) or Wc-) & (Wd+ or Wp+ or Wr+ or Qd+ or Ws+ or Wq+)]]; yet.r: ((({Xd-} & ) or Wc-) & (Wd+ or Wp+ or Wr+)) or E+ or MVa- or ({Xd-} & Xc+ & CO+); -thus therefore: ({Xc+ & {Xd-}} & CO+) or ({Xd-} & & Wd+) or -({Xd- & Xc+} & (E+ or EB-)) or (Xd- & Xc+ & MVa-); +% therefor is a common mis-spelling, unlikely the archaic therefor ... +% : "thus it would seem" +thus therefore therefor: + ((Xc+ & {Xd-}) & CO+) or + [CO+]0.3 or + ({Xd-} & & Wd+) or + ({Xd- & Xc+} & (E+ or EB-)) or + (Xd- & Xc+ & MVa-) or + ; % EBy+ link is for "verbed not X but Y" "I saw not Mary, but John" % @@ -7884,6 +8223,7 @@ or ({@E-} & N-) or NT+ or EBy+ + or or [[((Ma+ or Mg+ or Mv+ or Mp+) & CO+) or (Mg- & Mgn+) or (Mv- & Mvn+) @@ -7893,15 +8233,25 @@ % We include this, though it's not one of the strippable strings n't n’t: N- or EB-; -%ADJECTIVES +% "Just" is tricky... +% COMP-OPENER: "just not a good swimmer, he fell behind" +just_not: ; +% --------------------------------------------------- +% ADJECTIVES % Common disjuncts shared by virtually all adjectives. -: +% +% This one is used for openers and post-nominal modifiers. +: [{@E-} & {@MV+} & & {@MV+}] - or (AJra- & {@MV+}) - or ({@MV+} & AJla+) or ({@E-} & {@MV+} & ([[]] or (Xd- & Xc+ & MX*a-))); +% Conjoined adjectives +: + ({[EA-]-0.1} & AJra- & {@MV+}) or ({@MV+} & AJla+); + +: or ; + % Ordinary adjectives % abject.a abnormal.a abominable.a abortive.a abrasive.a abrupt.a % @@ -7909,45 +8259,49 @@ % % Lots and lots, but not all ordinary adjs can take : % Pa- & : "... is too abrasive to talk to." -% (EAh- & {Qe+}): "How big?" "How tall?" +% [EAh- & {Qe+}]: "How big?" "How tall?" 
-- large cost, as it otherwise +% causes bizarre parses for "a decidedly jolly good player" : - ({EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) - or ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {}) + ({EA- or EF+} & ( + ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {}) or ({@MV+} & MJla+) or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or )) - or (EAh- & {Qe+}); + or )) + or ({EF+} & ) + or [[DD- & ]] + or [[{DD-} & ]] + or [EAh- & {Qe+}]; + +% Macro, for all the common parts of an A+ connection, with +% the phonetic attachement as a variable. +define(`ADJ_PH',`' + ({EA- or ({($1)} & {EF+ or MX*ta+})} & {[[@Ec-]]} & {Xc+} & A+)) % PH-: connect, phonetically, to a/an if it is there. -: - & ; - -: - & ; +: ADJ_PH(); +: ADJ_PH(); +: ADJ_PH(); /en/words/words.adj.1-vowel : or - ; + or ; /en/words/words.adj.1-const : or - ; + or ; -% Make the given name Frank be prefered to 'frank.a' +% Make the given name Frank be preferred to 'frank.a' % e.g. "Frank felt vindicated when his long time rival Bill revealed that % he was the winner of the competition." frank.a: - [ or ]0.2; + [ or or ]0.2; -% Add a miniscule cost, so that the noun form is prefered... +% Add a minuscule cost, so that the noun form is preferred... % An older formulation of this used Ah- as the link, but I don't see % why. Generic adjective should be OK. Given a cost of 0.04, so -% as to give a slight prefernce for the noun-form, if possible. -HYPHENATED-WORDS.a: - []0.04; +% as to give a slight preference for the noun-form, if possible. +.a: + [ or ]0.04; % Color names. 
Just like ordinary adjectives, except that the % color names themselves can be modified by other nouns, gerunds, @@ -7963,14 +8317,15 @@ % "Her shoes are fire-engine red" % [A-]0.2: "a big green apple" want "big" to modify "apple", not "green" : - {EA- or EF+} & {(AN- or [A-]0.2) & {Ds-}} & + ({EA- or EF+} & {(AN- or [A-]0.2) & {Ds-}} & (({[[@Ec-]]} & {Xc+} & A+) or ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {}) or ({@MV+} & MJla+) or AA+ or [[DD- & ]] or [[{DD-} & ]] - or ); + or )) + or ({EF+} & {(AN- or [A-]0.2) & {Ds-}} & ); % pinkish brownish not in this list as they can't take the modifiers. auburn.a black.a blue.a blueberry.a @@ -7981,42 +8336,53 @@ ; -% "We caught a through flight", "its a done job" - adjective -- !? probably over-broad. : - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + or + ({EA- or EF+} & ( ((AF+ or Ma- or MJra-) & {@MV+}) or ({@MV+} & MJla+) or AA+ or [[DD- & ]] or - [[{DD-} & ]]); - -done.c gone.c through.c: & ; + [[{DD-} & ]])); -responsible.a accountable.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or - ((Pa- or AF+ or Ma- or Vh- or MJra-) & {@MV+}) or - ({@MV+} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ); +% "We caught a through flight", "it's a done job" - adjective -- !? +% probably over-broad. +done.c gone.c through.c: + + or ; + +responsible.a: + ({EA- or EF+} & ( + ((Pa- or Vh-) & {@MV+}) or + )) + or ({EF+} & ) + or + or ; + +accountable.a: + ({EA- or EF+} & ( + ((Pa- or Vh-) & {@MV+}) or + )) + or ({EF+} & ) + or + or ; long.a: - ( & ) + + or or ((Ya- or Yt-) & (Pa- or Ma- or MJra- or MJla+)) or (H- & (BT+ or Yt+)); % Hmm does distant really belong here? % "The river is a mile wide here": Ya- & Pa- & MVp+ wide.a tall.a deep.a distant.a: - ( & ) + or + or or (Ya- & (Pa- or Ma- or MJra- or ) & {@MV+}) or (Ya- & {@MV+} & MJla+); old.a: - ( & ) + or or (Ytm- & (Pa- or Ma- or or MJra- or MJla+)); % ??? adj-op already has MX*a- in it, why do we need a bare MX- here ? 
@@ -8026,75 +8392,78 @@ % "ages" work like aged, but with a cost. ages.i: []; + % The following all have a very regular pattern, with just one variable % part, the TO TH type section. This regularity should be exploited to % simplify the expressions ... -easy.a hard.a simple.a difficult.a fun.a expensive.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) +easy.a expensive.a: + ({EA- or EF+} & + (((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) or ({@MV+} & {( or ) & {LE+}} & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & )); + or ({} & ))) + or ({EF+} & {} & ) + or + or ; + +hard.a simple.a difficult.a fun.a: + ({EA- or EF+} & + (((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) + or ({@MV+} & {( or ) & {LE+}} & MJla+) + or ({} & ))) + or ({EF+} & {} & ) + or + or ; % M-: "Do you have an example ready?" % Although we'd like to use Ma- for the above, post-processing prevents this. ready.a: - {EA- or EF+} & ( - ({[[@Ec-]]} & {Xc+} & A+ & ) - or ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) + ({EA- or EF+} & ( + ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) or ({@MV+} & {( or ) & {LE+}} & MJla+) - or AA+ or M- - or [[DD- & ]] - or [[{DD-} & ]] - or ({ or } & )); + or ({ or } & ))) + or ({EF+} & { or } & ) + or + or ; silly.a nasty.a pleasant.a dangerous.a cruel.a standard.a safe.a legal.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+ & ) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) + ({EA- or EF+} & + (((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) or ({@MV+} & {( or ) & {LE+}} & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & )); + or ({} & ))) + or ({EF+} & {} & ) + or + or ; % Identical to above, but starts with vowel unpleasant.a illegal.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+ & ) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) + ({EA- or EF+} & + (((Paf- or AF+ or Ma- or 
MJra-) & {@MV+} & {( or ) & {LE+}}) or ({@MV+} & {( or ) & {LE+}} & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & )); + or ({} & ))) + or ({EF+} & {} & ) + or + or ; : - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or THi+ or ) & {LE+}}) + ({EA- or EF+} & + (((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or THi+ or ) & {LE+}}) or ({@MV+} & {( or THi+ or ) & {LE+}} & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & ) - or AJr-); + or ({} & ) + or AJr-)) + or ({EF+} & {} & ) + or ; good.a bad.a nice.a strange.a wonderful.a terrible.a possible.a fair.a -tough.a: - & ; +tough.a great.a: + or ; unusual.a useful.a impossible.a annoying.a unfair.a unuseful.a: - & ; + or ; a_bitch : - ; - -great.a: or ( & ); + + or ({EA- or EF+} & {[[@Ec-]]} & {Xc+} & A+); % Surely this is incomplete... one_and_only: @@ -8102,180 +8471,206 @@ % Identical to below, but starts with vowel. important.a essential.a imperative.a: - or - ({EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+ & ) - or ((Paf- or AF+ or Ma- or MJra-) & (({@MV+} & {(THi+ or or TSi+) & {LE+}}) or )) + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & (({@MV+} & {(THi+ or or TSi+) & {LE+}}) or )) or ((({@MV+} & {(THi+ or or TSi+) & {LE+}}) or ) & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & ))); + or ({} & ))) + or ({EF+} & {} & ) + or + or ; % Identical to above, but starts with consonant crucial.a necessary.a vital.a: - or - ({EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+ & ) - or ((Paf- or AF+ or Ma- or MJra-) & (({@MV+} & {(THi+ or or TSi+) & {LE+}}) or )) + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & (({@MV+} & {(THi+ or or TSi+) & {LE+}}) or )) or ((({@MV+} & {(THi+ or or TSi+) & {LE+}}) or ) & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & ))); - -% XXX FIXME Most of the below need to be sorted into vowel/consonant groups. 
-% -common.a practical.a original.a normal.a helpful.a striking.a -confusing.a frustrating.a disturbing.a -logical.a illogical.a elegant.a efficient.a awful.a just.a unjust.a -absurd.a natural.a alarming.a acceptable.a unacceptable.a deplorable.a -detestable.a scary.a shocking.a + or ({} & ))) + or ({EF+} & {} & ) + or + or ; + +common.a practical.a normal.a helpful.a striking.a +confusing.a frustrating.a disturbing.a logical.a just.a +natural.a deplorable.a detestable.a scary.a shocking.a poetical.a: - or - ({EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or or ) & {LE+}}) + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or or ) & {LE+}}) or ({@MV+} & {(THi+ or ) & {LE+}} & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or )); + or )) + or ({EF+} & ) + or + or ; + +original.a illogical.a elegant.a efficient.a awful.a unjust.a +absurd.a alarming.a acceptable.a unacceptable.a: + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or or ) & {LE+}}) + or ({@MV+} & {(THi+ or ) & {LE+}} & MJla+) + or )) + or ({EF+} & ) + or + or ; +% XXX FIXME Most of the below need to be sorted into vowel/consonant groups. 
+% surprising.a interesting.a odd.a remarkable.a amazing.a exciting.a depressing.a rare.a embarrassing.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or or or ) & {LE+}}) or ({@MV+} & {(THi+ or or ) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ); + )) + or ({EF+} & ) + or + or ; crazy.a sane.a insane.a stupid.a ridiculous.a wrong.a curious.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or or THi+ or ) & {LE+}}) + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or or THi+ or ) & {LE+}}) or ({@MV+} & {( or or THi+) & {LE+}} & MJla+) - or AA+ - or [[DD- & ]] - or [[{DD-} & ]] - or ({} & )); + or ({} & ))) + or ({EF+} & {} & ) + or + or ; wise.a unwise.a smart.a intelligent.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or or ) & {LE+}}) or ({@MV+} & {( or ) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({} & )); + ({} & ))) + or ({EF+} & {} & ) + or + or ; unlikely.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or THi+ or ) & {LE+}}) or ({@MV+} & {( or THi+ or ) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({} & )); + ({} & ))) + or ({EF+} & {} & ) + or + or ; likely.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or THi+ or ) & {LE+}}) or ({@MV+} & {( or THi+ or ) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({} & ) or - [E+]); - -apparent.a false.a official.a strict.a significant.a funny.a notable.a untrue.a tragic.a -plain.a urgent.a a_drag a_bummer definite.a evident.a impressive.a incredible.a -inevitable.a mysterious.a pathetic.a probable.a admirable.a commendable.a conceivable.a -insignificant.a miraculous.a self-evident.a 
undeniable.a plausible.a understandable.a -demonstrable.a hilarious.a improbable.a inexcusable.a outrageous.a paradoxical.a shameful.a -inconceivable.a unbelievable.a astonishing.a disgraceful.a debatable.a arguable.a -lamentable.a regrettable.a -well-known.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({} & ) or + [E+])) + or ({EF+} & {} & ) + or + or ; + +% Just like below, but vowels +apparent.a official.a untrue.a urgent.a +evident.a impressive.a incredible.a inevitable.a admirable.a +insignificant.a undeniable.a understandable.a +improbable.a inexcusable.a outrageous.a +inconceivable.a unbelievable.a astonishing.a arguable.a: + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or or ) & {LE+}}) or ({@MV+} & {(THi+ or ) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ); + )) + or ({EF+} & ) + or + or ; + +% Just like above, but consonants +false.a strict.a significant.a funny.a notable.a tragic.a +plain.a definite.a mysterious.a pathetic.a probable.a +commendable.a conceivable.a miraculous.a self-evident.a +plausible.a demonstrable.a hilarious.a paradoxical.a shameful.a +disgraceful.a debatable.a lamentable.a regrettable.a well-known.a +a_drag a_bummer: + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or or ) & {LE+}}) or + ({@MV+} & {(THi+ or ) & {LE+}} & MJla+) or + )) + or ({EF+} & ) + or + or ; clear.a unclear.a relevant.a irrelevant.a obvious.a immaterial.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {(THi+ or QIi+ or ) & {LE+}}) or ({@MV+} & {(THi+ or QIi+) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ); + )) + or ({EF+} & ) + or + or ; + +% Implement a zero-that; viz. +% "We have to make sure [that] everyone is here." 
+: Cet+ & CV+; + +clear.i: + {EA- or EF+} & Vm- & (TH+ or ); -clear.i: {EA- or EF+} & Vm- & TH+; -sure.i certain.i: {EA- or EF+} & Vm- & (TH+ or or (OF+ & {@MV+})); +sure.i certain.i: + {EA- or EF+} & Vm- & (TH+ or or or (OF+ & {@MV+})); % (AJrc- & {@MV+}): "It is lighter and less costly" -tactful.a conventional.a advisable.a prudent.a sensible.a tactless.a polite.a -impolite.a arrogant.a conceited.a obnoxious.a valuable.a reasonable.a -unreasonable.a traditional.a unnecessary.a tempting.a usual.a -inadvisable.a lovely.a a_mistake ethical.a unethical.a immoral.a -childish.a awkward.a appropriate.a costly.a customary.a desirable.a -dumb.a effective.a fashionable.a energy-intensive.a -foolish.a healthy.a hip.a okay.a OK.a ok.a -painful.a selfish.a sufficient.a advantageous.a boring.a -inappropriate.a insufficient.a irrational.a irresponsible.a +% Exactly the same as below, but consonant. +tactful.a conventional.a prudent.a sensible.a tactless.a polite.a +conceited.a valuable.a reasonable.a traditional.a tempting.a +lovely.a childish.a costly.a customary.a desirable.a +dumb.a fashionable.a foolish.a healthy.a hip.a +painful.a selfish.a sufficient.a boring.a mandatory.a meaningless.a preferable.a senseless.a trivial.a wrongheaded.a premature.a risky.a dishonest.a -hypocritical.a enjoyable.a idiotic.a inconvenient.a unkind.a pointless.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) - or ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) +hypocritical.a pointless.a +a_mistake: + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) or ({@MV+} & { & {LE+}} & MJla+) - or AA+ - or or (AJrc- & {@MV+}) - or [[DD- & ]] - or [[{DD-} & ]]); + or )) + or ({EF+} & ) + or + or ; + +% Exactly the same as above, but vowel. 
+advisable.a impolite.a arrogant.a obnoxious.a +unreasonable.a unnecessary.a usual.a +inadvisable.a ethical.a unethical.a immoral.a +awkward.a appropriate.a effective.a energy-intensive.a +okay.a OK.a ok.a advantageous.a +inappropriate.a insufficient.a irrational.a irresponsible.a +enjoyable.a idiotic.a inconvenient.a unkind.a: + ({EA- or EF+} & ( + ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or ) & {LE+}}) + or ({@MV+} & { & {LE+}} & MJla+) + or (AJrc- & {@MV+}) + or )) + or ({EF+} & ) + or + or ; unknown.a questionable.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {QIi+}) or ({@MV+} & {QIi+} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ); + )) + or ({EF+} & ) + or + or ; certain.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Paf- or AF+ or Ma- or MJra-) & {@MV+} & {( or or TH+ or QI+ or (OF+ & {@MV+})) & {LE+}}) or ({@MV+} & {( or or TH+ or QI+ or (OF+ & {@MV+})) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({ or or TH+ or QI+ or OF+} & )); + ({ or or TH+ or QI+ or OF+} & ))) + or ({EF+} & { or or TH+ or QI+ or OF+} & ) + or + or ; sure.a unsure.a uncertain.a careful.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {( or or TH+ or QI+ or (OF+ & {@MV+})) & {LE+}}) or ({@MV+} & {( or or TH+ or QI+ or (OF+ & {@MV+})) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({ or or TH+ or QI+ or OF+} & )); + ({ or or TH+ or QI+ or OF+} & ))) + or ({EF+} & { or or TH+ or QI+ or OF+} & ) + or + or ; % XXX FIXME: many of the below should probably take like the % above... 
@@ -8284,36 +8679,33 @@ disappointed.a upset.a sorry.a content.a determined.a amused.a amazed.a astonished.a astounded.a pleased.a disgusted.a distressed.a dismayed.a irritated.a embarrassed.a alarmed.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {( or TH+) & {LE+}}) or ({@MV+} & {( or TH+) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({ or TH+} & )); - -glad.a fortunate.a unfortunate.a lucky.a unlucky.a happy.a sad.a surprised.a -delighted.a overjoyed.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({ or TH+} & ))) + or ({EF+} & { or TH+} & ) + or + or ; + +glad.a fortunate.a unfortunate.a lucky.a unlucky.a happy.a sad.a +surprised.a delighted.a overjoyed.a: + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {( or or TH+) & {LE+}}) or ({@MV+} & {( or or TH+) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({ or or TH+} & )); + ({ or or TH+} & ))) + or ({EF+} & { or or TH+} & ) + or + or ; % common adjectives, taking "to", "of", "that" e.g. "proud that" proud.a scared.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {( or TH+ or (OF+ & {@MV+})) & {LE+}}) or ({@MV+} & {( or TH+ or (OF+ & {@MV+})) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({ or TH+ or OF+} & )); + ({ or TH+ or OF+} & ))) + or ({EF+} & { or TH+ or OF+} & ) + or + or ; % common adjectives, taking "of" e.g. "tired of", "sick of" etc. 
tired.a pooped.a full.a sick.a critical.a guilty.a innocent.a @@ -8321,77 +8713,84 @@ capable.a contemptuous.a incapable.a reminiscent.a scornful.a mindful.a short.a appreciative.a complimentary.a born.a worthy.a free.a terrified.a unworthy.a prognostic.a dead.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {OF+ & {@MV+}}) or ({@MV+} & {OF+ & {@MV+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({OF+ & {@MV+}} & )); + ({OF+ & {@MV+}} & ))) + or ({EF+} & {OF+ & {@MV+}} & ) + or + or ; fond.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & OF+ & {@MV+}) or (OF+ & {@MV+} & MJla+) or - (OF+ & )); + (OF+ & ))) + or ({EF+} & OF+ & ) + or ; + afraid.a ashamed.a unafraid.a unashamed.a: - {EA- or EF+} & + ({EA- or EF+} & (((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {((OF+ & {@MV+}) or or TH+ or ) & {LE+}}) or ({@MV+} & {((OF+ & {@MV+}) or or TH+ or ) & {LE+}} & MJla+) or - ({OF+ or or TH+ or } & )); - -apprehensive.a secure.a optimistic.a pessimistic.a annoyed.a confused.a offended.a -insulted.a concerned.a depressed.a doubtful.a -grateful.a mad.a mistaken.a hopeful.a unhappy.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({OF+ or or TH+ or } & ))) + or ({EF+} & {OF+ or or TH+ or } & ) + or ; + +apprehensive.a optimistic.a annoyed.a offended.a +insulted.a unhappy.a +secure.a pessimistic.a confused.a concerned.a depressed.a +doubtful.a grateful.a mad.a mistaken.a hopeful.a: + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {TH+ & {LE+}}) or ({@MV+} & {TH+ & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({TH+} & )); + ({TH+} & ))) + or ({EF+} & {TH+} & ) + or + or ; aware.a unaware.a: - {EA- or EF+} & + ({EA- or EF+} & (((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {(TH+ or (OF+ & {@MV+})) & {LE+}}) or ({@MV+} & {(TH+ or (OF+ & {@MV+})) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({TH+ or OF+} & 
)); + ({TH+ or OF+} & ))) + or ({EF+} & {TH+ or OF+} & ) + or ; -true.a conscious.a confident.a skeptical.a jealous.a suspicious.a envious.a desirous.a +true.a conscious.a confident.a skeptical.a jealous.a +suspicious.a envious.a desirous.a convinced.a unconvinced.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & {(TH+ or (OF+ & {@MV+})) & {LE+}}) or ({@MV+} & {(TH+ or (OF+ & {@MV+})) & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({TH+ or OF+} & )); + ({TH+ or OF+} & ))) + or ({EF+} & {TH+ or OF+} & ) + or + or ; -eager.a reluctant.a able.a unable.a impatient.a eligible.a brave.a anxious.a apt.a desperate +eager.a reluctant.a able.a unable.a impatient.a +eligible.a brave.a anxious.a apt.a desperate keen.a prepared.a willing.a hesitant.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or + ({EA- or EF+} & ( ((Pa- or AF+ or Ma- or MJra-) & {@MV+} & { & {LE+}}) or ({@MV+} & { & {LE+}} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ({} & )); + ({} & ))) + or ({EF+} & {} & ) + or + or ; -former.a: A+ or G+ or (DG- & ); +former.a: ( & A+) or G+ or (DG- & ); latter.a: DG- & ; -overall.a onetime.a outboard.a pinstripe.a goddam.a de_facto de_jure erstwhile.a -foster.a outright.a online.a: A+; -pro_forma ad_hoc bona_fide: A+ or Pa-; -a_priori a_posteriori: A+ or MVa- or ({Xc+ & {Xd-}} & CO+); + +overall.a onetime.a outboard.a pinstripe.a +goddam.a defacto de_facto de_jure erstwhile.a +foster.a outright.a online.a: & A+; + +pro_forma ad_hoc bona_fide: ( & A+) or Pa-; + +a_priori a_posteriori apriori aposteriori: + ( & A+) or MVa- or ({Xc+ & {Xd-}} & CO+); asleep.a awake.a alike.a alive.a ablaze.a adrift.a afire.a aflame.a afloat.a afoot.a aghast.a aglow.a agog.a ajar.a amiss.a askew.a @@ -8415,24 +8814,31 @@ MVp- or E+; +% Max-: "the only apartment available" +% Max passes through post-processing, whereas Ma- is disallowed... 
outstanding.a available.a: - {EA- or EF+} & - (({[[@Ec-]]} & {Xc+} & A+) or - ((Pa- or AF+ or Max- or MJra-) & {@MV+}) or - ({@MV+} & MJla+) or - AA+ or - [[DD- & ]] or - [[{DD-} & ]] or - ); + ({EA- or EF+} & ( + ((Pa- or Max-) & {@MV+}) or + )) + or ({EF+} & ) + or + or ; due.i effective.i: (TY+ or JT+) & ; -north.a south.a east.a west.a northeast.a northwest.a southeast.a southwest.a: - or -A+ or ({Yd-} & {OF+} & (Pp- or MVp- or Mp- or (Xc+ & Xd- & (MVx- or MX*x-)))) -or [[DD- & ]]; -northern.a southern.a eastern.a western.a northeastern.a northwestern.a southeastern.a -southwestern.a: or A+ or G+; +north.a south.a east.a west.a northeast.a +northwest.a southeast.a southwest.a: + + or ( & A+) + or ({Yd-} & {OF+} & (Pp- or MVp- or Mp- or (Xc+ & Xd- & (MVx- or MX*x-)))) + or [[DD- & ]] + or [[{DD-} & ]]; + +northern.a southern.a eastern.a western.a northeastern.a +northwestern.a southeastern.a southwestern.a: + + or ( & A+) + or G+; % .. is this correct? benumbed.a bespattered.a non_compos_mentis dead_on_arrival @@ -8448,8 +8854,8 @@ %COMPARATIVES AND SUPERLATIVES % Omm-: "I want more" -- the second m blocks O*t+ on -% Non-zero cost on Omm- so that EA+ is prefered. -% Cost of >1.0 on Omm- so that MVm- is prefered for "He runs more". +% Non-zero cost on Omm- so that EA+ is preferred. +% Cost of >1.0 on Omm- so that MVm- is preferred for "He runs more". more: ({ECa-} & (EAm+ or EEm+ or [MVm-] or [EB*m-] or Qe+ or or AJrc- or AJlc+)) or ({OF+} & ( @@ -8462,7 +8868,7 @@ more_of_an: Ds*mv+ or ( & Ds*mx+); % XXX TODO: shouldn't less be a lot more like 'more', above? -% Cost of >1.0 on Om- so that MVm- is prefered for "He runs less". +% Cost of >1.0 on Om- so that MVm- is preferred for "He runs less". 
less: ({ECa-} & (EAm+ or EEm+ or [MVm-] or [EB*m-] or AJrc- or AJlc+)) or ({ECn-} & (Dmum+ or (Ss+ & ) or Bsm+)) @@ -8496,28 +8902,36 @@ as.e-c: (MVz- & (((O*c+ or S**c+ or ({SFsic+} & Zc+)) & {Mp+}) or Mpc+ or )) or (MVzo- & Ct+ & Bc+ & {U+}) - or (MVzp- & (CX+ or CQ+)) + or (MVzc- & (CX+ or CQ+)) + or (MVzp- & Pa+) or (MVza- & Cta+ & ((AFd+ & {Pa+}) or PFc+)); % prepositional, mostly % MVi- & TO+: "He said it in a voice so loud as to make everyone stare." -% MVs- & Sa*v+: "he left as agreed" -% MVs- & Sa*a+ & CV+: " ..., as shall be proven" +% VCz- & Sa*v+: "he left as agreed" +% VCz- & Sa*a+ & CV+: " ..., as shall be proven" % The punctuation is mandatory, here. % The CV is awkward, as it uses a null-subject. +% XXX Try to get rid of this... % Cz+ & CV+: "the accused, as it shall be shown, is innocent" % use Cz instead of because post-processing kills the % Cs link with a "Unbounded s domain78" error. -% -: Sa*v+ or (Sa*a+ & CV+); +% AZ- & Mg+: "It described the treaty as marking a new stage" +% & CO+: "As we set sail, a gale blew up" +% BIt+: "his statements, as candidate, contradict his actions" as.e: - ((J+ or Mp+ or TI+ or ({SFsic+} & Zs+)) & + ((J+ or Mp+ or TI+ or Zs+) & (({Xc+ & {Xd-}} & CO+) or ({Xd- & Xc+} & MVp-))) or ((J+ or Mp+ or BIt+) & ([Mp-] or (Xd- & Xc+ & MX*x-))) or (AZ- & Pa+) - or (( or ) & (({Xc+ & {Xd-}} & CO+) or ({Xd- & Xc+} & MVs-))) - or (Cz+ & CV+) - or ({{Xd-} & MVs-} & & {Xc+}) + or (AZ- & Mg+) + or ({Xd-} & {[hVCz-]-0.05} & Cz+ & CV+) + % or ({Xd-} & hVCz- & Cz+ & CV+) + or ( & (({Xc+ & {Xd-}} & CO+))) + or ((Sa*v+ or (Sa*a+ & CV+)) & {Xc+ & {Xd-}} & CO+) + or (Sa*v+ & {Xd- & {Xc+}} & VCz-) + or [Sa*a+ & CV+ & {Xd- & {Xc+}} & VCz-]-0.05 + or (Sa*a+ & CV+ & {Xd- & {Xc+}}) % needed for MXsr constructions or (MVi- & TO+) or [[(PFc+ or CQ+) & ({Xd- & Xc+} & MVs-)]]; @@ -8525,7 +8939,7 @@ as_possible: MVz-; -% Cc+ & CV+: C links to the head-noun of the followig clause, and CV+ +% Cc+ & CV+: C links to the head-noun of the following 
clause, and CV+ % links to the head verb. Must form a cycle. % Example: "I run more often than Ben climbs" than.e: @@ -8539,9 +8953,9 @@ % cost on MVa-: "we will arrive much sooner", want "much" to modify "sooner". % ({OFd+} & Dmu+): "I drank much of the beer" -% cost on [[]] so that the above is prefered to an O- link +% cost on [[]] so that the above is preferred to an O- link much: - ({EE-} & ([[MVa-]] or ECa+ or or Qe+)) + ({EE-} & ([[MVa-]] or ( & ECa+) or or Qe+)) or ({EEx- or H-} & ( ECn+ or ({OFd+} & Dmu+) @@ -8560,7 +8974,7 @@ % "cheaper than dirt, slime is better" : ({ECa-} & ( - ((Pam- or Mam- or AFm+ or AJrc-) & {@MV+}) + ((Pam- or Mam- or AFm+ or ({EA-} & AJrc-)) & {@MV+}) or ({[[@Ec-]]} & {Xc+} & Am+) or (Wr- & {@MV+} & MVt+ & Xc+ & PFb+) or AJlc+)) @@ -8639,7 +9053,8 @@ ((Pafm- or AFm+ or Mam- or AJrc-) & {@MV+}) or AA+ or [[DD- & ]] or - )); + )) + or ({ECa- or EF+} & ); than_expected than_imagined than_proposed than_suspected than_realized than_intended than_supposed than_reported than_ever than_usual @@ -8725,68 +9140,85 @@ a_bit a_little_bit the_least_bit: ({EE-} & EA+) - or EC+ or EE+ + or EC+ or ({Xd- & Xc+} & MVa-) or [[]]; % OFd+ & Dm+: "I will wait a little while of time" a_little_while: ({EE-} & EA+) - or EC+ or EE+ + or EC+ or ({Xd- & Xc+} & MVa-) or [[]] or (OFd+ & Dm+); -% -pretty.e extremely very_very very_very_very exceptionally -unbelievably incurably extraordinarily -jolly.e mighty.e damn.e bloody.e exceedingly overly downright plumb -vitally abundantly chronically frightfully genuinely -humanly patently -singularly supremely unbearably unmistakably unspeakably -awfully decidedly demonstrably fashionably frighteningly horrifyingly -indescribably intolerably laughably predominantly +% Basic adverbial adjectives, with phoneme markers. +% This is very simply just "({EE-} & EA+) or EE+" without the phonemes. 
+: ({EE- or } & EA+) or ( & EE+); +: ({EE- or } & EA+) or ( & EE+); +: & (EA+ or EE+); +: & (EA+ or EE+); +% +% +pretty.e very_very very_very_very +jolly.e mighty.e damn.e bloody.e downright plumb +vitally chronically frightfully genuinely +humanly patently singularly supremely +decidedly demonstrably fashionably +frighteningly horrifyingly +laughably predominantly hugely: + ; + +extremely exceptionally unbelievably incurably extraordinarily +exceedingly overly abundantly +unbearably unmistakably unspeakably +awfully indescribably intolerably unalterably undisputedly unpardonably -unreasonably unusually hugely infernally: - ({EE-} & EA+) or EE+; +unreasonably unusually infernally: + ; notoriously.e: - ({EE-} & EA+) or EE+ or Em+; + or Em+; % --------------------------------------------------------- % Adverbs whose main use is adjectival, but can be used with % adverbs such as "faster", "dumber", etc. % "That one is marginally better". (thus EBm+) % "It's an inherently better method" thus EC+ link -fabulously incomparably inherently marginally moderately -relatively ridiculously -unacceptably unarguably undeniably unimaginably: - ({EE-} & - (EA+ - or EC+ - or EBm-)) - or EE+; +fabulously marginally moderately relatively ridiculously: + + or ({EE- or } & EC+) + or ({EE-} & EBm-); + +incomparably inherently unacceptably unarguably undeniably unimaginably: + + or ({EE- or } & EC+) + or ({EE-} & EBm-); -wide.e: EE+; +wide.e: & EE+; % {EE-} & EE+: "not very much" % [[EE-]]: "It hurts, but not very" % MVl- & MVl+: "we are going to arrive very nearly on time" very.e way.e: - ({EE-} & EA+) - or ({EE-} & EE+) + + or ({EE- or } & EE+) or [[EE-]] or [[La-]] or (MVl- & MVl+); -real.e: [[EA+ or EE+]]; -quite: ({EE-} & EA+) or EE+ or EZ+ or [[Em+]]; +not_very: & EA+; + +real.e: [[]]; +quite: + + or EZ+ + or [[Em+]]; amazingly incredibly: - ({EE-} & EA+) - or EE+ + or EBm- or ({Xd- & Xc+} & Em+) or ({Xc+ & {Xd-}} & CO+) @@ -8794,69 +9226,94 @@ % MVa-: "He is behaving very 
strangely" strangely: - ({EE-} & EA+) - or EE+ + or EBm- or ({Xd- & Xc+} & Em+) or ({Xc+ & {Xd-}} & CO+) or ({Xd- & Xc+} & {EE-} & MVa-); -rather: EA+ or EE+ or Vw- or ({Xc+ & {Xd-}} & CO+); +% EB-: "The problem, or rather, one of the problems, ..." +rather: + + or Vw- + or ({Xc+ & {Xd-}} & CO+) + or ({Xc+ & {Xd-}} & EB-); particularly: - EA+ or EE+ or Em+ or EB- + + or Em+ or EB- or (MVl- & (MVp+ or MVa+ or MVs+)) or ({Xc+ & {Xd-}} & CO+); -notably: EB- or EA+ or EE+ or ({Xc+ & {Xd-}} & CO+); +notably: + + or EB- or ({Xc+ & {Xd-}} & CO+); + +% Mp- & Ju+: "She was a girl nearly John's age" +% MVp- & Ju+: "She was a girl of nearly John's age" +almost: + + or EN+ or EZ+ or Em+ or EBm- + or (MVl- & (MVp+ or MVa+ or MVs+)) + or ((Mp- or MVp-) & Ju+); -almost nearly: - EA+ or EE+ or EN+ or EZ+ or Em+ or EBm- - or (MVl- & (MVp+ or MVa+ or MVs+)); +nearly: + + or EN+ or EZ+ or Em+ or EBm- + or (MVl- & (MVp+ or MVa+ or MVs+)) + or ((Mp- or MVp-) & Ju+); % The below is similar to "nearly" ... just_about: Em+ or EN+ or EZ+ or EA+; -entirely reasonably highly fairly totally completely terribly: - EA+ or EE+ +entirely: + + or ({EE- or EF+} & (({Xd- & Xc+} & MVa-) or Em+ or Qe+ or )); + +reasonably highly fairly totally completely terribly: + or ({EE- or EF+} & (({Xd- & Xc+} & MVa-) or Em+ or Qe+ or )); absolutely: - EA+ or EE+ or EBm- + + or EBm- or ({EE- or EF+} & (({Xd- & Xc+} & MVa-) or Em+ or Qe+ or )); % allowing as opener also altogether equally: - EA+ - or EE+ + or ({EE- or EF+} & (({Xd- & Xc+} & MVa-) or Em+ or Qe+ or )) or ({Xc+ & {Xd-}} & CO+); really.e: - EA+ or EE+ or Em+ or EBm-; + or Em+ or EBm-; surprisingly: - EA+ or EE+ + or ({Xc+ & {Xd-}} & CO+) or ({Xd- & Xc+} & E+) or (Xd- & Xc+ & MVa-); especially: - EA+ or EE+ or EB- or Em+ + + or EB- or Em+ or (MVl- & (MVp+ or MVa+ or MVs+)) or ({Xc+ & {Xd-}} & CO+); -virtually: EA+ or EE+ or EN+ or EZ+ or Em+; +virtually: or EN+ or EZ+ or Em+; wholly fully critically greatly grossly duly unduly: - EA+ or ({EE- or EF+} & 
(({Xd- & Xc+} & MVa-) or Em+ or Qe+ or )); + ( & EA+) + or ({EE- or EF+} & (({Xd- & Xc+} & MVa-) or Em+ or Qe+ or )); + +seemingly: ( & EA+) or Em+; +utterly: ( & EA+) or Em+; -seemingly utterly: EA+ or Em+; barely just_barely scarcely hardly merely truly practically: - Em+ or EBm- or EA+ or Wa-; + Em+ or EBm- or ( & EA+) or Wa- or ({EBm+} & ); partly.e largely.e mostly.e chiefly.e simply.e purely.e solely.e: - Em+ or EA+ or EB- or Wa- + Em+ or ( & EA+) or EB- or Wa- or (MVl- & (MVp+ or MVa+ or MVs+)); % Em+: "It sure is great" @@ -8871,14 +9328,21 @@ % Adverbs like "biochemically". These tend to answer the question "how?" % with a noun-form (as opposed to being verb-derived) A lot of these are % of the "-ically" form -/en/words/words.adv.3: - EA+ +% +/en/words/words.adv.3-const: + ( & EA+) or ({Xd- & Xc+} & (E+ or MVa-)) or EBm- or ({{Xd-} & Xc+} & CO+); -in_part: EB- or (MVl- & (MVp+ or MVa+ or MVs+)); % academically administratively aesthetically +/en/words/words.adv.3-vowel: + ( & EA+) + or ({Xd- & Xc+} & (E+ or MVa-)) + or EBm- + or ({{Xd-} & Xc+} & CO+); + +in_part: EB- or (MVl- & (MVp+ or MVa+ or MVs+)); % --------------------------------------------------------- %ADVERBS WHOSE ONLY (MAIN) USE IS POST_VERBAL @@ -8888,7 +9352,8 @@ inward.e outward.e inwards.e outwards.e anytime.e live.e wholesale.e anew.e forever.e awhile.e aback.e afoul.e afresh.e aloft.e amok.e amuck.e onstage.e -apiece.e askance.e astern.e asunder.e inter_alia mutatis_mutandis par_excellence +apiece.e askance.e astern.e asunder.e +inter_alia mutatis_mutandis par_excellence upside-down.e ab_initio ad_infinitum ad_lib ad_libitum ad_nauseum aground.e astray.e into_account into_effect to_market to_bid from_scratch to_office for_office for_good @@ -8965,9 +9430,10 @@ or [{EA- or EF+} & (Pa- or AF+)] or Yd+; +% EE- & EB-: "..., or more exactly, ..." 
exactly.e: E+ - or EB- + or ({EE-} & {Xc+ & {Xd-}} & EB-) or EN+ or EW+ or EZ+ @@ -8989,7 +9455,7 @@ initially already somehow again once_again nowadays sometimes nevertheless nonetheless at_first at_best at_present of_late indeed: -({Xd- & Xc+} & MVa-) or E+ or ({Xc+ & {Xd-}} & CO+) or EBm-; + ({Xd- & Xc+} & MVa-) or E+ or ({Xc+ & {Xd-}} & CO+) or EBm-; twice.e: ({Xd- & Xc+} & MVa-) @@ -9005,12 +9471,12 @@ ({Xd- & Xc+} & MVa-) or E+ or ({Xc+ & {Xd-}} & CO+) - or EB-; + or ({Xc+ & {Xd-}} & EB-); also.e: ({Xd- & Xc+} & (E+ or MVa-)) or ({Xc+ & {Xd-}} & CO+) - or EB-; + or ({Xc+ & {Xd-}} & EB-); gradually.e sadly.e broadly.e clearly.e annually.e characteristically.e comparatively.e @@ -9024,26 +9490,27 @@ appropriately.e simply.ee: {EE- or EF+} & ( ({Xd- & Xc+} & (MVa- or E+)) - or ({Xc+ & {Xd-}} & CO+) - or EB- - or Qe+ - or - or [[EA+]]); + or ({Xc+ & {Xd-}} & CO+) + or ({Xc+ & {Xd-}} & EB-) + or Qe+ + or + or [[EA+]]); +% ({Xc+ & {Xd-}} & EB-)): "..., or more precisely, ..." precisely.e specifically.e generally.e: {EE- or EF+} & ( ({Xd- & Xc+} & (MVa- or E+)) - or ({Xc+ & {Xd-}} & CO+) - or EB- - or Qe+ - or - or EW+); + or ({Xc+ & {Xd-}} & CO+) + or ({Xc+ & {Xd-}} & EB-) + or Qe+ + or + or EW+); occasionally.e often.e originally.e: {EE- or EF+} & ( ({Xd- & Xc+} & (MVa- or E+)) or ({Xc+ & {Xd-}} & CO+) - or EB- + or ({Xc+ & {Xd-}} & EB-) or Qe+ or ); @@ -9054,22 +9521,23 @@ % (EEh- & {Qe+}): "How quickly?" % : "Onward came the cavalry" % ECa+: "It is vastly cheaper" +% XXX TODO the PH- should be split into PHc- and PHv- versions. : ({EE- or EF+} & (({Xd- & Xc+} & MVa-) or Em+ - or ECa+ + or ({PH-} & ECa+) or ({Xc+ & {Xd-}} & COa+) or Qe+ or or or - or [[EA+]])) + or [[{PH-} & EA+]])) or (EEh- & {Qe+}); -% XXX fixme: there are things in there, like "tall.e" that seem not to +% XXX FIXME: there are things in there, like "tall.e" that seem not to % belong?? 
-% XXX fixme: the Qe construction is weird: +% XXX FIXME: the Qe construction is weird: % "How slickly did he talk?" % "*How slickly did you say it was?" /en/words/words.adv.1: ; @@ -9104,8 +9572,10 @@ & CO+) or EB- or Qe+ or )) or ({EA- or EF+} & (Pa- or AF+)); certainly possibly probably importantly remarkably interestingly: -{EE-} & (E+ or (Xd- & Xc+ & (E+ or MVa-)) or ({Xc+ & {Xd-}} & CO+) or -({Xc+ & {Xd-}} & EB-)); + {EE-} & (E+ + or (Xd- & Xc+ & (E+ or MVa-)) + or ({Xc+ & {Xd-}} & CO+) + or ({Xc+ & {Xd-}} & EB-)); % --------------------------------------------------------- % ordinary clausal adverbs @@ -9134,7 +9604,7 @@ or ({MVp+} & {Xc+ & {Xd-}} & CO+) or ({Xc+ & {Xd-}} & EBm-); -not_suprisingly if_nothing_else: +not_surprisingly if_nothing_else: E+ or (Xd- & Xc+ & (E+ or MVa-)) or ({Xc+ & {Xd-}} & CO+) @@ -9147,20 +9617,37 @@ % Nearly identical to words.adv.2, but do not force the EBm- % Wt-: single-word sentence: "Evidently" % Wt- & Pv+: "Evidently so" -still.e presumably undoubtedly evidently apparently usually typically perhaps: +% EB- & EA+: "... or perhaps taller" +still.e presumably undoubtedly evidently apparently +usually typically perhaps: E+ or (Xd- & Xc+ & (E+ or MVa-)) or (Wt- & ({Xc+} or Pv+ or N+)) or ({Xc+ & {Xd-}} & CO+) - or EB-; + or (EB- & {[EA+]-0.1}); -in_fact of_course in_effect for_example for_instance e.g. i.e. : +% scilicet +% "it [sc. gouache] was also popular in France" +% MX*tn- & O+: scilicet nouns +% MX*ta- & P+: scilicet adjectives +% MVx- & P+: to verbs "While walking [scilicet in the evening] ..." +% Some of these sc. constructions should forcefully require a comma. +in_fact of_course in_effect for_example for_instance +e.g. eg. i.e. ie. id_est +sc sc. 
scilicet viz.r viz..r videlicet +that_is that_is_to_say to_wit namely.r in_other_words: E+ or (Xd- & Xc+ & (E+ or MVa-)) or ({Xc+ & {Xd-}} & CO+) - or (EB- & {Xc+}) - or (Xd- & EB- & Xc+) - or ({Xd-} & & (Wd+ or Wp+ or Wr+)); + or ({Xd-} & & (Wd+ or Wp+ or Wr+)) + or [EB- & {Xc+}] + or [Xd- & EB- & Xc+] + or (Xd- & + ((MX*tn- & {Xc+} & O+) or + (MX*ta- & {Xc+} & P+) or + (MVx- & {Xc+} & P+)) + & (Xc+ or )); + % ----------------------------------------------------------- % ADVERBS USABLE POST_VERBALLY OR AS OPENERS @@ -9168,16 +9655,33 @@ % by the UNKNOWN-WORD.a rule -- "Umm, I think he did it." no.e nope.e nah.e no_way yes.e yeah.e yep.e yup.e -ok.e okay.e OK.e fine.e sure.e whatever.e -someday.e sometime.e thereafter.e anyhow.e anyway.e overall.e -afterwards.e afterward.e lengthwise.e worldwide.e nationwide.e -statewide.e world-wide.e nation-wide.e state-wide.e industrywide.e -instead.e maybe.e: +ok.e okay.e OK.e fine.e sure.e whatever.e: ({Xc+ & {Xd-}} & CO+); +% EB-: "Would they have accepted this or, instead, would they have...?" +thereafter.e overall.e lengthwise.e +instead.e anyhow.e anyway.e: + + or ({Xd- & Xc+} & (MVp- or E+ or EB-)); + +% Wa-: Single-word responses to questions. +someday.e sometime.e maybe.e +afterwards.e afterward.e worldwide.e nationwide.e +statewide.e world-wide.e nation-wide.e state-wide.e industrywide.e +the_world_over: + + or ({Xd- & Xc+} & (MVp- or E+)) + or (Wa- & {Wa+}); + +% Comparative form of maybe, similar to "perhaps" +% EB- & EA+: "She was John's age or maybe older" +maybe.c: + EB- & EA+; + % Argumentatives (children gain-saying). not.intj is_too is_not is_so unh_unh: Wa-; +%suppress: DUP-BASE (for seriously.ij) % Openers to directives, commands (Ic+ connection to infinitives) % or single-word interjections, exclamations. % These are semantically important, so they've got to parse! 
@@ -9237,19 +9741,20 @@ sacre_bleu ay caramba kamoley kamoly moley moly holy_Moses mother_of_God Mother_of_God -mother_of_God mama_mia mamma_mia +mama_mia mamma_mia sonuvabitch son_of_a_bitch heck sodding_hell aw aww awww oh_great oh_wow er err.ij errr um.ij umm anyways honey.ij man.ij baby.ij hush.ij: + or or ({{Ic-} & Wi-} & {{Xd-} & Xc+} & Ic+) or or (({A-} or {E-} or {EE-}) & Wa-); % A single plain hello all by itself. Costly, because these days, -% its not normally a sentence opener. +% it's not normally a sentence opener. % Vv-: "I said, hello!" % Ds- & Jv-: "He greeted me with a loud hello" % Perhaps these should also appear as nouns? hello.n does ... @@ -9274,9 +9779,7 @@ on_arrival by_request in_total in_turn later_on for_now more_often_than_not -anyhow.ij anyway.ij afterwards.ij afterward.ij instead.ij on_second_thought - again_and_again time_and_again over_and_over day_by_day day_after_day step_by_step one_by_one even_so all_of_a_sudden: @@ -9296,10 +9799,11 @@ newly: E+; rightly: {EE-} & E+; necessarily no_longer: E+ or EBm-; -ever: E+ or EBm- or EC+ or MVa-; +ever: E+ or EBm- or EC+ or MVa- or ; -never.e always: {EN-} & (E+ or EB-); -seldom rarely.e: {EE-} & (E+ or EB-); +never.e always: ({EN-} & (E+ or EB-)) or ; +%suppress: DUP-BASE (for rarely.e) +seldom rarely.e: ({EE-} & (E+ or EB-)) or ; % MVa-: "He did just what you asked." 
% EC+: "I ate just some of the cookies" @@ -9339,6 +9843,7 @@ or (Rnx+ & ) or (MVp+ & Wq- & Q+); +%suppress: DUP-BASE (for rarely.i) never.i at_no_time not_once rarely.i since_when: {MVp+} & Wq- & Q+; @@ -9352,7 +9857,7 @@ or ((MVp+ or MVa+ or MVs+) & (MVl- or ({Xc+ & {Xd-}} & CO+))) or (Rnx+ & ); -not_even: Rnx+ & ; +not_even: (Rnx+ & ) or ; % {EE-} & EE+: "but not too much" too: @@ -9382,7 +9887,7 @@ % ========================================================== % MISCELLANEOUS WORDS AND PUNCTUATION -etc: {Xi-} & Xd- & Xc+ & (MX- or MVa-); +etc etc.: {Xi-} & Xd- & Xc+ & (MX- or MVa-); so_on the_like vice_versa v.v.: ( & ) or or @@ -9390,7 +9895,7 @@ M- or MV-; % Emoticons ... at start or end of sentences ... -EMOTICON : + : CO+ or (Wd- & NIa+) or Wa- @@ -9403,11 +9908,11 @@ % Xp+ is for new sentences. "Who is Obama? Where was he born?" % Xs+ is for dependent clauses starting with "so". % "I stayed so I could see you." -% XXX TODO: afer all WV's work, the WV link should no longer be optional... +% XXX TODO: after all WV's work, the WV link should no longer be optional... % XXX that is, change to just WV+. % : - ( or ) & {CP+} & {(Xx+ or Xp+ or Xs+) & {hWV+}} & {RW+ or Xp+}; + ( or ) & {hCPx+ or hCPi+ or hCPu+} & {(Xx+ or Xp+ or Xs+) & {hWV+}} & {RW+ or Xp+}; % QU+ links to quoted phrases. % ZZZ+ is a "temporary" addition for randomly-quoted crap, and @@ -9415,6 +9920,7 @@ % (Xc+ or [()]): allow missing comma, but at a price. LEFT-WALL: + or hCPa+ or (QUd+ & & (Xc+ or [()]) & QUc+) or [[ZZZ+ & ]]; @@ -9439,7 +9945,7 @@ ({EBb-} & EAxk+ & {HA+}) or ({EZ-} & EExk+) or Em+ - or ((({Xd-} & (MVs- or []) & Xs-) or ({Xc+} & Wc-)) + or ((({Xd-} & ([MVs-]0.5 or ) & Xs-) or ({Xc+} & Wc-)) & ( or []0.5)) or or (Wq- & CQ+) @@ -9449,6 +9955,9 @@ or O- or Js-; +and_so and_thus: + ; + % Is ever needed here? % Should we be using instead of MVs- ?? % Or maybe every use of MVs- should be converted to ??? 
@@ -9526,17 +10035,19 @@ % % Comma can conjoin nouns only if used in a list of 3 or more items: % "This, that and the other thing" -% However, this is given a cost, so that geographic names are prefered: +% However, this is given a cost, so that geographic names are preferred: % "He went to Gaeta, Italy, and to Paris, France." % % SJ: "I saw John, not Mary" is handled via idiomatic ,_not construction +% cost on []: allow Pa links with commas, e.g. +% "he paced, worried" but lower cost than Xx links % ",": - ({@Xca- or [[[@Xc-]]]} & (({[EBx+]} & Xd+) or Xc-)) + ({[@Xca-]-0.05 or [[[@Xc-]]]} & (({[EBx+]} & Xd+) or Xc-)) or [] or or - or + or []0.5 or (SJl- & SJr+ & SJl+); % :.j @@ -9560,18 +10071,21 @@ or (TH- & ); % Coordinating conjunctions that behave like punctuation. These -% connect whole clauses. Should wee use here? Why not? +% connect whole clauses. Should we use here? Why or why not? % +% then.ij is often used as a time-ordered conjunction: "I eat then I sleep" % not.ij seems to result in bad parses quite often, do we need it? % Xx-: provides coordination to the wall. % The cost on [] is to use the Xx when possible, because % the VC link often does not go leftwards far enough. % (e.g. "John screamed when I arrived but Sue left") % Wc-: "But my efforts to win his heart have failed" -but.ij and.ij or.ij not.ij also.ij but_not and_not and_yet: - [{Xd-} & (Xx- or Wc-) & {Xc+} +% EB+: "would they have accepted this or, instead, would they have ..." +but.ij and.ij or.ij not.ij also.ij then.ij but_not and_not and_yet: + [{Xd-} & (Xx- or Wc-) & {Xc+} & {EB+} & (Wdc+ or Qd+ or Ws+ or Wq+ or Ww+) & ]1.1; +%suppress: DUP-BASE (for ..y) % (NI- & WV- & W+): Optionally numbered, bulleted lists ..y *.j "•" ⁂ ❧ ☞ ◊ ※ "….j" ○ 。 ゜ ✿ ☆ * ◕ ● ∇ □ ◇ @ ◎: (Wd- & W+) @@ -9579,7 +10093,8 @@ % 、 is the "enumeration-comma" used like a dash ... 
% ‧ is the "middle dot" -– ━ ー --.r -.r 、 ~.r ~.r ‧.r : +% The four dashes are e28092 e28093 e28094 e28095 +‒ – — ― ━ ー --.r -.r 、 ~.r ~.r ‧.r : [[]] or ({@Xca-} & (({EBx+} & Xd+) or Xc-)) or (Wd- & W+) @@ -9623,9 +10138,14 @@ % The percent sign following a number (also basis pt, per mil) % Also -- see above, for handling of 12ft. 12in. not just 12% -"%" ‰ ‱ : (ND- & {DD-} & & ) or (ND- & (OD- or AN+)); +% AN- & Jp-: "... the concentration in v/v %" +"%" ‰ ‱ : + (ND- & {DD-} & & ) + or (ND- & (OD- or AN+)) + or ({E- or EA-} & A+) + or (AN- & Jp-); -% See also /en/words/currency for curency names that follow a number. +% See also /en/words/currency for currency names that follow a number. $ USD.c US$.c C$.c AUD.c AUD$.c HK.c HK$.c £ ₤ € ¤ ₳ ฿ ¢ ₵ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ₸ ₮ ₩ ¥ ៛ 호점 † †† ‡ § ¶ © ® ℗ № "#": @@ -9656,25 +10176,51 @@ % foo: F+; % ------------------------------------------------------------------------- -% Common typographical errors -% Asign a cost of 1.7 for no good reason. Feel free to change this. - -then.#than: [than.e]1.7; -than.#then-r: [then.r]1.7; -than.#then-i: [then.i]1.7; - -% rather_then: rather_than; - -there.#their: [their.p]1.7; -% theres.#theirs: [theirs.p]1.7; - -% there.#they're: [they're]1.7; -% all.#all_of: [all_of]1.7; +% Common typographical (spelling) errors +% +% Assign a cost of 1.65 for no very good reason. Feel free to change this. +% .. well, some of this is dangerous. For example, setting the cost too +% low causes correct "than" usages with Z link (and missing wall) to be +% priced higher then "typos" with "then". +% +% These perform a kind-of poor-man's rewriting. A more principled, +% elevated design would define and use an "operator grammar" type +% approach. 
See https://en.wikipedia.org/wiki/Operator_grammar + +then.#than: [than.e]0.65; +than.#then-r: [then.r]1.65; +than.#then-i: [then.i]1.65; +than.#then-ij: [then.ij]0.65; + +rather_then.#rather_than: rather_than; + +there.#their: [their.p]0.65; +% theres.#theirs: [theirs.p]0.65; + +% The apostrophes prevent the dictionary from being read in. +% there.#they're: [they're]0.65; +% all.#all_of: [all_of]0.65; +% its.#it's: [it's]0.65; +% lets.#let's: [let's]0.65; % Using "or" instead of "nor" is a common mistake. % "Neither this or that one will do" or.#nor-j-n: [nor.j-n]; +% Hmm. "there lie the books" smells like present-tense: the books are +% there right now. "there lay the books" suggest past-continuous: they +% are not just there now, but always have been. So is it really a typo +% to say "lie" instead of "lay"? +lie.#lay-v-d: [lay.v-d]0.5; + +% i before e unless the weighty neighbor is a German loan word. +beleive.#believe-v: [believe.v]0.1; + +% Common shorts +thru.#through-r: [through.r]0.05; +nite.#night: [night.r or night.u or night.i or night.n]0.05; +tonite.#tonight: [tonight]0.05; + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Morphology guessing for unknown words. 
% Given a particular kind of ending to a word, try to guess @@ -9687,7 +10233,7 @@ % ING-WORDS.g: ( & ) or ( & ) % or or ; -ING-WORDS.g: +.g: [[( & )] or [( & )] or [({@E- or EA-} & A+)] or @@ -9701,13 +10247,13 @@ AN+ or {AN-})]0.1; -ED-WORDS.v-d: +.v-d: [ VERB_SPPP_T(`') or or ]0.1; -S-WORDS.v: [ VERB_S_T(`') ]0.1; +.v: [ VERB_S_T(`') ]0.1; -S-WORDS.n: +.n: [( & (({NM+ or Dmc-} & & ( or Bpm+)) or ({NM+ or Dmc-} & ) or @@ -9716,7 +10262,7 @@ Up-)) or [[AN+]]]0.1; -LY-WORDS.e: +.e: [{EE- or EF+} & ( ({Xd- & Xc+} & MVa-) or Em+ @@ -9734,36 +10280,36 @@ % guessed nouns that can be mass or countable % (-in, -ine, -ion, -yl, -ose, -ol, -ide, -ity) -MC-NOUN-WORDS.n: +.n: []0.1; -% guessed nouns that are signular countable (-on, -or) -C-NOUN-WORDS.n: +% guessed nouns that are singular countable (-on, -or) +.n: []0.1; % guessed adjectives (-ous, -ar, -ic) -ADJ-WORDS.a: - []0.1; +.a: + [ or ]0.1; % guessed adjectives/adverbs suffixed by "fold" with or without hyphen -FOLD-WORDS: +: [({EN-} & (MVp- or EC+ or A+)) or Em+]0.1; % latin (postposed) adjectives considered as mass nouns % in the current version (is this right???) -LATIN-ADJ-WORDS.a: +.a: []0.1; % latin (postposed) adjectives or latin plural noun % always considered as nouns in the current version % XXX maybe should be same as words.n.2.x instead of ??? -LATIN-ADJ-P-NOUN-WORDS: +: []0.1; % latin (postposed) adjectives or latin singular noun % always considered as nouns in the current version % XXX this is with weird plural-like stuff ?? is this right? -LATIN-ADJ-S-NOUN-WORDS: +: [ & (AN+ or ({NM+ or D*u-} & & ( or )) @@ -9775,7 +10321,7 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Guessing of unknown words, if none of the above rules applied. 
-UNKNOWN-WORD.n: +.n .n: & (AN+ or ({NM+ or ({Jd-} & D*u-)} & & ( or )) @@ -9785,12 +10331,13 @@ or (GN+ & (DD- or [()])) or U-); -UNKNOWN-WORD.v: - {@E-} & ((Sp- & ) or (RS- & Bp-) or (I- & ) or ({Ic-} & Wa- & )) & {O+ or } & {@MV+}; +.v .v: + {@E-} & ((Sp- & ) or (RS- & Bp-) or (I- & ) or ({Ic-} & Wa- & )) & {O+ or } & ; -% Add a miniscule cost, so that the noun-form is prefered, when -% availble. -UNKNOWN-WORD.a: []0.04; +% Add a miniscule cost, so that the noun-form is preferred, when +% available. +.a .a: + [ or ]0.04; % These are the link-types that are not subject to the length limit. % Always use "+" for these. Some of these are obvious. Some deserve @@ -9801,9 +10348,18 @@ % many others, the object of this sentence" requires a long O link % from "puts" to "object". % +% VJ+ is unlimited because some sentences have very long clauses: +% "He obtained the lease of the manor of Great Burstead Grange (near +% East Horndon) from the Abbey of Stratford Langthorne, and purchased +% the manor of Bayhouse in West Thurrock." +% UNLIMITED-CONNECTORS: - S+ & O+ & CO+ & C+ & Xc+ & MV+ & TH+ & W+ - & RW+ & Xp+ & Xx+ & CP+ & SFsx+ & WV+ & CV+; + S+ & O+ & CO+ & C+ & Xc+ & MV+ & TH+ & W+ & + RW+ & Xp+ & Xx+ & CP+ & SFsx+ & WV+ & CV+ & + VJ+ & SJ+; + +% The YS, YP, PH and ZZZ connectors can never be longer than one. +LENGTH-LIMIT-1: YS+ & YP+ & PH+ & ZZZ+; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Extensions by Peter Szolovits, psz@mit.edu, as a part of the work for @@ -9813,7 +10369,7 @@ % subject_to status_post in_conjunction_with sensu - in_relation_to neath amidst across_from circa astride + in_relation_to neath amidst across_from circa ca. c. 
astride previous_to together_with as_regards s/p aka amongst unto apropos_of w.i W.i: ({JQ+} & (J+ or Mgp+) & ) or (MVp- & B-); diff -Nru link-grammar-5.3.16/data/en/4.0.knowledge link-grammar-5.5.0/data/en/4.0.knowledge --- link-grammar-5.3.16/data/en/4.0.knowledge 2016-09-20 01:51:55.000000000 +0000 +++ link-grammar-5.5.0/data/en/4.0.knowledge 2018-03-12 01:29:05.000000000 +0000 @@ -32,7 +32,7 @@ ;W Ce Cs Ca Cc Ci R* Rn Re RSe Mr QI#d Mv* Jr Mj Qd W Ce Cs Ca Cc Ci R* Re RSe Mr QI#d Mv* Jr Mj Qd TOn TOi Mg* MVi Ss#d Bsd ER Z Ma#* SIs#g BIqx MX#p MX#a - MX#r MX#j MV#o MV#p Eq COq VCq AFd PFc + MX#r MX#j MV#c MV#o Eq COq VCq AFd PFc @@ -88,10 +88,11 @@ ; CV: similar to WV above, must form a loop along with Ce (typically). ; Ce links noun of subordinate clause to the dominating phrase, while ; CV links the verb of the subordinate clause to the dominating phrase. +; VC links to verb in paraphrasing and subordination situations ; Both the linked noun and verb must be in the same subordinate clause. 
; Example: "I wish I could see him 100 times a day" -MUST_FORM_A_CYCLE_LINKS: R#* TOt EXx HA SFsic Jd Jr JQ OFd Xca WV CV IV +MUST_FORM_A_CYCLE_LINKS: R#* TOt EXx HA SFsic Jd Jr JQ OFd Xca WV CV IV VC CPu @@ -149,8 +150,8 @@ TOn e TOi e MVi e + MV#c s MV#o s - MV#p s AFd s PFc s Mg* e @@ -246,7 +247,7 @@ SFIst , O*t Ost Omt Bs#t B*#t Bc#t , "Bad use of 'there'34" , SFIp , O*t Opt Omt Bp#t B*#t Bc#t , "Bad use of 'there'35" , OXt , O#t B##t , "Bad use of 'there'36" , - SFsi* , TOi THi QIi TSi O#i Ci THb CPi + SFsi* , TOi THi QIi TSi O#i Ci THb CPi CPu COqi CPi Eqi AFdi BIh , "Bad use of 'it'37" , SFIsi , TOi THi QIi TSi O#i Ci THb CPi COqi CPi Eqi AFdi BIh , "Bad use of 'it'38" , @@ -265,7 +266,7 @@ , "Bad comparative46" , MV#o , D##m D##y Om Oy Jm Jy Am MX#m , "Bad comparative47" , - MV#p , EEm MVb Dm#m EEy D##y MVm Om Oy + MV#c , EEm MVb Dm#m EEy D##y MVm Om Oy Jm Jy Am MX#m , "Bad comparative48" , Pafc , EB#m EB#y , "Bad comparative49" , @@ -292,6 +293,11 @@ Mam , TO TOf TH MVp TOt QI OF MVt MVz MVh Ytm Ya , "Bad use of adjective64" , ; +; The below blocks the singular "I want a gift inexpensive", while +; allowing Pa**j to be used with other kinds of objects. +; + Pa##j , Osm Op Ox Os*e OXi , "Bad predicative adj 62" , +; ; The rule below sounds nice, in principle, in preventing some ; nasty constructions ... however, it also prevents some good ; ones, too. e.g. "The dog, unsatisfied, barked loudly" diff -Nru link-grammar-5.3.16/data/en/4.0.regex link-grammar-5.5.0/data/en/4.0.regex --- link-grammar-5.3.16/data/en/4.0.regex 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/4.0.regex 2018-04-27 00:35:36.000000000 +0000 @@ -23,50 +23,55 @@ % % Allows at most two colons in hour-minute-second HH:MM:SS expressions % Allows at most two digits between colons -HMS-TIME: /^[0-9][0-9]?(:[0-9][0-9]?(:[0-9][0-9]?)?)?(AM|PM|am|pm)?$/ +: /^[0-9][0-9]?(:[0-9][0-9]?(:[0-9][0-9]?)?)?$/ % e.g. 1950's leading number can be higher, for science fiction. 
% Must be four digits, or possible three. Must end in s, 's ’s -DECADE-DATE: /^([1-4][0-9][0-9]|[1-9][0-9])0(s|'s|’s)$/ +: /^([1-4][0-9][0-9]|[1-9][0-9])0(s|'s|’s)$/ % Similar to above, but does not end in s. Only allows four digits. % We process this before NUMBERS below, so that this is matched first. -YEAR-DATE: /^([1-4][0-9]{3}|[1-9][0-9]{0,2})$/ +: /^([1-4][0-9]{3}|[1-9][0-9]{0,2})$/ % Day-of-month names; this regex will match before the one below. -DAY-ORDINALS: /^(1st|2nd|3rd|[4-9]th|1[0-9]th|2(0th|1st|2nd|3rd|[4-9]th)|30th|31st)$/ +: /^(1st|2nd|3rd|[4-9]th|1[0-9]th|2(0th|1st|2nd|3rd|[4-9]th)|30th|31st)$/ % Ordinal numbers; everything except 1st through 13th % is handled by regex. -ORDINALS: /^[1-9][0-9]*(0th|1st|2nd|3rd|[4-9]th)$/ +: /^[1-9][0-9]*(0th|1st|2nd|3rd|[4-9]th)$/ % Allows any number of commas or periods % Be careful not match the period at the end of a sentence; % for example: "It happened in 1942." -NUMBERS: /^[0-9,.]*[0-9]$/ +: /^[0-9,.]*[0-9]$/ % This parses signed numbers and ranges, e.g. "-5" and "5-10" and "9+/-6.5" -NUMBERS: /^[0-9.,-]*[0-9](\+\/-[0-9.,-]*[0-9])?$/ +: /^[0-9.,-]*[0-9](\+\/-[0-9.,-]*[0-9])?$/ % Parses simple fractions e.g. "1/60" with no decimal points or anything fancy -FRACTION: /^[0-9]+\/[0-9]+$/ +: /^[0-9]+\/[0-9]+$/ % "10(3)" exponent (used in PubMed) -NUMBERS: /^[0-9.,-]*[0-9][0-9.,-]*\([0-9:.,-]*[0-9][0-9.,-]*\)$/ +: /^[0-9.,-]*[0-9][0-9.,-]*\([0-9:.,-]*[0-9][0-9.,-]*\)$/ % Roman numerals % The first expr has the problem that it matches an empty string. The % cure for this is to use look-ahead, but neither the Gnu nor the BSD % regex libs support look-ahead. I can't think of a better solution. 
-ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/ +: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/ % ROMAN-NUMERAL-WORDS: /^(?=(M|C|D|L|X|V|I)+)M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/ % ROMAN-NUMERAL-WORDS: /^(?=.+)M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/ % Strings of initials. e.g "Dr. J.G.D. Smith lives on Main St." % Make it at least two letters long, as otherwise it clobbers % single-letter handling in the dict, which is different. -INITIALS: /^[A-Z]\.([A-Z]\.)+$/ +: /^[A-Z]\.([A-Z]\.)+$/ + +% Strings of two or more upper-case letters. These might be initials, +% but are more likely to be titles (e.g. MD LLD JD) and might also +% be part numbers (see below, PART-NUMBER:) +: /^[A-Z]([A-Z])+$/ % Greek letters with numbers -GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\-?[0-9]+$/ -PL-GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)s\-?[0-9]+$/ +: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\-?[0-9]+$/ +: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)s\-?[0-9]+$/ % Some "safe" derived units. Simple units are in dictionary. % The idea here is for the regex to match something that is almost @@ -87,16 +92,16 @@ % remember "mm"! 
% grams/anything -UNITS: /^([npmk]|milli|micro|nano|pico|femto|atto|kilo|mega|tera)?(g|grams?)\// +: /^([npmk]|milli|micro|nano|pico|femto|atto|kilo|mega|tera)?(g|grams?)\// % mol/anything -UNITS: /^([fnmp]|milli|micro|nano|pico|femto|atto|mu)?mol(es)?\// +: /^([fnmp]|milli|micro|nano|pico|femto|atto|mu)?mol(es)?\// % common endings -UNITS: /^[a-zA-Z\/.]+\/((m|micro)?[lLg]|mg|kg|mol|min|day|h|hr)$/ +: /^[a-zA-Z\/.]+\/((m|micro)?[lLg]|mg|kg|mol|min|day|h|hr)$/ % common endings, except in the style "mg.kg-1" instead of "mg/kg". -UNITS: /^[a-zA-Z\/.1-]+\.((m|micro)?[lLg]|mg|kg|mol|min|day|h|hr)(-1|\(-1\))$/ +: /^[a-zA-Z\/.1-]+\.((m|micro)?[lLg]|mg|kg|mol|min|day|h|hr)(-1|\(-1\))$/ % combinations of numbers and units, e.g. "50-kDa", "1-2h" % TODO: Clean up and check that these are up-to-date wrt the @@ -115,12 +120,12 @@ % spelled-out numbers, anything is allowed between the "initial" number % and "fold" to catch e.g. "two-to-three fold" ("fourteen" etc. are absent % as the prefix "four" is sufficient to match). -FOLD-WORDS: /^[0-9.,:-]*[0-9]([0-9.,:-]|\([0-9.,:-]*[0-9][0-9.,:-]*\)|\+\/-)*-?fold$/ -FOLD-WORDS: /^(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fifteen|twenty|thirty|fifty|hundred|thousand|million).*fold$/ +: /^[0-9.,:-]*[0-9]([0-9.,:-]|\([0-9.,:-]*[0-9][0-9.,:-]*\)|\+\/-)*-?fold$/ +: /^(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fifteen|twenty|thirty|fifty|hundred|thousand|million).*fold$/ % Plural proper nouns. % Make sure that apostrophe-s is split out correctly. -PL-CAPITALIZED-WORDS: /^[[:upper:]].*[^iuoys'’]s$/ +: /^[[:upper:]].*[^iuoys'’]s$/ % Other proper nouns. % We demand that these end with an alphanumeric, i.e. explicitly @@ -128,7 +133,7 @@ % commas, colons, or periods/question-marks at the end of sentences. % In addition, this must not swallow words ending in 's 'll etc. % (... any affix, for that matter ...) 
and so no embedded apostrophe -CAPITALIZED-WORDS: /^[[:upper:]][^'’]*[^[:punct:]]$/ +: /^[[:upper:]][^'’]*[^[:punct:]]$/ % SUFFIX GUESSING % For all suffix-guessing patterns, we insist that the pattern start @@ -138,20 +143,20 @@ % Fred followed." Since "(carrying" is not in the dict, we need to be % sure to not match the leading paren so that it will get tripped. % -ING-WORDS: /^\w.+ing$/ +: /^\w.+ing$/ % Plurals or verb-s. Make sure that apostrophe-s is split out correctly. % e.g. "The subject's name is John Doe." should be % +--Ds--+---YS--+--Ds-+ % | | | | % the subject.n 's.p name.n -S-WORDS: /^\w.+[^iuoys'’]s$/ +: /^\w.+[^iuoys'’]s$/ % Verbs ending -ed. -ED-WORDS: /^\w.+ed$/ +: /^\w.+ed$/ % Advebs ending -ly. -LY-WORDS: /^\w.+ly$/ +: /^\w.+ly$/ % Nouns ending in -ism, -asm (chiliasm .. ) Usually mass nouns % Stubbed out for now; I'm not convinced this improves accuracy. @@ -163,8 +168,8 @@ % AST-WORDS: /^\w.+ist$/ % Corresponding adjectival form of above -ADJ-WORDS: /^\w.+astic$/ -ADJ-WORDS: /^\w.+istic$/ +: /^\w.+astic$/ +: /^\w.+istic$/ % Nouns ending -ation stubbed out in BioLG, stub out here ... %ATION-WORDS: /^\w.+ation$/ @@ -179,23 +184,23 @@ % glycosylphosphatidylinositol % iodide, oligodeoxynucleotide % chronicity, hypochromicity -MC-NOUN-WORDS: /^\w.+ase$/ -MC-NOUN-WORDS: /^\w.+ene$/ -MC-NOUN-WORDS: /^\w.+ine?$/ -MC-NOUN-WORDS: /^\w.+yl$/ -MC-NOUN-WORDS: /^\w.+ion$/ -MC-NOUN-WORDS: /^\w.+ose$/ -MC-NOUN-WORDS: /^\w.+ol$/ -MC-NOUN-WORDS: /^\w.+ide$/ -MC-NOUN-WORDS: /^\w.+ity$/ +: /^\w.+ase$/ +: /^\w.+ene$/ +: /^\w.+ine?$/ +: /^\w.+yl$/ +: /^\w.+ion$/ +: /^\w.+ose$/ +: /^\w.+ol$/ +: /^\w.+ide$/ +: /^\w.+ity$/ % Can take TOn+. 
Must appear after above, to avoid clash with +ity -NOUN-TO-WORDS: /^\w.+ty$/ -NOUN-TO-WORDS: /^\w.+cy$/ -NOUN-TO-WORDS: /^\w.+nce$/ +: /^\w.+ty$/ +: /^\w.+cy$/ +: /^\w.+nce$/ % replicon, intron -C-NOUN-WORDS: /^\w.+o[rn]$/ +: /^\w.+o[rn]$/ % adjectives % exogenous, heterologous @@ -204,24 +209,24 @@ % ribosomal, ribsosomal % nonpermissive, thermosensitive % inducible, metastable -ADJ-WORDS: /^\w.+ous$/ -ADJ-WORDS: /^\w.+ar$/ -ADJ-WORDS: /^\w.+ic$/ -ADJ-WORDS: /^\w.+al$/ -ADJ-WORDS: /^\w.+ive$/ -ADJ-WORDS: /^\w.+ble$/ +: /^\w.+ous$/ +: /^\w.+ar$/ +: /^\w.+ic$/ +: /^\w.+al$/ +: /^\w.+ive$/ +: /^\w.+ble$/ % latin (postposed) adjectives % influenzae, tarentolae % pentosaceus, luteus, carnosus -LATIN-ADJ-WORDS: /^\w.+ae$/ -LATIN-ADJ-WORDS: /^\w.+us$/ % must appear after -ous in this file +: /^\w.+ae$/ +: /^\w.+us$/ % must appear after -ous in this file % latin (postposed) adjectives or latin plural noun % brevis, israelensis % japonicum, tabacum, xylinum -LATIN-ADJ-P-NOUN-WORDS: /^\w.+is?$/ -LATIN-ADJ-S-NOUN-WORDS: /^\w.+um$/ +: /^\w.+is?$/ +: /^\w.+um$/ % Hyphenated words. In the original LG morpho-guessing system that @@ -235,7 +240,8 @@ % Explicitly call out (5'|3') so that we don't all a generic match to 'll % But something is funky about this 5'-3' business since 2' also matches ??? % /^[[:alnum:]][][:alnum:],:.\[-]*-[][:alnum:],:.\[-]*[[:alnum:]]$/ -HYPHENATED-WORDS: +: !/--/ +: /^[[:alnum:](5'|3')][][:alnum:](5'|3'),:.\(\)\[-]*-[][:alnum:],:.\(\)\[-]*[[:alnum:]]$/ % Emoticon checks must come *after* the above, so that the above take precedence. @@ -254,8 +260,8 @@ % % EMOTICON: /^[[:punct:];BDOpTX0578C☆ಠ●@◎~][[:punct:]?@[\\\]^_`{|}~;BDOpTX0578C☆ಠ●@◎~][!"#$%&'()*+,\-/:;<=>?@[\\\]^_`{|}~?@[\\^_`{|}~: !/^"|[[:alnum:]]+"$/ +: /^[[:punct:];BC☆ಠ●@◎~][-!"#$%&'()+,:;<=>?@[\\^_`{|}~: /^[A-Z0-9#][A-Z0-9$\/#]*[A-Z0-9$\/#,.-]*[0-9][A-Z0-9$\/#,.-]*[A-Z0-9$\/#]+$|^[1-9][0-9]+[\/-][0-9+]$/ +% Single, stand-alone "quoted" "words" (so-called "scare" quotes). 
+: /^"[[:alnum:].-]+"$/ + % Sequence of punctuation marks. If some mark appears in the affix table % such as a period, comma, dash or underscore, and there's a sequence of % these, then treat it as a "fill-in-the-blank" placeholder. @@ -277,4 +286,4 @@ % tokenizer explicitly mangles based on these punctuation marks. % % Look for at least four in a row. -UNKNOWN-WORD: /^[.,-]{4}[.,-]*$/ +: /^[.,-]{4}[.,-]*$/ diff -Nru link-grammar-5.3.16/data/en/corpus-basic.batch link-grammar-5.5.0/data/en/corpus-basic.batch --- link-grammar-5.3.16/data/en/corpus-basic.batch 2016-09-14 15:03:30.000000000 +0000 +++ link-grammar-5.5.0/data/en/corpus-basic.batch 2018-04-26 16:13:23.000000000 +0000 @@ -892,7 +892,7 @@ *When did Joe and John did leave the party % Huh ??? what's the correct parse/meaning of this sentence? -% "my dog's freind came" is one of the parses !!?? wtf .. +% "my dog's friend came" is one of the parses !!?? wtf .. % My dog, cat, and cousin's friend came *My dog, cat, horse, mouse, and his cow left My dog, cat, horse, and mouse, and his cow left diff -Nru link-grammar-5.3.16/data/en/corpus-fixes.batch link-grammar-5.5.0/data/en/corpus-fixes.batch --- link-grammar-5.3.16/data/en/corpus-fixes.batch 2016-09-28 01:23:32.000000000 +0000 +++ link-grammar-5.5.0/data/en/corpus-fixes.batch 2018-04-26 23:19:58.000000000 +0000 @@ -36,7 +36,7 @@ He sunburns easily. We had the painter texture the ceiling for us. This particular ceramic finish crazes easily. -Hi, can you please freind me? +Hi, can you please friend me? I coded up the program. I hard-coded the answer. I always hard-code. @@ -79,14 +79,48 @@ She volunteered as a nurse. She volunteers as a nurse. +He provided for the transport of horses. 
+she provided for the boy +He signalled for backup artillery +It repeated for ten seconds + +he won't divulge what type it is + +he washed up +he washed out +the five seat washed out in the final sprint +he cut out +he cut out after fifth period +he cut class after fifth period +he cut out when the phone rang +he dropped out after fifth grade + +we deliver! +he really delivers, doesn't he? +he really smells, doesn't he? He put forth an argument against sexism He puts forward an argument against sexism The Vatican declaration put forward an argument against sexism +He poots forth a quarter-ounce green rosette near the summit of a dense but radiant muffin of his own design It has a hardness of 0.34 Brinell units. It has a tensile toughness of 0.34 joules per cubic metre +I beleive he suspects +he suspects already +he will suspect soon +he knows already + +it helps that you know already +it helped that you knew already +It helped that you know already +it helps that you have a clear idea of what you want. +it helps for you to have a clear idea of what you want. +it helps when you have a clear idea of what you want. +it helps if you have a clear idea of what you want. + + % Parses, but not correctly! % Any expression with an exclamatory "right" mis-parses. *It is sitting right over here. @@ -404,21 +438,6 @@ Use whatever the fuck pleases you. Do whatever the fuck pleases you. -% intransitive verbs taking prepositions -He grabbed at it - -% transitive verbs taking prepositions -He dished it out at the unsuspecting onlookers. -I ladled it carefully into a bowl. -You should ladle it out. -He ladles it with a large wooden spoon. -He was ladling the molten steel when the crucible tipped. -He ladled the molten hot metal into the mold. -It was molten hot. -It was piping hot. -It was devilishly hot. 
- - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % ========================================================== % NOUNS misclassified and mangled @@ -455,6 +474,33 @@ A length of chain was wrapped around the tree. He left town early next morning. I gave her a check for deposit. +He moved towards the lift +We went on a shoot +he eats shoots +he eats shoots and leaves +he eats, shoots and leaves + +I got to first base +I got to first base with her. +she won first prize +Her solo flight across the Atlantic was a remarkable first. +They were responsible for several engineering firsts. + +That script is missing several server-side includes. +The unlinks are detected by the Khovanov homology over Z. +The inlining of that subroutine blows up the object. +equivariant cohomology is a cohomology theory from algebraic topology. + +% Misc Latin abbreviations +Kentaro Hori, et al. wrote the book "Mirror Symmetry". +I saw the deputy head re the incident +re: your letter, I am unclear as to its intent + +The church filed an in rem petition with the court +The expected in personam judgment was handed down +In re Gault was one of the most important cases + +the court held that the Arizona Juvenile Code impliedly includes the requirements of due process % ---------------- % nouns not in nouns.1 @@ -493,7 +539,7 @@ % Proper names (entities) in the form of noun modifiers % The problem is that some of these appear in entity names -% and thus must be explicitly listed in the dictionary. +% and thus must be explicitly listed in the dictionary. My name is Gilles de la Tourette. I live in New York City. I work for the First National Bank. @@ -516,13 +562,17 @@ He represents the Undersea Exploration Company He represents the Equatorial Antarctic Bank +% initials and abbreviations +The J. P. Morgan company is associated with it. +O. Cordius Candidus at Dover provided for the transport of horses. + % Confusing given names Finn pushed at the door. 
% Some given names can be proper names The late Mr. Tom will be sorely missed. -% Collistions between given names and oteh words +% Collistions between given names and other words % Should be OK now; given name 'Frank' collides with frank.a, frank.v Frank felt vindicated when his long time friend Bill revealed that he was the winner of the competition. @@ -604,6 +654,11 @@ That bitch did it! +I have a PhD +I have an MBA +I have an MFA +I have a MA + % =================================================================== % quasi-numeric determiners aka quantifying determiners % There are a zillion of these, mostly due to the product of the @@ -718,6 +773,11 @@ I ate all of this cake. I ate all this cake. +All the cookies! +All the people! +All the people deserve a tax-cut! +None of them do! + I made all these corrections to the dictionary. I drank none of the beer. @@ -814,9 +874,9 @@ I gave him a number of these I gave him a number of those -a number of these were found -a number of those were found -a number of them were found +a number of these were found +a number of those were found +a number of them were found He has wads of cash He commands several companies of men @@ -863,6 +923,11 @@ plenty of others thought so too +% Phantom "of", usually with "all", "both" - uses the AL link. +all the world's ablaze +all of the world is ablaze +all the world knows it's false + % =================================================================== % Subject verb inversion % Many examples from wikipedia @@ -885,7 +950,10 @@ far out in the sea lived a fish. A fish lived far out in the sea. -% Locative subject-object iversion +% fronted copular inversion +coming to the party are a horse and two chickens + +% Locative subject-object inversion under the tree sat Jim *Under the tree sat he. under the tree sits Jim, looking lonely @@ -989,6 +1057,7 @@ Sure enough, there it was, as I had sensed. Sure enough, there it was, as I had guessed. There it was, just as I had said. 
+then last week, I changed my mind % zero-subject/phantom-subject in sentence opener As mentioned before, we are creating a business service. @@ -1029,10 +1098,6 @@ Mere rain will not stay the mailman. That dog will bite! -He did as he pleased. -He does as he pleases. -He did as it pleased him. - % resolve can be intranstive He resolved on a course of action He decided on a course of action @@ -1224,6 +1289,30 @@ The coach sat me out for the inning. She is going to stand you up. +% intransitive verbs taking prepositions +He grabbed at it + +% transitive verbs taking prepositions +He dished it out at the unsuspecting onlookers. +I ladled it carefully into a bowl. +You should ladle it out. +He ladles it with a large wooden spoon. +He was ladling the molten steel when the crucible tipped. +He ladled the molten hot metal into the mold. +It was molten hot. +It was piping hot. +It was devilishly hot. + + +% present-continuous with prepositions. +Similar legislation is passing state-wide +The party is happening afterwards. +It is happening world-wide +It is happening everywhere +It is happening state-wide +It is happening the world over + + % More verbs taking prepositions We will party until dawn. We will party hardy until dawn. @@ -1285,7 +1374,33 @@ Do you have any ruled paper? I gave him some ruled paper. -% Pg+ links between verbs and gerunds +% Like above, everything in v.2.3, v.4.3, v.6.3 and v.8.3, +% but with Pa- links: so predicative adjectives +% Should these be using the Pa**j link instead ??? 
+It was delivered broken +It was delivered very broken +it was posted unstamped +it arrived broken +it arrived utterly smashed +it arrived crushed +it emerged untarnished +it washed up unbroken +he finished, beaten +it emerged untarnished +he left unseen +he left unwashed +he left, unwanted +he entered unwashed +he looked up, unshaven +it looked unwanted and abused +it looked broken +he posed undressed +he was posed, undressed +she was posed reclining +he paced, worried +he tailgated, angry + +% Pg+ links between verbs and gerunds He imagined hitting the ball. She imagined photographing the couple. He feared hitting the wall. @@ -1300,7 +1415,7 @@ I enjoy running the show. % Here a Ce+ link is used to attach to the subject -% i.e. the gerund, acting as a noun. e.g pushing, trusting +% i.e. the gerund, acting as a noun. e.g pushing, trusting He conceded pushing the button was a mistake. We concluded trusting him was a mistake. We concluded that trusting him was a mistake. @@ -1315,10 +1430,13 @@ It is a thinning of the herd The event features a running of the bulls +The constant plotting annoyed Jim +The constant professing of love annoyed Jim + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%% Predicative adjectives %%%%%%%%%%%%%%%%%%% % -% sit, stand must take Pa+ link ... +% sit, stand must take Pa+ link ... The house now sits vacant The house now stands vacant The house now sits empty @@ -1330,11 +1448,11 @@ Stay a little while. Stay a while. -% More Pa+ links: -He was presumed innocent +% More Pa+ links: Should these use Pa**j instead ?? +He was presumed innocent He was presumed innocent of the crime He was made aware of the crime -He was declared dead +He was declared dead He was declared dead after a cursory examination. he was declared dead of suffocation. @@ -1398,6 +1516,8 @@ They continued undaunted They continued, undaunted +He ad-libbed, uninterrupted. + % A few more ... 
He came unarmed He came unprotected @@ -1456,7 +1576,7 @@ He is too pathetic to make fun of He is too obnoxious to complain to -% paint, color, with color names; constructions like +% paint, color, with color names; constructions like % "pale yellow" mean that the color names are nouns (O*n) and % not predicative adjectives (Pa**j) Just paint it white @@ -1509,7 +1629,7 @@ Was the man drunk or crazy or both? %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% More predicative-style constructions +% More predicative-style constructions - when should these use Pa**j ?? He emerged, spirit unbroken He was set free, spirit unbroken. @@ -1564,6 +1684,7 @@ *She are going shopping. % Assorted prepositional difficulties. +Do you have a screwdriver handy? A bucket of chum sat near the tackle box. It was a sad bit of commentary to read. His works disclose a mastery of composition. @@ -1674,6 +1795,7 @@ % Faulkner It was a big dog, a hound with a strain of mastiff from somewhere +% Colloquial determiners Them there beans ought to be picked No one is sitting at that there table @@ -1684,6 +1806,14 @@ You are nearly about there. You are nearly almost there. +% Preposed locative determiners. +% Ideally, the below would use an MVI link (MV Inverted) link to +% the verb. However, that is not possible, because it would result in +% link-crossings with Wd to the subject. To avoid link crossings, +% a new kind of wall connection is needed -- the Wl link. +Thither he now transferred St Columba's remains from Iona. +Here the remains can be found. + % EW links where are you? about where are you? @@ -1732,7 +1862,7 @@ % % For an anaysis, and more related sentences, see % Paul Kay & Ivan A. Sag, "Cleaning up the Big Mess: Discontinuous -% Dependencies and Complex Determiners" (2012) chapter 5 in +% Dependencies and Complex Determiners" (2012) chapter 5 in % "Sign-Based Construction Grammar." 
% http://lingo.stanford.edu/sag/papers/kay-sag-final.pdf @@ -1748,7 +1878,7 @@ It was so big a dog that it filled the cage How big a dog did he buy? *How big did he buy? -% Actually, the above is allowed, e.g. while talking about +% Actually, the above is allowed, e.g. while talking about % house prices, the conclusion might be: "So, how big did he buy?" @@ -1770,7 +1900,7 @@ To what end? For what reason? % Elipsis doesn't work, because the Qd- link can't be used without -% the SI link, due to post-processing. Arghhh. +% the SI link, due to post-processing. Arghhh. By what means ...? In what way ...? To what end ...? @@ -1919,7 +2049,7 @@ *What he wants is the cats What he wants are the cats The cats are what John wants -The issues are that some pages will not load completely or load at all +The issues are that some pages will not load completely or load at all % shame commonly used as a count noun. He felt a deep shame. @@ -2061,16 +2191,20 @@ I want my coffee fresh *I want coffee fresh +% "I want the review brilliant." is very awkward, unless context +% suggests that there should have been a "that you write to be". +% Viz: "I want the review [that you will write to be] brilliant." +% which does not mean "I want the brilliant review" I want it shiny. I wish it was shiny. I want it brilliant. I want the review to be brilliant. -I want the review brilliant. -I want it done. +*I want the review brilliant. +I want it done. I want the job done. -I want it finished. +I want it finished. I want the job finished. -I want it over with. +I want it over with. I want these troubles over with. I wish it was over with. I want it completed. @@ -2078,7 +2212,7 @@ I want it fixed. I want it fixed up. I want it cleaned up. -I want it gone. +I want it gone. I wish it was gone. I want to be rid of it. @@ -2095,7 +2229,7 @@ The area remains under severe threat of fire. He poses a threat to security. -% Good sentences ... +% Good sentences ... I heard a bark in the distance. 
% verb modifiers for talk, consist, etc. @@ -2127,7 +2261,7 @@ He does that all of the time. He does that a lot of the time. He does it all of the time. -He does it all of the day. +He does it all of the day. He does it a little of the time. He does it a little bit of the time. That happens none of the time. @@ -2146,20 +2280,20 @@ He is a part of the problem. Part of the problem is his attitude. -It happens every other time. -It happens every time. -It happens each time. -It happens each time I hit the brake. +It happens every other time. +It happens every time. +It happens each time. +It happens each time I hit the brake. It happens not every time I hit the brake. It happens every place I go. -Let's do it some other time. -Let's do it some other way. -Let's do it some other place. -That's what happens most other times. +Let's do it some other time. +Let's do it some other way. +Let's do it some other place. +That's what happens most other times. I see that happen most places I go. % Filler-it and the SF link: -It is reasoned that the vast concentration of the business information market in EMEA resides within Europe. +It is reasoned that the vast concentration of the business information market in EMEA resides within Europe. It is whispered that the vast concentration of the business information market in EMEA resides within Europe. it is reasoned that this statement is true. it is hoped that this statement is true. @@ -2168,9 +2302,8 @@ % buggy coppula and copula contractions % % Consider "Here's the ball." We have two choices: SFst+ as a filler-it, -% or the more questionable [Wq- & PF+]. Note that (Wd- & PF+) is barred -% by the post-processing rules. Maybe PF is reasonable.. but SFst seems -% better at the moment. +% or the fronted construction (Wp- & PF+). Note that (Wd- & PF+) is barred +% by the post-processing rules. Both seem about the same. % Here is the ball. Here's the ball. @@ -2178,6 +2311,13 @@ Where's the ball? Here's Johnny! 
+there are the books +Here lie the remains +Here lay the remains +Here lies the book +here rests the remains +there rest the remains of St. Stephen + Nearby is another temple that honors his wife. Close by is another temple. Nearby are more ruins. @@ -2191,7 +2331,6 @@ Here goes nothing. But for the grace of God, there go I. - % copulas with verb target: All he ever does is bark. All he ever does is whine. @@ -2225,7 +2364,9 @@ He's out for lunch. He's out on the town. -% Problems with "as" .... +He looked at the girl who'd been knitting + +% Problems with "as" .... as.e as adverb It is no longer portrayed as having the power to protect against sin. You are portrayed as having the power to make decisions. You are portrayed as wanting the power to make decisions. @@ -2241,6 +2382,7 @@ She sang well, as hoped for. She sang well, as was hoped for. +She sang genuinely well, as hoped for. She danced as promised. She danced, as promised, before leaving. As promised, she left @@ -2263,14 +2405,86 @@ The accused, as shall be proven, is innocent as shall be proven, the accused is innocent as it shall be proven, the accused is innocent + +the accused, as I suspected, is innocent +he, as I suspected, already left + +% present-tense +the accused, as I suspect, is innocent +the accused, as Joe surmises, is innocent +the accused, as Joe foresaw, was innocent +the accused, as Joe asserts, is innocent +the accused, as Joe opines, is innocent + +the accused, as you know, is innocent +the accused, as I will show, is innocent +the accused, as I shall show, is innocent + +% with adverbs +the accused, as was widely known, cheated on his wife +The accused, as I always suspected, is innocent +The accused, as I always suspected, was found innocent +The accused, as I will now show, is innocent + +% with objects +the movie, as he filmed it, is bad +that cut, as he cut it, is bad +that tool, as he uses it, is inappropriate + as had been agreed, the work began on Monday as had been agreed in 
advance, the low blow came in the third round +his statements, as candidate, contradict his current actions + +He is innocent, as I avow +The accused is innocent, as it is told +The accused is innocent, as it is said +The accused is innocent, as it is appears +The accused is innocent, as it is written +The accused is innocent, as it is claimed +The accused is innocent, as is well known +The accused is innocent, as is well-known +The accused is innocent, as you know +the accused is innocent, as I said +the accused is innocent, as I have said +The accused is innocent, as you claim +The accused is innocent, as you have claimed + + +He has the impunity to do as he pleases +He did as he pleased. +He does as he pleases. +He did as it pleased him. +He did just as he pleased. +He does as he wants. +He does as he wishes. +He does as he desires +he gets as he deserves + +he got what he had coming + +We set out to do it, as we vowed. +It began to rain, just as he had prayed for. +It began to rain, as he had wished for +She smiled at him, as he had wished +He finished at last, just as he had sworn +He finished at last + +He delivered, as it was arranged for +He delivered, as it had been arranged for +He delivered, as it was arranged +He delivered, as it had been arranged +She squirmed, as she was stared at +She kissed him, as he had long longed for + + Do as I do. He said it in a voice so loud as to make everyone stare. As it was commanded, so it shall be done. +the girl, as directed, turned her head + the note, as attached, is wrong the offer, as extended, is still valid the movie, as filmed, is too long @@ -2282,19 +2496,37 @@ any girl, as heckled as she was, would have fled the ground, as liquified as it was, suddenly surged downhill. +the finding, questionable though it may be, deserves attention +the claim, unclear as worded, deserves attention +it is unclear as worded + +Questionable as it may be, we will proceed. +Questionable though it may be, we will proceed. 
+ % Also, lots and lots ot present-participles... the man, as acquiescing as he was, was ill-suited to guard the hen-house +the man, as acquiescing as he was, set a bad precedent the cookie, as big as life itself, fixed his attention +% Mg links with "as" +It described the treaty as marking a new stage -% idiom "as is" -We bought the table as is. +% "as" connecting to question words +decisions such as when to go are taken by the instructor. +decisions such as going are taken by the instructor. % idiom "as how" He allowed as how it was none of my business. I don't know as how I ought to interfere. +% idiom "as yet" +As yet, no one has thought of a solution. +So far, no one has thought of a solution. + +% idiom "as is" +We bought the table as is. + % Problems with "go": Let's go shop. @@ -2355,7 +2587,7 @@ Below the ground lay the treasure. In Xanadu did Kubla Kahn a stately pleasure dome decree. -% Confusion between K- and MVa- links -- +% Confusion between K- and MVa- links -- % "He turned left." uses the MVa- link. % "He turned around", "he jumped around", "he jumped up" use "K" % as a particle link. @@ -2389,7 +2621,7 @@ *He looked fearfully, about. % Set phrases involving "so" -- this is an exhaustive list ! -% There are no other verbs that take "so" -- should there be +% There are no other verbs that take "so" -- should there be % a special link for this ??? I assume so I believe so @@ -2439,8 +2671,8 @@ % tell taking prep Today, we will tell about ancient structures. Yesterday, we told about ancient structures. -They have told of the soldiers' fear -They have talked of the soldiers' fear +They have told of the soldiers' fear +They have talked of the soldiers' fear % More particles Your grammar should be worked on. @@ -2670,10 +2902,10 @@ Where did they go to? What has become of them now? How much was the leakage from the drain in the night? -Who are those dudes in the back seat of Calvin's car? -What did they do when they got off the car? 
-Did they go get a sandwich? -Did they eat in the dark? +Who are those dudes in the back seat of Calvin's car? +What did they do when they got off the car? +Did they go get a sandwich? +Did they eat in the dark? What did they do with the waxed-paper bundles? % Sentences ending in "to", null infinitive @@ -2706,7 +2938,7 @@ % Hmm. Things following a comma, but in a different register ... % I think the clause needs to be its own domain ?? -% Missing/implied subject +% Missing/implied subject He stepped forward, gun raised. He stepped forward, hand out. He stepped forward, his hand out. @@ -2733,6 +2965,21 @@ The stock, to be purchased on the open market, will be added to assets. The parts, being obtained as we speak, will arrive this afternoon. + +% Pronouns with the MX link. +The man, Joseph, rang the bell +He, Joseph, rang the bell +I, Joseph, rang the bell +they, the twins, did it. +He, the shop owner, rang the bell +he, the shop owner, ... +Herbert, the shop owner, rang the bell +We, the undersigned, declare this statement to be true + +% post-nominal MX to verbs +The problem, put simply, is money. +The problem, stated simply, is money. + % zero-copula (missing "to be") post-nominal MX broken-ness: The event, being historically insignificant, will soon be forgotten The event, historically insignificant, will soon be forgotten @@ -2751,9 +2998,12 @@ Shades of Cyc, where apples were fruit, but oranges weren't, Kennedy was a president, but Nixon a statesman. % A phantom subject. -Shades of Cyc! +Shades of Cyc! Where apples were fruit, but oranges weren't, Kennedy was a president, but Nixon a statesman. +% zero-copula comparative-conjunctions +that is very nice, just not what I want +that is very nice, but not what I want % Impersonal, passive "be" linking to passive participle. It is normal that such directives be worked upon. @@ -2855,6 +3105,10 @@ % care of It is addressed c/o the Smith family. 
+% numbers as adjectives +the five seat washed out in the final sprint +the sixteen runner burnt to a crisp + % Measurement units There is a nineteen minute gap in the tapes. There is a 12 year gap in the records. @@ -2900,9 +3154,27 @@ That will cost 2400 yen. We bought 200ft. of lumber. We bought the last 200ft. of lumber. +We bought 200 square ft. of plywood. +We bought 200 square ft of plywood. +We bought 200 linear ft of lumber. +We bought 200 sq. ft. of plywood. +We bought 200 sq ft of plywood. We bought 200sq.ft. of plywood. +We bought 200 cu. yds. of concrete. +five hundred cubic yards of concrete were used for this project +500 cubic yards of concrete were used for this project +500 cu. yds. of concrete were used for this project + +The requirement, 5 to 7 sq ft, is correct +The requirement calls for 5 cu in to 7 cu in +The requirement calls for 5 cu. in. to 7 cu. in. +The requirement calls for 5 to 7 cu in + +It has a 200 cu. in. displacement. +It has a 200 cubic inch displacement. Apply 12 ft.lbs. of torque. It is 12ft. long. +This battery is rated at 5 mAh The voltage fell almost 50mV. That will cost hundreds of lira. That will cost zillions of lira. @@ -2937,6 +3209,24 @@ I'd like 2 lbs. of coffee I'd like 2lbs. of coffee +The proof value is twice the v/v % value. +It is convenient to express its concentration in v/v percent +It is convenient to express its concentration in v/v % +We measure the concentration in v/v % +Wine has a v/v % alcohol content of 12 to 14 +wine contains 13.1% (v/v) alcohol +For example, wine is about 12% v/v ethanol. +For example, wine is about 12% vol/vol ethanol. +For example, wine is about 12 percent ethanol. +For example, wine is about 12% ethanol. +An example would be a 5%(w/v) NaCl solution. +An example would be a 5% weight per volume NaCl solution. +An example would be a 5% NaCl solution. +It contains 12 parts alcohol. +it contains about 12 liters of ethanol. 
+ +We spend four dollars per student per year + % Lack of space between numbers and letters causes problems It was 10:30AM in the morning. It was 10AM in the morning. @@ -2973,11 +3263,11 @@ % XXX FIXME: % Note that the handling of prepositions is incorrect with regard to -% measurements: the linkage should be to lumber, not feet. That is, -% "12 feet of" is a modifier. Similarly, "hundreds of" should be a +% measurements: the linkage should be to lumber, not feet. That is, +% "12 feet of" is a modifier. Similarly, "hundreds of" should be a % modifier to "hours". But this can also be ambiguous: consider: -% "The rate is 10 nm per one msec of time" -- here "of time" is -% indeed the modifier. Hmmm ... +% "The rate is 10 nm per one msec of time" -- here "of time" is +% indeed the modifier. Hmmm ... % % Note also: parse confusion between "feet.n" and "feet.i" % and so "feet.n" should be removed/trimmed back from dict. @@ -3002,13 +3292,15 @@ It was about 3 miles out. It is a few miles over. +we walked for a further three kilometers +we walked for a further three hours +The judge jailed him for a further 3 and a half years + % Numeric comparatives I ran 10 fewer miles than Ben. I ran 10 more miles than Ben. - % XXX These parse, but there's an explosion of parses generated. -% Basically, miles.n is fucking things up -- we need to trim it down. It is between 5 miles and 15 miles long. It is between 5 miles and 15 miles distant. @@ -3033,7 +3325,7 @@ John, aged 40, is a douchebag. -% Can have numerical modifier *and* a determiner simultaneously: +% Can have numerical modifier *and* a determiner simultaneously: The sigma factor sigma 35 is homologous. % Nested modifiers too: @@ -3053,6 +3345,8 @@ Go to spot X on the map. Go to place X on the map. +suppose that X or Y is responsible. + % Numerical ranges It takes 2 to 3 times the effort. It takes 2 - 3 times the effort. @@ -3196,14 +3490,14 @@ next is this item. last is this item. 
-% Other misc ordinal usage, including dates. 
+% Other misc ordinal usage, including dates.
Today is the third of December.
Today is the 3rd of December.
Today is the 31st of October.
Today is the thirty-first of October.
He is the third to speak up.
Of them all, he ranked third.
-Of them all, he ranked 3rd. 
+Of them all, he ranked 3rd.
He left on the 20th.
He left on the twentieth.
We are leaving on December 2nd.
@@ -3301,6 +3595,34 @@
% punctuation: sometimes commas are used as if they were semicolons:
Sweat stood on his brow, fury was bright in his one good eye.

+% Crazy UTF8 dashes
+we are dealing with androcentric—that is to say, male-dominated—concepts
+we are dealing with androcentric--that is to say, male-dominated--concepts
+
+% post-nominal parenthetical clarifiers ... (scilicet)
+it [sc. gouache] was also popular in France
+it -- that is, gouache -- was also popular in France
+it (gouache) was also popular in France
+it -- fried rice -- is very popular
+it (namely, rice) is yummy
+it (viz. rice) is yummy
+it (in other words, rice) is yummy
+it (to wit, rice) is yummy
+it (i.e. rice) is yummy
+
+When I was going home [scilicet on the evening of day 18] I saw a dog.
+I was going home [scilicet on the evening of day 18] when I saw a dog.
+I was walking [scilicet on the evening of day 18] when I saw a dog.
+I was walking away [scilicet on the evening of day 18] when I saw a dog.
+
+% Are commas legal, like the below?
+% that thing, that was mentioned yesterday, is called off
+% that thing, that we talked about yesterday, is off
+% that equation, that he drew on the board, was wrong
+% Here's how to add the commas, if we wanted to:
+%%%that.j-s:
+%%% ((S**w+ or (R+ & B*w+) & (Xd- & (Xc+ or ) & MX*r-));
+
% Automatic sentence splitting. A period or question mark, mid-text,
% will split the sentence into two. Note capitalization is handled.
who is Obama? where was he born?
@@ -3370,7 +3692,7 @@ The question is how do you convince your customer that you are on the right path? Answer: Very carefully! Answer 2: Politely. -Question 3: Did you double-check your math? +Question 3: Did you double-check your math? Step 3: Tell him off! Question 3: What about tomorrow? What about tomorrow? @@ -3498,7 +3820,7 @@ While we're on the topic, was it good? Kenny sure is a bastard. -% elided "that" (phantom that) +% elided "that" (phantom that, zero-that) Just thought you'd like to know. It was previously thought they were wrong. It's a good bet he did it. @@ -3506,8 +3828,12 @@ It's a good thing he did it. It's a good thing that he did it. +We have to make sure that everyone is here. +We have to make sure everyone is here. +You should make clear that it's possible. +You should make clear it's possible. -% elided "to be" (phantom to be) +% elided "to be" (phantom to be) % Handled by Pa**j, O*n (predicative adjective, object) She will think it an act of kindness. She will think it to be an act of kindness. @@ -3677,6 +4003,8 @@ Praise the Lord, amen! Praise the Lord God, amen! +Rock 'n' roll, dude! + % Given names can be openers to directives or questions. Ma'am, please don't do that. John, please don't do that. @@ -3717,7 +4045,7 @@ % and, or opening coordinating conjunctions. % Currently these take Wc- to the left wall, since they "coordinate" % with previous sentences. This is different than a plain-old CO -% link, since they are not 'plain-old' clause openers ... +% link, since they are not 'plain-old' clause openers ... And you can do it this way. Or you can do it this way. Or, you can do it this way. @@ -3826,8 +4154,46 @@ So! You thought I would fail! So! -% "so" as a conjunction. +% Inverted subjects, with "so". These use the CPa link. 
+so thinks all the world +so thinks everyone +yes, so thinks everyone +everyone thinks so +yes, everyone thinks so +so believes everyone +yes, so believes everyone +so does everyone +yes, so does everyone + +so it is! +yes, so it is! +yes, so it really is! +so it does! +Yes, so it does! +why, yes, so it does! +so it really does! +so it really does appear to be! +so it really does appear to be true! + +so it seems +yes, so it seems +so it appears +yes, so it appears + +so it would seem +so it would appear + +so seems it + +thus it would seem +thus it would appear +thus it appears +thus it begins +and so it begins +and so it would seem +and thus it begins +and there she goes! % Requests, directives, commands, imperatives. Do not be wise in your own estimation. @@ -3933,7 +4299,7 @@ What did you see? A red car. Who was in it? -John. +John. John's evil twin. The evil twin. John and his brother. @@ -4231,7 +4597,7 @@ I don't care if the outcome is good or bad. % conjoined adjectives with modifiers -The river is wide +The river is wide The river is wide here The river is a mile wide The river is a mile wide here @@ -4353,7 +4719,7 @@ % conjoined post-nominal modifiers: MJ*a (in place of Ma and MX*a) Many Democrats unhappy about the economy but doubtful that Clinton can be elected probably won't vote at all -Many Democrats, unhappy about the economy but doubtful that Clinton can be elected, probably won't vote at all +Many Democrats, unhappy about the economy but doubtful that Clinton can be elected, probably won't vote at all Many people, unaware of the deception and unfamiliar with the details, won't care. Many people, unaware or unfamiliar with the details, won't care. Many people, unaware of or unfamiliar with the details, won't care. @@ -4392,7 +4758,7 @@ The Easter eggs are hidden in and around the house. 
% conjoined adverbs: RJ*v link -She handled it quickly and gracefully +She handled it quickly and gracefully She handled it skillfully and with compassion She handled it quickly, quietly and gracefully @@ -4598,7 +4964,7 @@ % But for joining verbs She organized her work but accomplished very little No one but she saw the prowler. -He would have joined the band but he couldn't spare the time +He would have joined the band but he couldn't spare the time The brothers would have resisted but that they lacked courage. It never rains but it pours. There is no doubt but right will prevail. @@ -4683,10 +5049,14 @@ John describes and analyzes literature. Chief describes and analyzes the character of the families in a village, remarking that they all act completely the same. -% conjoined infinitives are still broken... +% conjoined infinitives They advised us to avoid sunlight and to limit the consumption of fish. I am expecting this sentence to parse correctly, and to yield a very wide diagram. +I aim to do something and also to help. +I aim to do something and to help. +I aim to help and also to do something. + % conjoined verbs with modifiers, negation He cannot change the beliefs of popular society as a whole. He can not change the beliefs of popular society as a whole. @@ -4728,6 +5098,9 @@ Show me my notes from today, but not from yesterday. Show me my notes from today but not from yesterday. +% Conditional sentences +If this is true, then do that. + % qualified conditional clauses Do it only if you are sure. Only if they are home, should you do it. 
@@ -4756,9 +5129,9 @@ % neither-nor constructions with verbs: we ate popcorn or watched movies we either ate popcorn or watched movies -we neither ate popcorn nor watched movies -*we either ate popcorn nor watched movies -*we both ate popcorn nor watched movies +we neither ate popcorn nor watched movies +*we either ate popcorn nor watched movies +*we both ate popcorn nor watched movies *we ate popcorn nor watched movies we both ate popcorn and watched movies we not only ate popcorn but watched movies @@ -4766,15 +5139,29 @@ We neither ate nor drank for three days we neither saw nor heard what happened -% lots of conjunctions, commas, lists mashed up ... +she didn't do that, nor did John +I don't want that, nor do you + +% lots of conjunctions, commas, lists mashed up ... The problem is, or rather one of the problems, for there are many, a sizeable proportion of which are continually clogging up the civil, commercial, and criminal courts in all areas of the Galaxy, and especially, where possible, the more corrupt ones, this. The problem is, or rather one of the problems, this. The problem, or rather one of the problems, is this. % idiomatic "or rather" The problem, or rather, one of the problems, is this. +The problem, or rather, one of the problems, ... The problems are clogging up the courts, and, where possible, the corrupt ones The problems are clogging up the courts, and, wherever possible, the corrupt ones +The problem, or more exactly, one of the problems, is this. +The problem, or more precisely, one of the problems, is this. + +The EU Services Directive: law or simply policy? +Would these critics have accepted this or, instead, would they have condemned the US? +Would these critics have accepted this or would they have condemned the US? +Would they have accepted this or, instead, would they have ...? + +It is thus imperative that assistance be provided through a variety of channels, in addition to, or instead of, the IMF, including ... 
+ % lists: % List of gerunds: He has three jobs: counting, measuring and listing. @@ -4870,13 +5257,38 @@ *Susan likes to work the late shift, and Sam the early shift. % Compound queries -% Typically, the two sub-phrases parse correctly, but, +% Typically, the two sub-phrases parse correctly, but, % put together, they do not. What is a motor, and how does it work? What is a motor; how does it work? Who invented sliced bread, and when did they do it? Who invented sliced bread; when did they do it? +% conjoined negatives in decision-questions +Who, if not I, should defend you against such slanders ? +Who, if not Micheal, will provide for your care? +Who shall accomplish this, if not you? +What, if not this, is worth it? +when, if not now, do you want to do it? +when, if not tomorrow, do you want to do it? +when, if not next week, do you want to do it? +when do you want to do it, if not next week? +when, if not next Tuesday, do you want to do it? +if not next Tuesday, when do you want to do it? +if not next Tuesday, then when do you want to do it? + +% conjoined negatives in statements +why, and not whether, is the question +why, and not how, is the question +when, but not how, will be decided. +how, but not when, was decided. +how, yet not when, was decided. +how, and apparently when, was decided. +the time, but not the cause, was established +the time, and not the cause, was established +the time, yet not the cause, was established +They play billiards, yet not snooker + % Coordinating conjunctions These are the snippers and those are the cutters. Grant me this wish and I will die happy. @@ -4885,6 +5297,24 @@ It never rains but it pours. They had no sooner arrived but they turned around and left. 
+I eat then sleep +I eat, then sleep +I eat then I sleep +I eat, so I sleep + +% conjunctions that are comparative +he is fast, just not a good swimmer +he is fast, but not a good swimmer +he is fast but not a good swimmer +he is fast but just not a good swimmer +just not a good swimmer, he fell behind + +% Misc conjunctions +I like to get my knitting done whilst watching the telly. + +% conjunction used to provide a definition. +Logorrhea, or excessive and often incoherent talkativeness or wordiness, is a social disease. + % XXX Misc conjunction constructions that are still broken in the current parser: He is flighty and given to fanciful thoughts He is thoughtful, and not given to many words @@ -4905,21 +5335,26 @@ John; my advisor, Steve; and several other people are coming No one but she saw the prowler. -% conjunction used to provide a definition. -Logorrhea, or excessive and often incoherent talkativeness or wordiness, is a social disease. - % The two below are valid sentences, but they parse incorrectly just right now % so we will put a * in front of them, for now, till the parse is fixed. *Mike finished in first place, and John in third. *Mike finished in first place, and John in last +% More crazy parses due to bad conjunction handling +Whenever he comes, she goes, and v.v. +The market fell 156.83, or 8%, a week after Black Monday. + +% Hmm conjunctions .. these are tricky to get right ... +Mike finished in first place, and John in third. +Mike finished in first place, and John in last. + % -------------------------------------------------------------------- % non-planar graph: "saw" and "yesterday" should link, but don't. % "dog" and "which" do link (and that is correct). *John saw a dog yesterday which was a Yorkshire terrier. % incorrect parses -- INCORRECT PARSES -- bad disjunct usage -% These sentences typically parse just fine, but generate +% These sentences typically parse just fine, but generate % incorrect linkages. 
% Sentence used to parse incorrectly -- "of" linked to "bill", instead @@ -4938,14 +5373,7 @@ *Bill went over the river and right through the woods. *Bill went over the river and straight through the woods. -% More crazy parses due to bad conjunction handling -Whenever he comes, she goes, and v.v. -The market fell 156.83, or 8%, a week after Black Monday. - -% Hmm conjunctions .. these are tricky to get right ... -Mike finished in first place, and John in third. -Mike finished in first place, and John in last. - +% ------------------------------------------------------------- % Rules should select "shouting.g" gerund, not "shouting.v" (participle) % Now fixed. The teacher's shouting startled the student. @@ -4971,6 +5399,37 @@ % Other capitalization issues: We are from the planet Gorpon. +% clause opener comparatives +whilst a good worker, he's not a very good manager +although a good worker, he's not a very good manager +though a good worker, he's not a very good manager +hardly a good worker, he's a poor choice +hardly even a good worker, he's a poor choice +not even a good worker, he's a poor choice +never a good worker, he sank to new levels of laziness +ever the good worker, he strove to promote efficiency + +never a good runner, he fell behind +not a good runner, he fell behind +not very smart, he fell behind + +% Misc comparatives +She was a girl just about John's age +She was a girl of about John's age +She was a girl nearly John's age +She was a girl almost John's age +She was a girl of nearly John's age +She was a girl of almost John's age + +She was a girl almost John's age, or older. +She was John's age, or older +She was John's height, or taller +She was a girl of about Breeze's own age, or perhaps older. +She was a girl of about John's own age, or perhaps older. +She was a girl of almost John's own age, or perhaps older. +She was a girl of almost John's age, or maybe older. 
+She was John's age or maybe older + % Broken comparatives: He is nothing less than inspired! He is less than bright! @@ -5026,6 +5485,11 @@ they took there ball back than we left until than, I wasn't sure. +I eat than I sleep. +I eat then I sleep. + +% i before e unless the weighty neighbor is a German loan word. +I beleive you are right % "or" should be "nor" for this to be grammatically correct. neither this or that one will do @@ -5033,6 +5497,7 @@ % -------------------------------------------------------------------- % sentences that parse, but not correctly: % We are going to put *'s in front of them, until they are fixed... +% Proper fixes for some of these require links that cross... % viz they should parse, *You are as sweet as sugar. *The situation is not so bad as you suggest. @@ -5045,6 +5510,10 @@ I slipped on the ice as I ran home. +% This should parse with the TQ/BT link (see OT page) +% but that requires links crossing to head-word, so its screwed. +*How many years did it last + % -------------------------------------------------------------------- % assorted unclassified breakages, waiting for a fix: Although society has excluded the patients in the ward for their unique qualities, they feel 'safer' trying to fit in because they receive approval from nurses and the representatives of society. @@ -5052,7 +5521,7 @@ 92 Chapter 10............................................................................................................................................. % -------------------------------------------------------------------- -% Sentence containing UTF8 chars that can lead to i/o or +% Sentence containing UTF8 chars that can lead to i/o or % word-boundary problems *Bitter gourdはにがうりだそうであってるのかはちょっと不安ですが翻訳ソフトでやくしてみました。 *Синтезируется из глюкозы в тканях и органах, например в сердце, печени, почках. @@ -5096,6 +5565,9 @@ % on commit "Fix "Assertion (word_xnode != NULL) failed" in SAT solver". 
This is a TEST +% The example for defect 28 from issue #50. +The remarks--made off the cuff--were wrong. + % The very long sentences that take forever to parse are now in the % file 4.0.fix-long.batch % -------------------------------------------------------------------- diff -Nru link-grammar-5.3.16/data/en/corpus-fix-long.batch link-grammar-5.5.0/data/en/corpus-fix-long.batch --- link-grammar-5.3.16/data/en/corpus-fix-long.batch 2016-09-14 15:43:08.000000000 +0000 +++ link-grammar-5.5.0/data/en/corpus-fix-long.batch 2018-02-23 00:06:56.000000000 +0000 @@ -37,7 +37,7 @@ % a cascade of and-list overflow errors: I have not seen the grysbok, or the suni, or the dibitag, or the lechwi, or the aoul, or the gerenuk, or the blaauwbok, or the chevrotain, or lots of others, but who in the world could guess what they were or what they looked like, judging only from the names? -% Overflow + no complete linakges. +% Overflow + no complete linkages. In 1608 he wrote a treatise of the Errors, False Gods, and Other Superstitions of the Indians of the Provinces of Huarochiri, Mama, and Chaclla, of which unfortunately only the first six chapters are known to exist and have been translated into English. The strongest rain ever recorded in India shut down the financial hub of Mumbai, snapped communication lines, closed airports and forced thousands of people to sleep in their offices or walk home during the night, officials said today. @@ -45,6 +45,9 @@ % No complete linkages found. New York Post: The new Mel Brooks/Susan Stroman musical extravaganza ... is nearly very good indeed - but it is not the The Producers ... this story ... does not lend itself to stage adaptation in the way of the earlier movie ... Now for the good news ... Brooks and Stroman pull out every stop. +% (0 of 100 random linkages had no P.P. 
violations) +Agreeing that the constitutional guarantee of due process applies to proceedings in which juveniles are charged as delinquents, the court held that the Arizona Juvenile Code impliedly includes the requirements of due process in delinquency proceedings, and that such due process requirements were not offended by the procedure leading to Gerald's commitment. + % Combinatorial explosion However, the few tracts, the poetry, and the novels that embodied the social vision of Young England were directed to a New Generation of educated, religious, and socially conscious conservatives, who, like Young Englanders, were appalled at the despiritualizing effects of industrialization and the perceived amorality of Benthamite philosophy, which they blamed equally for Victorian social injustices. diff -Nru link-grammar-5.3.16/data/en/Makefile.in link-grammar-5.5.0/data/en/Makefile.in --- link-grammar-5.3.16/data/en/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/data/en/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -184,7 +184,7 @@ ETAGS = etags CTAGS = ctags DIST_SUBDIRS = $(SUBDIRS) -am__DIST_COMMON = $(srcdir)/Makefile.in README +am__DIST_COMMON = $(srcdir)/Makefile.in README missing DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) am__relativize = \ dir0=`pwd`; \ @@ -217,6 +217,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -245,6 +246,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -253,13 +255,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = 
@LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -326,12 +331,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/en/missing link-grammar-5.5.0/data/en/missing --- link-grammar-5.3.16/data/en/missing 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/data/en/missing 2018-02-22 05:17:37.000000000 +0000 @@ -0,0 +1,8 @@ +elipsis +et al. +intra +nominals +Re: +rock 'n' roll (idiom) +sc. +v/v https://forum.wordreference.com/threads/v-v-laboratory-abbreviation.2194381/ diff -Nru link-grammar-5.3.16/data/en/README link-grammar-5.5.0/data/en/README --- link-grammar-5.3.16/data/en/README 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/en/README 2017-08-13 22:11:23.000000000 +0000 @@ -12,7 +12,7 @@ since many of the semi-colons appear in lines that merely define new classes. -A better count of the contents of 4.0.dict yeilds 1430 distinct clusters. +A better count of the contents of 4.0.dict yields 1430 distinct clusters. 
There seem to be 86863 word forms in the dicts diff -Nru link-grammar-5.3.16/data/en/words/entities.given-female.sing link-grammar-5.5.0/data/en/words/entities.given-female.sing --- link-grammar-5.3.16/data/en/words/entities.given-female.sing 2016-09-20 04:16:12.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/entities.given-female.sing 2018-01-25 03:29:55.000000000 +0000 @@ -142,7 +142,6 @@ Amberly.f Amee.f Amelia.f -America.f Ami.f Amie.f Amiee.f diff -Nru link-grammar-5.3.16/data/en/words/entities.national.sing link-grammar-5.5.0/data/en/words/entities.national.sing --- link-grammar-5.3.16/data/en/words/entities.national.sing 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/entities.national.sing 2018-01-25 03:32:41.000000000 +0000 @@ -1,6 +1,7 @@ Afghanistan.l AF.l Albania.l AL.l Algeria.l AG.l +America.l Andorra.l AN.l Angola.l AO.l Anguilla.l AV.l @@ -28,6 +29,7 @@ Botswana.l BC.l Bouvet Bouvet_Island BV.l Brazil.l BR.l +Britain.l British_Indian_Ocean_Territory IO.l British_Virgin_Islands Brunei.l BX.l @@ -67,6 +69,7 @@ Ecuador.l EC.l Egypt.l EG.l El_Salvador ES.l +England.l Equatorial_Guinea EK.l Eritrea.l ER.l Estonia.l EN.l @@ -237,7 +240,7 @@ Undersea_Features UF.l Emirates United_Arab_Emirates AE.l Kingdom United_Kingdom UK.l -United_States_of_America United_States USA.l US.l America.l +United_States_of_America United_States USA.l US.l Uruguay.l UY.l Uzbekistan.l UZ.l Vanuatu.l diff -Nru link-grammar-5.3.16/data/en/words/Makefile.am link-grammar-5.5.0/data/en/words/Makefile.am --- link-grammar-5.3.16/data/en/words/Makefile.am 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/Makefile.am 2018-03-10 05:17:57.000000000 +0000 @@ -1,4 +1,4 @@ -WORDS= \ +WORDS= \ currency \ currency.p \ entities.given-bisex.sing \ @@ -19,22 +19,26 @@ units.4.dot \ units.5 \ units.6 \ - words.adj.1-vowel \ + units.a \ words.adj.1-const \ + words.adj.1-vowel \ words.adj.2 \ words.adj.3 \ words.adv.1 \ words-medical.adv.1 \ 
words.adv.2 \ - words.adv.3 \ + words.adv.3-const \ + words.adv.3-vowel \ words.adv.4 \ - words.n.1-vowel \ words.n.1-const \ + words.n.1-vowel \ words.n.1.gerund \ words.n.2.s \ words.n.2.x \ - words.n.3-vowel \ words.n.3-const \ + words.n.3-vowel \ + words.lang.3-const \ + words.lang.3-vowel \ words.n.3.y \ words.n.t \ words-medical.prep.1 \ diff -Nru link-grammar-5.3.16/data/en/words/Makefile.in link-grammar-5.5.0/data/en/words/Makefile.in --- link-grammar-5.3.16/data/en/words/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -353,22 +358,26 @@ units.4.dot \ units.5 \ units.6 \ - words.adj.1-vowel \ + units.a \ words.adj.1-const \ + words.adj.1-vowel \ words.adj.2 \ words.adj.3 \ 
words.adv.1 \ words-medical.adv.1 \ words.adv.2 \ - words.adv.3 \ + words.adv.3-const \ + words.adv.3-vowel \ words.adv.4 \ - words.n.1-vowel \ words.n.1-const \ + words.n.1-vowel \ words.n.1.gerund \ words.n.2.s \ words.n.2.x \ - words.n.3-vowel \ words.n.3-const \ + words.n.3-vowel \ + words.lang.3-const \ + words.lang.3-vowel \ words.n.3.y \ words.n.t \ words-medical.prep.1 \ diff -Nru link-grammar-5.3.16/data/en/words/units.1 link-grammar-5.5.0/data/en/words/units.1 --- link-grammar-5.3.16/data/en/words/units.1 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/units.1 2017-11-09 05:42:55.000000000 +0000 @@ -17,6 +17,7 @@ kilobase_pairs base-pairs.u base_pairs kb.u kilobases.u bp.u kbp.u Mb.u Ampere.u Amperes.u amp.u amps.u ampere.u amperes.u pA.u nA.u μA.u ma.u mA.u A.u kA.u KA.u +mAh.u mAH.u Bel.u Bels.u bel.u bels.u dB.u decibel.u decibels.u dBm.u calorie.u calories.u cal.u kcal.u kcal/mol.u Dalton.u Daltons.u dalton.o daltons.u Dal.u kDal.u kDa.u kD.u KD.u D.u diff -Nru link-grammar-5.3.16/data/en/words/units.1.dot link-grammar-5.5.0/data/en/words/units.1.dot --- link-grammar-5.3.16/data/en/words/units.1.dot 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/units.1.dot 2017-11-09 05:36:24.000000000 +0000 @@ -6,5 +6,5 @@ tbsp..u tsp..u pt..u qt..u gal..u bbl..u sq.u sq..u -sq.ft..u -ft.lbs..u +cu.u +cu..u diff -Nru link-grammar-5.3.16/data/en/words/units.3 link-grammar-5.5.0/data/en/words/units.3 --- link-grammar-5.3.16/data/en/words/units.3 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/units.3 2017-11-09 04:54:55.000000000 +0000 @@ -1,2 +1 @@ °C.u °F.u degrees_Fahrenheit degrees_Centigrade degrees_C -mph.i MPH.i mpg.i MPG.i cc.i ml.i diff -Nru link-grammar-5.3.16/data/en/words/units.a link-grammar-5.5.0/data/en/words/units.a --- link-grammar-5.3.16/data/en/words/units.a 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/units.a 2017-12-02 07:30:28.000000000 +0000 
@@ -0,0 +1,6 @@ +sq.a +sq..a +cu.a +cu..a +ft.a +ft..a diff -Nru link-grammar-5.3.16/data/en/words/words.adj.1-const link-grammar-5.5.0/data/en/words/words.adj.1-const --- link-grammar-5.3.16/data/en/words/words.adj.1-const 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.adj.1-const 2018-02-23 00:06:56.000000000 +0000 @@ -116,6 +116,7 @@ biologic.a biological.a bioluminescent.a +biomedical.a biometric.a biotechnological.a biotic.a @@ -186,6 +187,8 @@ bony.a bookended.a bookish.a +Boolean.a +boolean.a boon.a boorish.a borderless.a @@ -246,6 +249,7 @@ budgetary.a bulbous.a bulky.a +bulleted.a bulletproof.a bullish.a bully.a @@ -328,6 +332,7 @@ catatonic.a catching.a catchy.a +categorial.a categorical.a catholic.a Catholic.a @@ -483,6 +488,7 @@ clinical.a cliquish.a clitic.a +clonable.a clonal.a close.a cloudless.a @@ -534,6 +540,8 @@ comatose.a combatant.a combative.a +combinatorial.a +combinatoric.a combustible.a comely.a comfortable.a @@ -578,6 +586,7 @@ computable.a computational.a computerized.a +concatenative.a concave.a concentic.a concentric.a @@ -774,6 +783,7 @@ crusty.a cryogenic.a cryptic.a +cryptographic.a cryptologic.a crystalline.a cubic.a @@ -968,6 +978,7 @@ diacritical.a diagnostic.a diagonal.a +diagramatic.a dialectical.a dialogical.a dialup.a @@ -1638,7 +1649,9 @@ gyroscopic.a habitable.a habitual.a +hackish.a hackneyed.a +hacky.a haggard.a haired.a hairless.a @@ -2369,6 +2382,7 @@ misbegotten.a miscellaneous.a mischievous.a +misdiagnostic.a miserable.a miserly.a misguided.a @@ -2643,6 +2657,7 @@ nondistinctive.a nonfat.a nonflammable.a +nonfunctional.a nonhomologous.a nonlinear.a nonprofit.a @@ -2856,6 +2871,7 @@ permeable.a permissible.a permissive.a +permittable.a pernicious.a pernickety.a perpendicular.a @@ -2918,6 +2934,7 @@ picturesque.a piebald.a piecemeal.a +piecewise.a pied.a piercing.a piezoelectric.a @@ -3288,11 +3305,8 @@ quixotic.a quizzical.a quotable.a +quotational.a quotidian.a -r/c.a -R/C.a -rc.a -RC.a 
rabbinic.a Rabbinic.a rabbinical.a @@ -3344,6 +3358,10 @@ raw.a rawhide.a razorbacked.a +r/c.a +rc.a +R/C.a +RC.a readable.a ready-made.a real.a @@ -3437,6 +3455,7 @@ reparable.a reparative.a repayable.a +repeatable.a repellent.a repentant.a repetitious.a @@ -3469,6 +3488,7 @@ resinous.a resistant.a resistive.a +resizable.a resolute.a resolvable.a resolved.a @@ -3630,12 +3650,15 @@ satiric.a satirical.a satisfactory.a +satisfiable.a satisfied.a satisfying.a saucy.a +Saudi_Arabian savage.a savoury.a savvy.a +scalable.a scalar.a scaly.a scandalous.a @@ -4079,10 +4102,12 @@ strapping.a strategic.a strategical.a +streamline.a strenuous.a streptococcal.a stressful.a stretchered.a +stretchy.a strident.a stringent.a stringy.a @@ -4415,6 +4440,7 @@ top.a topical.a topless.a +top-level.a topmost.a topnotch.a topographic.a diff -Nru link-grammar-5.3.16/data/en/words/words.adj.1-vowel link-grammar-5.5.0/data/en/words/words.adj.1-vowel --- link-grammar-5.3.16/data/en/words/words.adj.1-vowel 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.adj.1-vowel 2018-02-23 00:06:56.000000000 +0000 @@ -128,6 +128,7 @@ algebraic.a algorithmic.a alien.a +alienating.a alimentary.a aliphatic.a alkaline.a @@ -382,6 +383,7 @@ autocratic.a autofocus.a autologous.a +automagical.a automatic.a automotive.a autonomic.a @@ -504,7 +506,7 @@ empty.a encouraging.a encyclopaedic.a -encyclopædic.n +encyclopædic.a encyclopedic.a endemic.a endless.a @@ -568,6 +570,7 @@ equine.a equitable.a equivalent.a +equivariant.a equivocal.a erect.a ergodic.a @@ -690,7 +693,6 @@ exuberant.a exultant.a eyecatching.a -i/o.a I/O.a io.a IO.a input/output.a iambic.a iatrogenic.a icebound.a @@ -699,8 +701,10 @@ icy.a ideal.a idealistic.a +idempotent.a identical.a identifiable.a +ideographic.a ideological.a idiomatic.a idiopathic.a @@ -711,6 +715,7 @@ igneous.a ignoble.a ignominious.a +ignorable.a ignorant.a ill.a illegible.a @@ -787,6 +792,9 @@ impudent.a impulsive.a impure.a +in_personam +in_re 
+in_rem inaccessible.a inaccurate.a inactive.a @@ -1057,6 +1065,7 @@ interfaith.a intergalactic.a interior.a +interjective.a interleague.a interlibrary.a intermediate.a @@ -1125,6 +1134,7 @@ inviting.a involuntary.a invulnerable.a +i/o.a I/O.a io.a IO.a input/output.a ionic.a ionospheric.a irascible.a @@ -1475,6 +1485,7 @@ uncommunicative.a uncompleted.a uncompromising.a +uncomputed.a unconcealed.a unconcerned.a unconditional.a @@ -1493,12 +1504,15 @@ uncontrolled.a uncontroversial.a unconventional.a +unconvertable.a unconverted.a +unconvertible.a unconvincing.a uncooked.a uncoordinated.a uncorrected.a uncorroborated.a +uncorrrupted.a uncounted.a uncountered.a uncouth.a @@ -1866,6 +1880,7 @@ unrepentant.a unreported.a unrepresentative.a +unreproducible.a unrequited.a unreserved.a unresolved.a @@ -1883,6 +1898,7 @@ unsalted.a unsanctioned.a unsatisfactory.a +unsatisfiable.a unsatisfied.a unsatisfying.a unsaturated.a @@ -1909,6 +1925,7 @@ unsightly.a unsigned.a unsinkable.a +unsized.a unskilled.a unsmiling.a unsociable.a @@ -1921,6 +1938,7 @@ unspeakable.a unspecified.a unsplit.a +unsplittable.a unspoiled.a unspoilt.a unspoken.a @@ -1930,6 +1948,7 @@ unspotted.a unstable.a unstaffed.a +unstamped.a unstandardized.a unstated.a unstatesmanlike.a @@ -1940,6 +1959,7 @@ unstrung.a unstuck.a unstudied.a +unsubscripted.a unsubstantiated.a unsubtle.a unsuccessful.a @@ -1961,6 +1981,7 @@ untainted.a untamed.a untapped.a +untarnished.a untenable.a untended.a untenured.a @@ -1975,6 +1996,7 @@ untouchable.a untouched.a untoward.a +untracked.a untraditional.a untrained.a untranslatable.a @@ -1995,6 +2017,7 @@ unvarying.a unverified.a unversed.a +unversioned.a unvoiced.a unwalled.a unwanted.a @@ -2013,6 +2036,7 @@ unworn.a unworried.a unwounded.a +unwriteable.a unwritten.a unyielding.a up.a diff -Nru link-grammar-5.3.16/data/en/words/words.adv.1 link-grammar-5.5.0/data/en/words/words.adv.1 --- link-grammar-5.3.16/data/en/words/words.adv.1 2016-09-13 18:44:33.000000000 +0000 +++ 
link-grammar-5.5.0/data/en/words/words.adv.1 2018-02-22 06:44:54.000000000 +0000 @@ -83,6 +83,7 @@ authentically authoritatively autocratically +automagically automatically avariciously avidly @@ -1146,7 +1147,7 @@ randomly rapidly rapturously -rarely.a +rarely rashly rationally raucously diff -Nru link-grammar-5.3.16/data/en/words/words.adv.3 link-grammar-5.5.0/data/en/words/words.adv.3 --- link-grammar-5.3.16/data/en/words/words.adv.3 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.adv.3 1970-01-01 00:00:00.000000000 +0000 @@ -1,157 +0,0 @@ -academically.e -actuarially.e -administratively.e -aesthetically.e -aetiopathogenetically.e -agriculturally.e -agricuturally.e -algebraically.e -allegorically.e -anatomically.e -archeologically.e -architecturally.e -arithmetically.e -artistically.e -assumedly.e -astronomically.e -athletically.e -atypically.e -behaviorally.e -biblically.e -biochemically.e -biologically.e -biotically.e -bipedally.e -carnally.e -chemically.e -clandestinely.e -climatically.e -cognitively.e -collegiately.e -colonially.e -computationally.e -conceptually.e -contractually.e -cryogenically.e -cryptographically.e -cytochemically.e -ecclesiastically.e -ecologically.e -economically.e -educationally.e -electorally.e -empirically.e -environmentally.e -equidistantly.e -esthetically.e -ethically.e -ethnically.e -ethnoculturally.e -factually.e -federally.e -financially.e -finitely.e -genealogically.e -generically.e -genetically.e -geographically.e -geologically.e -geometrically.e -governmentally.e -grammatically.e -gynaecologically.e -harmonically.e -heretofore.e -histochemically.e -historically.e -hydraulically.e -immunophenotypically.e -infinitesimally.e -institutionally.e -journalistically.e -judicially.e -lastingly.e -legendarily.e -linguistically.e -logically.e -logistically.e -maddeningly.e -materially.e -mathematically.e -medically.e -medicinally.e -metaphysically.e -meteorologically.e -methodologically.e -morally.e 
-morbidly.e -mystically.e -nonspecifically.e -nutritionally.e -opportunistically.e -optically.e -organizationally.e -overridingly.e -pathogenetically.e -perceptually.e -perpendicularly.e -pharmacologically.e -phenomenologically.e -philosophically.e -phonetically.e -phonologically.e -photographically.e -pictorially.e -pinnately.e -politically.e -pragmatically.e -priestly.e -princely.e -probabilistically.e -prognostically.e -pseudomorphically.e -pseudonymously.e -psionically.e -psychically.e -psychologically.e -publically.e -putatively.e -quadratically.e -questionably.e -racially.e -recreationally.e -recursively.e -revolutionally.e -rhythmically.e -ritually.e -scientifically.e -semantically.e -sexually.e -socially.e -societally.e -sociologically.e -sonically.e -spatially.e -spherically.e -spirally.e -statistically.e -statutorily.e -steganographically.e -stereotypically.e -structurally.e -stylistically.e -supernaturally.e -syllabically.e -synonymously.e -synoptically.e -syntactically.e -tangentially.e -taxonomically.e -technologically.e -telepathically.e -terrestrially.e -theologically.e -theoretically.e -therapeutically.e -topographically.e -wirelessly.e diff -Nru link-grammar-5.3.16/data/en/words/words.adv.3-const link-grammar-5.5.0/data/en/words/words.adv.3-const --- link-grammar-5.3.16/data/en/words/words.adv.3-const 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.adv.3-const 2018-02-14 21:15:19.000000000 +0000 @@ -0,0 +1,120 @@ +behaviorally.e +biblically.e +biochemically.e +biologically.e +biotically.e +bipedally.e +carnally.e +chemically.e +clandestinely.e +climatically.e +cognitively.e +collegiately.e +colonially.e +computationally.e +conceptually.e +contractually.e +cryogenically.e +cryptographically.e +cytochemically.e +factually.e +federally.e +financially.e +finitely.e +genealogically.e +generically.e +genetically.e +geographically.e +geologically.e +geometrically.e +governmentally.e +grammatically.e +gynaecologically.e 
+harmonically.e +heretofore.e +histochemically.e +historically.e +hydraulically.e +journalistically.e +judicially.e +lastingly.e +legendarily.e +linguistically.e +logically.e +logistically.e +maddeningly.e +materially.e +mathematically.e +medically.e +medicinally.e +metaphysically.e +meteorologically.e +methodologically.e +morally.e +morbidly.e +mystically.e +nonspecifically.e +nutritionally.e +pathogenetically.e +perceptually.e +perpendicularly.e +pharmacologically.e +phenomenologically.e +philosophically.e +phonetically.e +phonologically.e +photographically.e +pictorially.e +pinnately.e +politically.e +pragmatically.e +priestly.e +princely.e +probabilistically.e +prognostically.e +pseudomorphically.e +pseudonymously.e +psionically.e +psychically.e +psychologically.e +publically.e +putatively.e +quadratically.e +questionably.e +racially.e +recreationally.e +recursively.e +revolutionally.e +rhythmically.e +ritually.e +scientifically.e +semantically.e +sexually.e +socially.e +societally.e +sociologically.e +sonically.e +spatially.e +spherically.e +spirally.e +statistically.e +statutorily.e +steganographically.e +stereotypically.e +structurally.e +stylistically.e +supernaturally.e +syllabically.e +synonymously.e +synoptically.e +syntactically.e +tangentially.e +taxonomically.e +technologically.e +telepathically.e +terrestrially.e +theologically.e +theoretically.e +therapeutically.e +topographically.e +wirelessly.e diff -Nru link-grammar-5.3.16/data/en/words/words.adv.3-vowel link-grammar-5.5.0/data/en/words/words.adv.3-vowel --- link-grammar-5.3.16/data/en/words/words.adv.3-vowel 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.adv.3-vowel 2018-02-14 21:15:19.000000000 +0000 @@ -0,0 +1,37 @@ +academically.e +actuarially.e +administratively.e +aesthetically.e +aetiopathogenetically.e +agriculturally.e +agricuturally.e +algebraically.e +allegorically.e +anatomically.e +archeologically.e +architecturally.e +arithmetically.e +artistically.e 
+assumedly.e +astronomically.e +athletically.e +atypically.e +ecclesiastically.e +ecologically.e +economically.e +educationally.e +electorally.e +empirically.e +environmentally.e +equidistantly.e +esthetically.e +ethically.e +ethnically.e +ethnoculturally.e +immunophenotypically.e +infinitesimally.e +institutionally.e +opportunistically.e +optically.e +organizationally.e +overridingly.e diff -Nru link-grammar-5.3.16/data/en/words/words.lang.3-const link-grammar-5.5.0/data/en/words/words.lang.3-const --- link-grammar-5.3.16/data/en/words/words.lang.3-const 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.lang.3-const 2018-03-10 05:17:57.000000000 +0000 @@ -0,0 +1,445 @@ +Bable +Balinese +Baltic +Baluchi +Bambara +Bamileke +Banda +Bantu +Basa +Bashkir +Basque +Batak +Bedawiyet +Beja +Belarusian +Bemba +Bengali +Berber +Bhojpuri +Bhutani +Bihari +Bikol +Bilin +Bini +Bislama +Blin +Bliss +Blissymbolics +Blissymbols +Bokmål +Bosnian +Braj +Breton +British +Buginese +Bulgarian +Buriat +Burmese +Byelorussian +Caddo +Cambodian +Cantonese +Castilian +Catalan +Caucasian +Cebuano +Celtic +Central_Khmer +Chagatai +Chamic +Chamorro +Chechen +Cherokee +Chewa +Cheyenne +Chibcha +Chichewa +Chinese +Chinook +Chipewyan +Choctaw +Chuang +Church_Slavic +Church_Slavonic +Chuukese +Chuvash +Coptic +Cornish +Corsican +Cree +Creek +Creole +Crimean_Tatar +Crimean_Turkish +Croatian +Cushitic +Czech +Dakota +Danish +Dargwa +Delaware +Dene_Suline +Dhivehi +Dimili +Dimli +Dinka +Divehi +Dogri +Dogrib +Dravidian +Duala +Dutch +Dyula +Dzongkha +Faeroese +Fang +Fanti +Faroese +Fiji +Fijian +Filipino +Finnish +Finno-Ugrian +Flemish +Fon +French +Frisian +Friulian +Fulah +Ga +Gaelic +Galibi_Carib +Galician +Ganda +Gayo +Gbaya +Geez +Georgian +German +Germanic +Gikuyu +Gilbertese +Gondi +Gorontalo +Gothic +Grebo +Greek +Greenlandic +Guarani +Gujarati +Gwich'in +Haida +Haitian +Haitian_Creole +Hausa +Hawaiian +Hebrew +Herero +Hiligaynon +Himachali +Hindi +Hiri_Motu 
+Hittite +Hmong +Hungarian +Hupa +Japanese +Javanese +Jingpho +Judeo-Arabic +Judeo-Persian +Kabardian +Kabyle +Kachin +Kalaallisut +Kalmyk +Kamba +Kannada +Kanuri +Kapampangan +Karachay-Balkar +Kara-Kalpak +Karelian +Karen +Kashmiri +Kashubian +Kawi +Kazakh +Khasi +Khoisan +Khotanese +Kikuyu +Kimbundu +Kinyarwanda +Kirdki +Kirghiz +Kirmanjki +Kirundi +Klingon +Komi +Kongo +Konkani +Korean +Kosraean +Kpelle +Kru +Kuanyama +Kumyk +Kurdish +Kurukh +Kutenai +Kwanyama +Kyrgyz +Ladino +Lahnda +Lamba +Land_Dayak +Lao +Laothian +Latin +Latvian +Leonese +Lettish +Letzeburgesch +Lezghian +Limburgan +Limburger +Limburgish +Lingala +Lithuanian +Lojban +Lower_Sorbian +Lozi +Luba-Katanga +Luba-Lulua +Luiseno +Lule_Sami +Lunda +Luo +Lushai +Luxembourgish +Macedonian +Macedo-Romanian +Madurese +Magahi +Maithili +Makasar +Malagasy +Malay +Malayalam +Maldivian +Maltese +Manchu +Mandar +Mandingo +Manipuri +Manobo +Manx +Maori +Mapuche +Mapudungun +Marathi +Mari +Marshallese +Marwari +Masai +Mayan +Mende +Micmac +Mi'kmaq +Minangkabau +Mirandese +Mohawk +Moksha +Moldavian +Moldovan +Mong +Mongo +Mongolian +Mon-Khmer +Montenegrin +Mossi +Multiple +Munda +Nahuatl +Nauru +Navaho +Navajo +Ndebele +Ndonga +Neapolitan +Nepal_Bhasa +Nepali +Newari +Nias +Niger-Kordofanian +Nilo-Saharan +Niuean +N'Ko +Nogai +Norse +Norwegian +Nubian +Nuosu +Nyamwezi +Nyanja +Nyankole +Nyasa +Nynorsk +Nyoro +Nzima +Pahlavi +Palauan +Pali +Pampanga +Pangasinan +Panjabi +Papiamento +Papuan +Pashto +Pedi +Persian +Philippine +Phoenician +Pilipino +Pohnpeian +Polish +Portuguese +Prakrit +Provençal +Punjabi +Pushto +Quechua +Rajasthani +Rapanui +Rarotongan +Rhaeto-Romance +Romance +Romanian +Romansh +Romany +Rundi +Russian +Sakan +Salishan +Samaritan_Aramaic +Sami +Samoan +Sandawe +Sango +Sangro +Sanskrit +Santali +Sardinian +Sasak +Saxon +Scots +Scottish_Gaelic +Selkup +Semitic +Sepedi +Serbian +Serbo-Croatian +Serer +Sesotho +Setswana +Shan +Shona +Sichuan_Yi +Sicilian +Sidamo +Siksika +Sindhi +Singhalese +Sinhala 
+Sinhalese +Sino-Tibetan +Siouan +Siswati +Skolt_Sami +Slave +Slavic +Slovak +Slovenian +Sogdian +Somali +Songhai +Soninke +Sorbian +Sotho +Spanish +Sranan_Tongo +Sudanese +Sukuma +Sumerian +Sundanese +Susu +Swahili +Swati +Swedish +Swiss_German +Syriac +Tagalog +Tahitian +Tai +Tajik +Tamashek +Tamazight +Tamil +Tatar +Tegulu +Telugu +Tereno +Tetum +Thai +Tibetan +Tigre +Tigrinya +Timne +Tiv +Tlingit +Tokelau +Tok_Pisin +Tonga +Tsimshian +Tsonga +Tswana +Tumbuka +Tupi +Turkish +Turkmen +Tuvalu +Tuvinian +Twi +Vai +Valencian +Venda +Vietnamese +Volapuk +Volapük +Votic +Wakashan +Walamo +Walloon +Waray +Washo +Welch +Welsh +Western_Pahari +Wolof +Xhosa +Yakut +Yao +Yapese +Yiddish +Yoruba +Yupik +Zande +Zapotec +Zaza +Zazaki +Zenaga +Zhuang +Zulu +Zuni diff -Nru link-grammar-5.3.16/data/en/words/words.lang.3-vowel link-grammar-5.5.0/data/en/words/words.lang.3-vowel --- link-grammar-5.3.16/data/en/words/words.lang.3-vowel 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.lang.3-vowel 2018-03-10 05:17:57.000000000 +0000 @@ -0,0 +1,109 @@ +Abkhazian +Achinese +Acoli +Adangme +Adygei +Adyghe +Afan +Afar +Afghani +Afrihili +Afrikaans +Afro-Asiatic +Ainu +Akan +Akkadian +Albanian +Alberian +Alemannic +Aleut +Algerian +Algonquian +Alsatian +Altai +Altaic +Amharic +Andorran +Angentinian +Angika +Apache +Arabic +Aragonese +Aramaic +Arapaho +Arawak +Armenian +Aromanian +Arumanian +Aserbaijani +Assamese +Asturian +Asturleonese +Athapascan +Australian +Austronesian +Avaric +Avestan +Awadhi +Aymara +Azerbaijani +Azeri +Edo +Efik +Egyptian +Ekajuk +Elamite +English +Eritrean +Erzya +Esperanto +Estonian +Ewe +Ewondo +Iban +Icelandic +Icelandish +Ido +Igbo +Ijo +Iloko +Inari_Sami +Indian +Indic +Indo-European +Indonesian +Ingush +Interlingua +Interlingue +Inuktitut +Inupiak +Inupiaq +Iranian +Irish +Iroquoian +Italian +Occidental +Occitan +Oirat +Ojibwa +Old_Bulgarian +Old_Church_Slavonic +Old_English +Old_Slavonic +Omani +Oriya +Oromo +Osage +Ossetian 
+Ossetic +Otomian +Udmurt +Ugaritic +Uighur +Uigur +Ukrainian +Umbundu +Upper_Sorbian +Urdu +Uyghur +Uzbek diff -Nru link-grammar-5.3.16/data/en/words/words-medical.adv.1 link-grammar-5.5.0/data/en/words/words-medical.adv.1 --- link-grammar-5.3.16/data/en/words/words-medical.adv.1 2014-11-15 22:28:38.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words-medical.adv.1 2018-02-23 00:06:56.000000000 +0000 @@ -268,7 +268,6 @@ diopsimetrically dioptometrically diplopiometrically -directionality directionally directoscopically direfully @@ -375,6 +374,7 @@ epistatically equipotently equivalently +equivariantly equivocally ergometrically ergonomically @@ -545,6 +545,7 @@ immunologically immunomorphologically immunospecifically +impliedly inaptly inaudibly incrementally diff -Nru link-grammar-5.3.16/data/en/words/words-medical.v.4.1 link-grammar-5.5.0/data/en/words/words-medical.v.4.1 --- link-grammar-5.3.16/data/en/words/words-medical.v.4.1 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words-medical.v.4.1 2018-02-22 01:41:26.000000000 +0000 @@ -539,7 +539,6 @@ cross-clamp.v crossclamp.v cross_dress -cross-link.v cross_match cross-match.v cross-protect.v @@ -1074,7 +1073,7 @@ freeze-dry.v freeze-fracture.v freight.v -freind.v +friend.v fringe.v frolick.v fruit.v diff -Nru link-grammar-5.3.16/data/en/words/words-medical.v.4.2 link-grammar-5.5.0/data/en/words/words-medical.v.4.2 --- link-grammar-5.3.16/data/en/words/words-medical.v.4.2 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words-medical.v.4.2 2018-02-22 01:41:30.000000000 +0000 @@ -548,7 +548,6 @@ crocks.v cross-clamps.v crossclamps.v -cross-links.v cross_matches cross-matches.v crossmatches.v @@ -1085,7 +1084,7 @@ freeze-dries.v freeze-fractures.v freights.v -freinds.v +friends.v fringes.v frolicks.v fruits.v diff -Nru link-grammar-5.3.16/data/en/words/words-medical.v.4.3 link-grammar-5.5.0/data/en/words/words-medical.v.4.3 --- 
link-grammar-5.3.16/data/en/words/words-medical.v.4.3 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words-medical.v.4.3 2018-03-11 23:13:19.000000000 +0000 @@ -265,7 +265,6 @@ bulled.v-d bunkered.v-d burked.v-d -burnt.v-d burred.v-d bussed.v-d cached.v-d @@ -570,7 +569,6 @@ crocked.v-d cross-clamped.v-d crossclamped.v-d -cross-linked.v-d cross_matched cross-matched.v-d cross-protected.v-d @@ -1119,7 +1117,7 @@ freeze-dried.v-d freeze-fractured.v-d freighted.v-d -freinded.v-d +friended.v-d fringed.v-d frivolled.v-d frolicked.v-d @@ -2920,6 +2918,7 @@ unmaned.v-d unravelled.v-d unroofed.v-d +unsatisfied.v-d unsayed.v-d unseated.v-d unsexed.v-d diff -Nru link-grammar-5.3.16/data/en/words/words-medical.v.4.4 link-grammar-5.5.0/data/en/words/words-medical.v.4.4 --- link-grammar-5.3.16/data/en/words/words-medical.v.4.4 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words-medical.v.4.4 2018-02-22 01:41:41.000000000 +0000 @@ -570,7 +570,6 @@ crooking.v cross-clamping.v crossclamping.v -cross-linking.v cross-matching.v cross-protecting.v crossprotecting.v @@ -1133,7 +1132,7 @@ freeze-drying.v freeze-fracturing.v freighting.v -freinding.v +friending.v fringing.v frivolling.v frolicking.v diff -Nru link-grammar-5.3.16/data/en/words/words-medical.v.4.5 link-grammar-5.5.0/data/en/words/words-medical.v.4.5 --- link-grammar-5.3.16/data/en/words/words-medical.v.4.5 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words-medical.v.4.5 2018-02-22 01:41:46.000000000 +0000 @@ -572,7 +572,6 @@ crooking.g cross-clamping.g crossclamping.g -cross-linking.g cross-matching.g crossmatching.g cross-protecting.g @@ -1139,7 +1138,7 @@ freeze-drying.g freeze-fracturing.g freighting.g -freinding.g +friending.g fringing.g frivolling.g frolicking.g diff -Nru link-grammar-5.3.16/data/en/words/words.n.1-const link-grammar-5.5.0/data/en/words/words.n.1-const --- link-grammar-5.3.16/data/en/words/words.n.1-const 2016-09-13 
18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.n.1-const 2018-04-26 22:49:48.000000000 +0000 @@ -72,6 +72,7 @@ ball.s balustrade.n bambino.n +BA.n banality.s banana.s bandage.n @@ -325,6 +326,7 @@ bike.n bikini.s bilabial.n +bilateralism.n bilingual.n billboard.n billet.n @@ -867,6 +869,7 @@ calliope.s call.n call-out.n +callout.n calorie.n calumny.s calypso.n @@ -875,6 +878,7 @@ cameo.n camera.n Cameroonian.s +camisole.n campaigner.n camper.n camp.n @@ -1289,6 +1293,7 @@ circumference.n circumflex.n circumlocution.s +circumposition.n circumstance.n circus.n cissy.n @@ -1429,6 +1434,7 @@ cocoon.n coda.n codename.n +codepoint.n code.s codeshare.n codfish.s @@ -1519,6 +1525,7 @@ commissionaire.n commissioner.n commission.s +commit.n committee.n commode.n commodity.n @@ -1535,11 +1542,13 @@ Communist.n community.n commutation.s +commutator.n commuter.n compact.n companion.n company.s comparative.n +comparator.n comparison.s compartment.n compass.n @@ -1705,6 +1714,7 @@ contaminant.n contemporary.n contender.n +content.n contestant.n contest.n context.s @@ -1818,6 +1828,7 @@ corpse.n corps.s corpuscle.n +corpus.n corral.n correction.s corrective.n @@ -2161,6 +2172,7 @@ dart.n dashboard.n dash.s +dataset.n dateline.n date.n daughter.n @@ -2318,6 +2330,7 @@ deputy.n derailment.s derby.n +dereference.n derivation.s derivative.n dermatologist.n @@ -2328,6 +2341,7 @@ descender.n descent.s description.n +descriptor.n deserter.n desertification.n desertion.s @@ -2347,6 +2361,7 @@ destination.n destiny.s destroyer.n +destructor.n detachment.s detail.s detainee.n @@ -2825,6 +2840,7 @@ fakir.n falcon.n fall.s +fallthrough.n falsehood.s falsetto.s falsification.s @@ -2913,6 +2929,7 @@ ferment.n fern.n ferret.n +ferrite.n ferryboat.s ferry.n fertilizer.s @@ -2954,6 +2971,8 @@ filament.n filbert.s file.n +filename.n +filesystem.n filibuster.n Filipino.s fillet.n @@ -2996,12 +3015,13 @@ firelight.n fireman.n fireplace.n -fire.s +fire.n fireside.n firework.n 
firmament.n firm.n -fir.s +fir.n +first.n firstborn.n fishbone.n fishcake.n @@ -3288,8 +3308,8 @@ fuhrer.n fulcrum.n fullback.n -full-bodied.a -fullbodied.a +full-bodied.n +fullbodied.n fuller.n fumble.n fume.n @@ -3421,6 +3441,7 @@ gelding.n gell.n gel.s +gemination.n gem.n gemshorn.s gendarme.n @@ -3512,6 +3533,7 @@ glint.n glissade.n glitch.n +global.n globe.n globetrotter.n globule.n @@ -3527,6 +3549,7 @@ gluten.n glut.n glutton.n +glyph.n gnat.n gnome.n gnu.n @@ -3563,6 +3586,7 @@ goodnight.n goody.n goof.n +google.n goolash.s goon.n gooseberry.n @@ -3573,6 +3597,7 @@ gosling.n gospel.s gossip.s +gouache.n goulash.s gourd.n gourmand.n @@ -4004,6 +4029,7 @@ hippo.n hippopotamus.n hiss.n +histogramme.n histogram.n historian.n history.s @@ -4234,6 +4260,7 @@ jaw.n jay.n jaywalker.n +JD.n jeep.n jeer.n jellyfish.s @@ -4577,6 +4604,7 @@ lever.n levitation.s levy.n +lexeme.n lexicographer.n lexicon.n liability.s @@ -4617,6 +4645,7 @@ life.s lifestyle.s lifetime.n +lift.n liftoff.n ligament.n ligature.n @@ -4683,6 +4712,7 @@ livery.n lizard.n llama.n +LLD.n loafer.n loaf.n loan.s @@ -4693,6 +4723,7 @@ lobsterman.n lobster.s locale.n +localhost.n locality.s localization.s local.n @@ -4731,6 +4762,8 @@ look-alike.n look.n lookout.n +look-up.n +lookup.n loom.n loo.n loon.n @@ -4756,6 +4789,7 @@ lover.n love.s lowbrow.n +lowercase.n lowlander.n low.n loyalist.n @@ -4838,6 +4872,7 @@ mainstream.n maisonnette.n major.n +makefile.n maker.n makeshift.n make-up.s @@ -5004,6 +5039,7 @@ maypole.n maze.n mazurka.n +MBA.n mbira.s meadow.s mead.s @@ -5109,6 +5145,7 @@ Mexican.s mezzanine.n mezzo-soprano.s +MFA.n miaou.n miaow.n miasma.n @@ -5220,6 +5257,7 @@ misdeal.n misdeed.n misdemeanour.n +misdiagnosis.n misdoing.n miser.n misfire.n @@ -5286,6 +5324,7 @@ mole.s mollusc.n mollusk.n +monad.n monarchist.n monarch.n monarchy.s @@ -5417,6 +5456,7 @@ multiple.n multiplexer.n multiplex.s +multiset.n mummy.n municipality.n munition.n @@ -5444,6 +5484,7 @@ mutant.n mutation.s mute.n 
+mutex.n mutilation.s mutineer.n mutiny.s @@ -5595,6 +5636,7 @@ noise.s nomad.n nomenclature.n +nominal.n nomination.n nominee.n nonce.n @@ -6026,6 +6068,7 @@ pharmacy.n pharynx.n phase.n +PhD.n pheasant.s phenomenon.n phenotype.n @@ -6289,6 +6332,7 @@ polyglot.n polygon.n polymerase.n +polymorphism.n polynomial.n polyp.n polysaccharide.n @@ -6356,6 +6400,7 @@ postgraduate.n postmark.n postmaster.n +postmodifier.n postponement.s postprocessor.n post.s @@ -6519,7 +6564,6 @@ proboscis.n procedure.s procession.s -process.n processor.n proconsulate.n proconsul.n @@ -6631,6 +6675,8 @@ psalter.n psaltery.s pseud.n +pseudocount.n +pseudo.n pseudonym.n psi.s psyche.n @@ -6667,6 +6713,7 @@ punchball.n punchbowl.n punch.s +punctuation.n puncture.n pundit.n punishment.s @@ -6958,6 +7005,7 @@ redbreast.n redcap.n redcoat.s +redeclaration.n redeemer.n redevelopment.s redhead.n @@ -7053,6 +7101,7 @@ religion.s relleno.n remainder.n +remainer.n remake.n remarriage.s remedy.n @@ -7063,6 +7112,7 @@ remittance.s remnant.n remonstrance.s +remount.n removal.s remover.n renaissance.n @@ -7290,6 +7340,7 @@ roll.n romance.s Romanian.s +romanization.n romantic.n romp.n rondeau.n @@ -7307,6 +7358,7 @@ rosary.n rosebud.n rose.n +rosette.n roster.n rostrum.n rota.n @@ -7456,6 +7508,7 @@ saucepan.n saucer.n sauce.s +Saudi.s sauna.n sauropod.n sausage.s @@ -7573,6 +7626,7 @@ screwball.n screwdriver.n screw.n +screwup.n scribble.n scribbler.n scribe.n @@ -7818,6 +7872,7 @@ shoetree.n shogun.n shooter.n +shoot.n shopkeeper.n shoplifter.n shop.n @@ -8105,6 +8160,7 @@ socialite.n social.n society.s +sociolect.n sociologist.n socket.n sock.n @@ -8227,6 +8283,7 @@ speedboat.s speedometer.n speed.s +speedup.n speedway.n speller.n spelling.s @@ -8517,6 +8574,7 @@ streak.n streamer.n streamlet.n +streamline.n stream.n streetcar.n street.n @@ -8531,10 +8589,12 @@ strike.n strikeout.n striker.n +stringification.n string.s stripe.n strip.n stripper.n +striver.n stroboscope.n stroke.n stroller.n @@ -8572,6 
+8632,7 @@ subcontinent.n subcontract.n subcontractor.n +subdirectory.n subdivision.s subdomain.n subeditor.n @@ -8590,6 +8651,7 @@ sub.n subordinate.n subpoena.n +subroutine.n subscriber.n subscription.s subscript.n @@ -8609,6 +8671,7 @@ subtitle.n subtlety.s subtraction.s +subtree.n suburb.n subversive.n subway.n @@ -8666,6 +8729,7 @@ super.n supernumerary.n superpower.n +superset.n superstar.n superstition.s superstructure.n @@ -8760,6 +8824,7 @@ symbolism.s symbolization.s symbol.n +symlink.n sympathizer.n symphony.n symposium.n @@ -9033,6 +9098,7 @@ thunderbolt.n thunderclap.s thunderstorm.n +thunk.n thwart.n thyroid.n tiara.n @@ -9057,6 +9123,7 @@ timbre.s timekeeper.n timeline.n +timeout.n timepiece.n timer.n timetable.n @@ -9091,6 +9158,7 @@ toboggan.n toccata.n toddler.n +todo.n toehold.n toe.n toenail.n @@ -9132,6 +9200,7 @@ toot.n topcoat.s topic.n +top-level.n topmast.n top.n topper.n @@ -9438,6 +9507,7 @@ typhoon.n typist.n typographer.n +typo.n tyranny.s tyrant.n tyre.n @@ -9606,6 +9676,7 @@ vocative.n vodka.s vogue.n +voicemail.n voice.n void.n volcano.n @@ -9629,7 +9700,6 @@ vulnerability.s vulture.n vulva.n -w/o.n wafer.n waffle.n wage.n @@ -9865,6 +9935,7 @@ woman.n wombat.n womb.n +w/o.n wonderland.n wonder.s woodcut.n diff -Nru link-grammar-5.3.16/data/en/words/words.n.1-vowel link-grammar-5.5.0/data/en/words/words.n.1-vowel --- link-grammar-5.3.16/data/en/words/words.n.1-vowel 2016-09-20 22:24:00.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.n.1-vowel 2018-03-10 05:17:57.000000000 +0000 @@ -9,6 +9,7 @@ abduction.n aberration.s ablaut.n +AB.n abnormality.s abode.n abolitionist.n @@ -39,6 +40,7 @@ accelerando.n accelerator.n accent.n +accessor.n accessory.n accident.n accolade.n @@ -96,6 +98,8 @@ addict.n addition.s additive.n +add-on.n +addon.n addressee.n address.n adherence.n @@ -121,6 +125,7 @@ adolescent.n adoption.s adornment.s +adposition.n adulteration.s adulterer.n adulteress.n @@ -238,6 +243,7 @@ alliance.s alligator.n 
allocation.s +allocator.n allotment.s allowance.n alloy.s @@ -511,6 +517,7 @@ ascetic.n ascription.n Aserbaijani.s +Azerbaijani.s ash.s ashtray.n Asian.s @@ -721,6 +728,7 @@ elixir.s elk.n ellipse.n +ellipsis.n elm.s el.n elocutionist.n @@ -772,6 +780,7 @@ enchilada_verde enclave.n enclosure.s +encoding.n encomium.n encounter.n encouragement.s @@ -787,6 +796,7 @@ endive.s end.n endorsement.s +endoscope.n endowment.s enema.n enemy.n @@ -903,7 +913,6 @@ estuary.n eta.s eternity.s -ethanol.n Ethiopian.s ethnographer.n ethnologist.n @@ -946,6 +955,7 @@ excess.s exchange.s exchequer.n +exciter.n exclamation.s excommunication.s excrescence.n @@ -1014,6 +1024,7 @@ extract.s extradition.s extra.n +extraposition.n extravagance.s extravaganza.n extreme.n @@ -1351,6 +1362,7 @@ invitation.n invocation.s invoice.n +invoker.n involution.n involvement.s ion.n @@ -1385,13 +1397,15 @@ iteration.n itinerary.n Ivorian.s +MBA.s +MFA.s oaf.n oak.s oar.n oatcake.s oath.n -oats oat.s +oats.n obbligato.n obelisk.n obituary.n @@ -1444,6 +1458,7 @@ official.n offload.n offprint.n +offset.n offshoot.n offspring.s ogre.n @@ -1472,6 +1487,7 @@ onset.n onslaught.n opal.s +opcode.n opener.n opera.s operation.s @@ -1660,6 +1676,7 @@ upheaval.s upholsterer.n upland.n +uppercase.n uppercut.n upper.n upright.n diff -Nru link-grammar-5.3.16/data/en/words/words.n.2.s link-grammar-5.5.0/data/en/words/words.n.2.s --- link-grammar-5.3.16/data/en/words/words.n.2.s 2016-09-20 22:25:58.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.n.2.s 2018-02-23 00:06:56.000000000 +0000 @@ -1,3 +1,4 @@ + abacuses.n abbes.n abbesses.n @@ -23,6 +24,7 @@ abscesses.n absences.n absentees.n +ABs.n absorbents.n abstentions.n abstractions.n @@ -39,6 +41,7 @@ accelerators.n accents.n accessories.n +accessors.n accidents.n accolades.n accommodations.n @@ -92,6 +95,8 @@ addicts.n additions.n additives.n +add-ons.n +addons.n addressees.n addresses.n adherences.n @@ -115,6 +120,7 @@ adolescents.n adoptions.n adornments.n 
+adpositions.n ads.n adulterers.n adulteresses.n @@ -237,6 +243,7 @@ allies.n alligators.n allocations.n +allocators.n allotments.n allowances.n alloys.n @@ -277,6 +284,7 @@ amortizations.n amours.n amperes.n +ampersands.n amphetamines.n amphibians.n amphitheatres.n @@ -642,6 +650,7 @@ backbones.n backdoors.n backdrops.n +backends.n backers.n backgrounds.n backings.n @@ -777,6 +786,7 @@ basins.n basketballs.n baskets.n +BAs.n bas-reliefs.n basses.n bassinets.n @@ -944,11 +954,12 @@ bikes.n bikinis.n bilabials.n +bilateralisms.n bilinguals.n billboards.n billets.n billfolds.n -billies +billies.n billings.n billionaires.n billionths.n @@ -1143,6 +1154,8 @@ books.n bookstalls.n bookworms.n +booleans.n +Booleans.n boomerangs.n boomers.n booms.n @@ -1486,6 +1499,7 @@ callings.n calliopes.n call-outs.n +callouts.n calls.n calories.n calumnies.n @@ -1496,6 +1510,7 @@ cameos.n cameras.n Cameroonians.n +camisoles.n campaigners.n campers.n campsites.n @@ -1540,6 +1555,7 @@ capers.n capes.n capillaries.n +capitalisations.n capitalists.n capitals.n capons.n @@ -1906,6 +1922,7 @@ circumferences.n circumflexes.n circumlocutions.n +circumpositions.n circumstances.n circuses.n cissies.n @@ -2047,6 +2064,7 @@ cocos.n codas.n codenames.n +codepoints.n codeshares.n codes.n codfish.p @@ -2062,6 +2080,7 @@ cognates.n cogs.n cogwheels.n +cohomologies.n cohorts.n coiffeurs.n coiffures.n @@ -2138,6 +2157,7 @@ commissionaires.n commissioners.n commissions.n +commits.n committees.n commodes.n commodities.n @@ -2154,10 +2174,12 @@ Communists.n communities.n commutations.n +commutators.n commuters.n compacts.n companions.n comparatives.n +comparators.n compares.n comparisons.n compartments.n @@ -2395,6 +2417,7 @@ copes.n copies.n coppersmiths.n +coprocessors.n copses.n cops.n copulas.n @@ -2437,6 +2460,7 @@ corpses.n corps.p corpuscles.n +corpuses.n corrals.n corrections.n correctives.n @@ -2768,7 +2792,7 @@ damages.n dames.n damselfishs.n -damselflies +damselflies.n damsels.n dams.n 
dancers.n @@ -2784,6 +2808,7 @@ darts.n dashboards.n dashes.n +datasets.n datelines.n dates.n daughters.n @@ -2938,6 +2963,7 @@ deputies.n derailments.n derbies.n +dereferences.n derivations.n derivatives.n dermatologists.n @@ -2948,6 +2974,7 @@ descenders.n descents.n descriptions.n +descriptors.n deserters.n desertifications.n desertions.n @@ -2967,6 +2994,7 @@ destinations.n destinies.n destroyers.n +destructors.n detachments.n details.n detainees.n @@ -3088,7 +3116,7 @@ disambiguations.n disappearances.n disappointments.n -disassembblies +disassembblies.n disasters.n disbursements.n discards.n @@ -3520,6 +3548,7 @@ emirs.n emissaries.n emissions.n +emoticons.n emotions.n emperors.n empires.n @@ -3541,6 +3570,7 @@ enchiladas_verdes enclaves.n enclosures.n +encodings.n encomiums.n encounters.n encouragements.n @@ -3556,6 +3586,7 @@ endings.n endives.n endorsements.n +endoscopes.n endowments.n ends.n enemas.n @@ -3721,6 +3752,7 @@ exchanges.n exchequers.n excitements.n +exciters.n exclamations.n excommunications.n excrescences.n @@ -3787,6 +3819,7 @@ extinguishers.n extracts.n extraditions.n +extrapositions.n extras.n extravagances.n extravaganzas.n @@ -3839,6 +3872,7 @@ fakes.n fakirs.n falcons.n +fallthroughs.n falsehoods.n falsettos.n falsifications.n @@ -3929,6 +3963,7 @@ ferns.n ferrets.n ferries.n +ferrites.n ferryboats.n fertilizers.n festivals.n @@ -3967,7 +4002,9 @@ Fijians.n filaments.n filberts.n +filenames.n files.n +filesystems.n filibusters.n filings.n Filipinos.n @@ -4428,6 +4465,7 @@ geishas.n geldings.n gels.n +geminations.n gemshorns.n gems.n gendarmeries.n @@ -4488,7 +4526,7 @@ girders.n girdles.n girlfriends.n -girlies +girlies.n girls.n gists.n giveaways.n @@ -4515,6 +4553,7 @@ glints.n glissades.n glitches.n +globals.n globes.n globetrotters.n globules.n @@ -4529,6 +4568,7 @@ glutens.n gluts.n gluttons.n +glyphs.n gnats.n gnomes.n gnus.n @@ -4563,6 +4603,7 @@ goodies.n goods.n goofs.n +googles.n goons.n gooseberries.n gophers.n @@ -4571,6 
+4612,7 @@ goslings.n gospels.n gossips.n +gouaches.n goulashes.n gourds.n gourmands.n @@ -4988,7 +5030,7 @@ hijacks.n hikers.n hikes.n -hillbillies +hillbillies.n hillocks.n hillsides.n hills.n @@ -5005,6 +5047,7 @@ hippos.n hips.n hisses.n +histogrammes.n histograms.n historians.n histories.n @@ -5047,12 +5090,14 @@ homeworlds.n homicides.n homilies.n +homologies.n homologs.n homomorphisms.n homonyms.n homophones.n homosexuals.n homos.n +homotopies.n Hondurans.n hones.n honeybees.n @@ -5110,6 +5155,7 @@ hotheads.n hothouses.n hotplates.n +hotspots.n hounds.n hourglasses.n houseboats.n @@ -5300,6 +5346,7 @@ incivilities.n inclinations.n inclines.n +includes.n incomes.n incongruities.n inconsistencies.n @@ -5377,6 +5424,7 @@ iniquities.n initialisations.n initializations.n +initializers.n initials.n initiates.n initiatives.n @@ -5389,6 +5437,7 @@ inkwells.n inlays.n inlets.n +inlinings.n inmates.n innings.n innkeepers.n @@ -5523,6 +5572,7 @@ invitations.n invocations.n invoices.n +invokers.n involutions.n involvements.n ionospheres.n @@ -5588,6 +5638,7 @@ jaws.n jays.n jaywalkers.n +JDs.n jeeps.n jeers.n jellies.n @@ -5684,7 +5735,7 @@ keepers.n keepsakes.n kegs.n -kellies +kellies.n kennels.n Kenyans.n kerbs.n @@ -5932,6 +5983,7 @@ levers.n levies.n levitations.n +lexemes.n lexicographers.n lexicons.n liabilities.n @@ -5972,6 +6024,7 @@ lifestyles.s lifetimes.n liftoffs.n +lifts.n ligaments.n ligatures.n lighters.n @@ -6041,6 +6094,7 @@ livings.n lizards.n llamas.n +LLDs.n loafers.n loans.n loaves.n @@ -6050,6 +6104,7 @@ lobs.n lobsters.n locales.n +localhosts.n localities.n localizations.n locals.n @@ -6085,8 +6140,11 @@ longboats.n longbows.n longings.n +look-alikes.n lookouts.n looks.n +look-ups.n +lookups.n looms.n loonies.n loons.n @@ -6112,6 +6170,7 @@ lovebirds.n lovers.n lowbrows.n +lowercases.n lowlanders.n lows.n loyalists.n @@ -6193,6 +6252,7 @@ maisonnettes.n majorities.n majors.n +makefiles.n makers.n makeshifts.n make-ups.n @@ -6360,6 +6420,7 @@ 
maypoles.n mazes.n mazurkas.n +MBAs.n mbiras.n meadows.n meads.n @@ -6464,6 +6525,7 @@ Mexicans.n mezzanines.n mezzo-sopranos.n +MFAs.n miaous.n miaows.n miasmas.n @@ -6498,7 +6560,6 @@ mikes.n mileages.n mileometers.n -miles.n milestones.n milieus.n militants.n @@ -6572,11 +6633,13 @@ miscarriages.n miscellanies.n misconceptions.n +misconfigurations.n miscounts.n miscreants.n misdeals.n misdeeds.n misdemeanours.n +misdiagnoses.n misdoings.n misers.n misfires.n @@ -6645,6 +6708,7 @@ moles.n molluscs.n mollusks.n +monads.n monarchies.n monarchists.n monarchs.n @@ -6775,6 +6839,7 @@ multiples.n multiplexers.n multiplexes.s +multisets.n multitudes.n mummies.n municipalities.n @@ -6802,6 +6867,7 @@ mutants.n mutations.n mutes.n +mutexes.n mutilations.n mutineers.n mutinies.n @@ -6950,6 +7016,7 @@ noises.n nomads.n nomenclatures.n +nominals.n nominations.n nominees.n nonces.n @@ -7081,6 +7148,7 @@ officials.n offloads.n offprints.n +offsets.n offshoots.n offspring.p ogres.n @@ -7107,6 +7175,7 @@ onsets.n onslaughts.n opals.n +opcodes.n openers.n openings.n operas.n @@ -7601,6 +7670,7 @@ pharmacologists.n pharynxes.n phases.n +PhDs.n pheasants.n philanderers.n philanthropists.n @@ -7864,6 +7934,7 @@ polyglots.n polygons.n polymerases.n +polymorphisms.n polynomials.n polyps.n polysaccharides.n @@ -7931,7 +8002,9 @@ postgraduates.n postmarks.n postmasters.n +postmodifiers.n postponements.n +postprocessors.n postscripts.n posts.n postulates.n @@ -7964,6 +8037,7 @@ practices.n practitioners.n praesidiums.n +pragmas.n pragmatists.n prairies.n praises.n @@ -8212,7 +8286,9 @@ psalteries.n psalters.n psalterys.n +pseudocounts.n pseudonyms.n +pseudos.n pseuds.n psis.n psyches.n @@ -8247,6 +8323,7 @@ punchballs.n punchbowls.n punches.n +punctuations.n punctures.n pundits.n punishments.n @@ -8543,6 +8620,7 @@ redbreasts.n redcaps.n redcoats.n +redeclarations.n redeemers.n redevelopments.n redheads.n @@ -8637,6 +8715,7 @@ religions.n rellenos.n remainders.n +remainers.n remains.n 
remakes.n remarriages.n @@ -8649,6 +8728,7 @@ remixes.n remnants.n remonstrances.n +remounts.n removals.n removers.n renaissances.n @@ -8879,6 +8959,7 @@ rolls.n romances.n Romanians.n +romanizations.n romantics.n romps.n rondeaus.n @@ -8896,6 +8977,7 @@ rosaries.n rosebuds.n roses.n +rosettes.n rosters.n rostrums.n rotaries.n @@ -9046,6 +9128,7 @@ saucepans.n saucers.n sauces.n +Saudis.n saunas.n sauropods.n sausages.n @@ -9163,6 +9246,7 @@ screwballs.n screwdrivers.n screws.n +screwups.n scribblers.n scribbles.n scribes.n @@ -9310,6 +9394,7 @@ settlements.n settlers.n setups.n +severities.n sewers.n sexagenarians.n sexes.n @@ -9405,6 +9490,7 @@ shoguns.n shooters.n shootings.n +shoots.n shopkeepers.n shoplifters.n shoppers.n @@ -9462,6 +9548,7 @@ sicknesses.n sideboards.n sidecars.n +side-effects.n sidekicks.n sidelights.n sidelines.n @@ -9629,6 +9716,7 @@ smacks.n smallholders.n smallholdings.n +smartphones.n smashes.n smatterings.n smears.n @@ -9690,6 +9778,7 @@ socialites.n socials.n societies.n +sociolects.n sociologists.n sockets.n socks.n @@ -9812,6 +9901,7 @@ speedboats.n speedometers.n speeds.n +speedups.n speedways.n spellers.n spellings.n @@ -10103,6 +10193,7 @@ streaks.n streamers.n streamlets.n +streamlines.n streams.n streetcars.n streets.n @@ -10117,10 +10208,12 @@ strikeouts.n strikers.n strikes.n +stringifications.n strings.n stripes.n strippers.n strips.n +strivers.n stroboscopes.n strokes.n strollers.n @@ -10156,9 +10249,10 @@ subcontinents.n subcontractors.n subcontracts.n +subdirectories.n subdivisions.n subeditors.n -subfamilies +subfamilies.n subgraphs.n subgroups.n subheadings.n @@ -10172,6 +10266,7 @@ submissions.n subordinates.n subpoenas.n +subroutines.n subscribers.n subscriptions.n subscripts.n @@ -10186,13 +10281,16 @@ substitutes.n substitutions.n substrates.n +substrings.n subterfuges.n subtitles.n subtleties.n subtractions.n +subtrees.n suburbs.n subversives.n subways.n +subwords.n successes.n successors.n suckers.n @@ -10237,7 
+10335,7 @@ suntans.n supercomputers.n superconductors.n -superfamilies +superfamilies.n superficialities.n superfluities.n superintendents.n @@ -10246,6 +10344,7 @@ supermarkets.n supernumeraries.n superpowers.n +supersets.n supers.n superstars.n superstitions.n @@ -10341,6 +10440,7 @@ symbolisms.n symbolizations.n symbols.n +symlinks.n sympathies.n sympathizers.n symphonies.n @@ -10620,6 +10720,7 @@ thunderbolts.n thunderclaps.n thunderstorms.n +thunks.n thwarts.n thyroids.n tiaras.n @@ -10643,6 +10744,7 @@ timbres.n timekeepers.n timelines.n +timeouts.n timepieces.n timers.n timetables.n @@ -10676,6 +10778,7 @@ toboggans.n toccatas.n toddlers.n +todos.n toeholds.n toenails.n toes.n @@ -10716,6 +10819,7 @@ toots.n topcoats.n topics.n +top-levels.n topmasts.n topologies.n toppers.n @@ -11029,6 +11133,7 @@ typhoons.n typists.n typographers.n +typos.n tyrannies.n tyrants.n tyres.n @@ -11048,6 +11153,7 @@ unbelievers.n uncertainties.n uncles.n +underbars.n underbellies.n undercarriages.n underclasses.n @@ -11083,6 +11189,7 @@ units.n universes.n universities.n +unlinks.n untouchables.n untruths.n upbraidings.n @@ -11091,6 +11198,7 @@ upheavals.n upholsterers.n uplands.n +uppercases.n uppercuts.n uppers.n uprights.n @@ -11288,6 +11396,7 @@ vocatives.n vodkas.n vogues.n +voicemails.n voices.n voids.n volcanos.n @@ -11479,6 +11588,7 @@ whispers.n whistles.n whites.n +whitespaces.n whiting.p whizzes.n whodunits.n @@ -11495,6 +11605,7 @@ wigs.n wigwams.n wikis.n +wildcards.n wildebeests.n willows.n wills.n @@ -11552,6 +11663,7 @@ woofers.n woofs.n words.n +workarounds.n workbenches.n workbooks.n workdays.n @@ -11584,6 +11696,8 @@ wristbands.n wrists.n wristwatches.n +write-offs.n +writeoffs.n writers.n writings.n writs.n diff -Nru link-grammar-5.3.16/data/en/words/words.n.2.x link-grammar-5.5.0/data/en/words/words.n.2.x --- link-grammar-5.3.16/data/en/words/words.n.2.x 2016-09-20 22:25:40.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.n.2.x 2018-02-22 
06:44:54.000000000 +0000 @@ -1,4 +1,4 @@ -aircraft.p +aircraft.p bacteria.p basemen.n beano.p @@ -12,6 +12,7 @@ chum.p clergymen.p congressmen.p +corpora.p criteria.p data.p deer.p @@ -38,6 +39,7 @@ hyphae.p kin.p larvae.p +latter.p lobstermen.p mailmen.p manga.n @@ -71,7 +73,9 @@ tambon.n teeth.p trivia.n +undersigned.p waterfowl.p women.p +world.p zucchini.p zuchini.p diff -Nru link-grammar-5.3.16/data/en/words/words.n.3-const link-grammar-5.5.0/data/en/words/words.n.3-const --- link-grammar-5.3.16/data/en/words/words.n.3-const 2016-10-16 04:56:40.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.n.3-const 2018-04-26 22:49:48.000000000 +0000 @@ -16,26 +16,24 @@ bagel.n-u baggage.n-u bagpipe.n-u -Bahamian.n-u bail.n-u bait.n-u balderdash.n-u baldness.n-u +ball.n-u ballast.n-u ballet.n-u -ball.n-u ballyhoo.n-u balm.n-u baloney.n-u -balsam.n-u balsa.n-u +balsam.n-u bamboo.n-u banana.n-u banditry.n-u bandoneón.n-u bandura.n-u bane.n-u -Bangladeshi.n-u banishment.n-u banjo.n-u bankholding.n-u @@ -43,18 +41,18 @@ banter.n-u baptism.n-u barbacoa.n-u -Barbadian.n-u barbiturate.n-u barf.n-u baritone.n-u barium.n-u -barleycorn.n-u barley.n-u +barleycorn.n-u barrenness.n-u barstar.n-u barter.n-u baryton.n-u basalt.n-u +base.n-u baseball.n-u basil.n-u basketball.n-u @@ -88,13 +86,9 @@ begging.n-u behalf.n-u behavior.n-u -behaviourism.n-u behaviour.n-u +behaviourism.n-u beige.n-u -Belarusian.n-u -Belgian.n-u -Belizian.n-u -Beninese.n-u benzene.n-u benzine.n-u benzol.n-u @@ -107,7 +101,6 @@ betterment.n-u bewilderment.n-u bezique.n-u -Bhutanian.n-u bias.n-u bibliography.n-u bicarbonate.n-u @@ -119,10 +112,12 @@ bilateralism.n-u bile.n-u bilge.n-u +billiards.n-u billing.n-u binary.n-u bingo.n-u biochemistry.n-u +biography.n-u biology.n-u biotechnology.n-u birch.n-u @@ -149,8 +144,8 @@ blindness.n-u bliss.n-u blitzkrieg.n-u -bloodlust.n-u blood.n-u +bloodlust.n-u bloodshed.n-u bloodthirstiness.n-u bloom.n-u @@ -160,20 +155,19 @@ bluff.n-u bluntness.n-u bluster.n-u 
-boarding.n-u board.n-u +boarding.n-u bodywork.n-u boldness.n-u -Bolivian.n-u bombarde.n-u bombardment.n-u bombast.n-u -bondage.n-u bond.n-u +bondage.n-u bonhomie.n-u +boo.n-u boogie.n-u bookkeeping.n-u -boo.n-u booze.n-u bop.n-u borax.n-u @@ -181,7 +175,6 @@ boredom.n-u boron.n-u borrowing.n-u -Bosnian.n-u botany.n-u bother.n-u botulism.n-u @@ -196,20 +189,19 @@ braille.n-u brain.n-u bramble.n-u -brandy.n-u bran.n-u +brandy.n-u brass.n-u bravado.n-u bravery.n-u brawn.n-u -Brazilian.n-u bread.n-u breadth.n-u breakdown.n-u breakfast.n-u breaststroke.n-u -breathing.n-u breath.n-u +breathing.n-u breed.n-u briar.n-u bribery.n-u @@ -221,18 +213,16 @@ brine.n-u brinkmanship.n-u brisket.n-u -British.n-u brocade.n-u broccoli.n-u bromide.n-u bromine.n-u bronchitis.n-u bronze.n-u -brotherhood.n-u broth.n-u +brotherhood.n-u brownstone.n-u brunch.n-u -Bruneian.n-u brush.n-u brushwork.n-u bubbly.n-u @@ -244,15 +234,14 @@ buffoonery.n-u buggery.n-u bugle.n-u -Bulgarian.n-u -bullion.n-u bull.n-u +bullion.n-u bullroarer.n-u bullshit.n-u bum.n-u +bun.n-u bunk.n-u bunkum.n-u -bun.n-u burden.n-u burglary.n-u burgundy.n-u @@ -260,15 +249,14 @@ burlap.n-u burlesque.n-u burrito.n-u -Burundian.n-u bush.n-u business.n-u bustle.n-u butane.n-u butchery.n-u +butter.n-u butterfat.n-u buttermilk.n-u -butter.n-u butterscotch.n-u cabbage.n-u cable.n-u @@ -289,13 +277,10 @@ callousness.n-u calumny.n-u camaraderie.n-u -Cambodian.n-u -Cameroonian.n-u camomile.n-u camouflage.n-u camping.n-u campus.n-u -Canadian.n-u cancellation.n-u cancer.n-u candlelight.n-u @@ -307,38 +292,37 @@ cannabis.n-u cannibalism.n-u canonization.n-u -cantaloupe.n-u cant.n-u -Cantonese.n-u +cantaloupe.n-u canvas.n-u +capital.n-u capitalisation.n-u capitalism.n-u capitalization.n-u -capital.n-u capitulation.n-u cappuccino.n-u capture.n-u caramel.n-u carbohydrate.n-u -carbonization.n-u carbon.n-u +carbonization.n-u carcinoma.n-u +card.n-u cardamom.n-u cardboard.n-u -card.n-u +care.n-u careerism.n-u carefulness.n-u 
carelessness.n-u -care.n-u cargo.n-u caricature.n-u carillon.n-u carmine.n-u carnage.n-u +carne.n-u carne_asada carne_gisada carne_guisada -carne.n-u carnitas.n-u carpentry.n-u carrion.n-u @@ -348,8 +332,8 @@ cartilage.n-u cartography.n-u casework.n-u -cashmere.n-u cash.n-u +cashmere.n-u castanets.n-u caste.n-u castigation.n-u @@ -400,49 +384,46 @@ championship.n-u change.n-u chaos.n-u -characterization.n-u character.n-u +characterization.n-u charcoal.n-u charge.n-u charisma.n-u charm.n-u chartreuse.n-u chastisement.n-u -château.n-u chattel.n-u chatter.n-u chauvinism.n-u cheapness.n-u cheek.n-u cheerfulness.n-u +cheese.n-u cheesecake.n-u cheesecloth.n-u -cheese.n-u chemistry.n-u chemotherapy.n-u chess.n-u chestnut.n-u +chi.n-u chiaroscuro.n-u +chic.n-u chicanery.n-u -chickenfeed.n-u chicken.n-u +chickenfeed.n-u chickenpox.n-u -chic.n-u chicory.n-u chiffon.n-u childbirth.n-u childhood.n-u -Chilean.n-u chile_con_carne chile_con_queso +chili.n-u chili_con_carne chili_con_queso -chili.n-u china.n-u chinaware.n-u -Chinese.n-u chintz.n-u -chi.n-u chiropody.n-u chitarra.n-u chivalry.n-u @@ -458,11 +439,12 @@ cholesterol.n-u choreography.n-u chorizo.n-u -chowder.n-u chow.n-u +chowder.n-u chrome.n-u chromium.n-u chronology.n-u +château.n-u church.n-u chutney.n-u cider.n-u @@ -474,6 +456,7 @@ circumcision.n-u circumlocution.n-u circumnavigation.n-u +circumposition.n-u circumspection.n-u circumvention.n-u cirrhosis.n-u @@ -487,9 +470,9 @@ claret.n-u clarification.n-u clarinet.n-u +class.n-u classicism.n-u classification.n-u -class.n-u claustrophobia.n-u claves.n-u clavichord.n-u @@ -503,8 +486,8 @@ clockwork.n-u closeness.n-u closure.n-u -clothing.n-u cloth.n-u +clothing.n-u clout.n-u clover.n-u clumsiness.n-u @@ -516,9 +499,9 @@ cockfighting.n-u cocoa.n-u coconut.n-u -codeine.n-u -code.n-u cod.n-u +code.n-u +codeine.n-u coeducation.n-u coercion.n-u coffee.n-u @@ -528,13 +511,14 @@ cognition.n-u cohabitation.n-u cohesion.n-u -coinage.n-u +cohomology.n-u coin.n-u +coinage.n-u 
coitus.n-u coke.n-u cola.n-u -coldness.n-u cold.n-u +coldness.n-u coleslaw.n-u colic.n-u colitis.n-u @@ -548,15 +532,14 @@ collision.n-u colloquy.n-u collusion.n-u -Colombian.n-u colonialism.n-u colonization.n-u -coloratura.n-u color.n-u -colouring.n-u +coloratura.n-u colour.n-u -combat.n-u +colouring.n-u comb.n-u +combat.n-u combustion.n-u comedy.n-u comeliness.n-u @@ -613,10 +596,10 @@ condescension.n-u condiment.n-u conditioner.n-u -conduction.n-u conduct.n-u -confectionery.n-u +conduction.n-u confection.n-u +confectionery.n-u confederation.n-u conferment.n-u confession.n-u @@ -628,8 +611,8 @@ confusion.n-u conga.n-u congestion.n-u -Congolese.n-u congregation.n-u +congress.n-u conjecture.n-u conjugation.n-u conjunction.n-u @@ -660,8 +643,8 @@ contamination.n-u contemplation.n-u contempt.n-u -contentment.n-u content.n-u +contentment.n-u context.n-u continuation.n-u contraband.n-u @@ -686,23 +669,23 @@ cookware.n-u coolant.n-u coolness.n-u -co-operation.n-u cooperation.n-u -co-ordination.n-u +co-operation.n-u coordination.n-u +co-ordination.n-u copper.n-u copra.n-u copulation.n-u copyright.n-u coral.n-u -cordial.n-u cord.n-u +cordial.n-u corduroy.n-u cork.n-u +corn.n-u cornet.n-u cornett.n-u cornflour.n-u -corn.n-u cornstarch.n-u coronation.n-u correction.n-u @@ -710,22 +693,21 @@ corroboration.n-u corrosion.n-u corruption.n-u -Corsican.n-u cortisone.n-u cosmology.n-u cost.n-u costume.n-u cotton.n-u council.n-u -counseling.n-u counsel.n-u +counseling.n-u counteraction.n-u counterespionage.n-u counterpoint.n-u countertenor.n-u courage.n-u -courtesy.n-u court.n-u +courtesy.n-u courtship.n-u coverage.n-u covetousness.n-u @@ -755,8 +737,6 @@ crisis.n-u crispness.n-u criticism.n-u -Croatian.n-u -Croat.n-u crochet.n-u crockery.n-u cromorne.n-u @@ -770,12 +750,11 @@ crumpet.n-u crust.n-u crwth.n-u +crystal.n-u crystallization.n-u crystallography.n-u crystallophone.n-u -crystal.n-u cuatro.n-u -Cuban.n-u cubism.n-u cucumber.n-u cuisine.n-u @@ -784,8 +763,8 @@ cunning.n-u 
cuprophone.n-u curd.n-u -cure-all.n-u cure.n-u +cure-all.n-u curling.n-u curry.n-u curtailment.n-u @@ -796,13 +775,11 @@ cutlass.n-u cutlery.n-u cuttlefish.n-u -cyanide.n-u cyan.n-u +cyanide.n-u cymbal.n-u cymbalum.n-u cynicism.n-u -Cypriote.n-u -Czech.n-u damage.n-u damask.n-u damnation.n-u @@ -810,7 +787,6 @@ damselfish.n-u dancing.n-u dandruff.n-u -Danish.n-u daring.n-u darkness.n-u dash.n-u @@ -855,8 +831,8 @@ defacement.n-u defamation.n-u default.n-u -defeatism.n-u defeat.n-u +defeatism.n-u defecation.n-u defection.n-u defense.n-u @@ -911,8 +887,8 @@ depopulation.n-u deportation.n-u deportment.n-u -deposition.n-u deposit.n-u +deposition.n-u deprecation.n-u depreciation.n-u depredation.n-u @@ -933,11 +909,11 @@ desecration.n-u desegregation.n-u desensitization.n-u -desertion.n-u desert.n-u +desertion.n-u deshabille.n-u -designation.n-u design.n-u +designation.n-u desolation.n-u despair.n-u despatch.n-u @@ -989,12 +965,12 @@ dill.n-u dilution.n-u diminution.n-u -dinner.n-u din.n-u +dinner.n-u dint.n-u dioxide.n-u -diphtheria.n-u dip.n-u +diphtheria.n-u dipsomania.n-u direction.n-u directness.n-u @@ -1101,7 +1077,6 @@ divorce.n-u dizziness.n-u djembe.n-u -Djiboutian.n-u doctrine.n-u documentation.n-u doggerel.n-u @@ -1110,7 +1085,6 @@ dole.n-u domestication.n-u domination.n-u -Dominican.n-u dominion.n-u donation.n-u donut.n-u @@ -1131,8 +1105,8 @@ dread.n-u dressage.n-u dressmaking.n-u -drinking.n-u drink.n-u +drinking.n-u drivel.n-u drizzle.n-u drone.n-u @@ -1162,14 +1136,14 @@ dyslexia.n-u dyspepsia.n-u fable.n-u -fabrication.n-u fabric.n-u +fabrication.n-u faction.n-u faglong.n-u faintness.n-u fairness.n-u -faithfulness.n-u faith.n-u +faithfulness.n-u falafel.n-u falconry.n-u fall.n-u @@ -1190,18 +1164,18 @@ fascism.n-u fashion.n-u fastidiousness.n-u +fat.n-u fatalism.n-u fate.n-u fatherhood.n-u fatigue.n-u -fat.n-u fauna.n-u favor.n-u -favouritism.n-u favour.n-u +favouritism.n-u federalism.n-u -feedback.n-u feed.n-u +feedback.n-u fegereng.n-u fellowship.n-u 
felt.n-u @@ -1219,31 +1193,28 @@ fianchetto.n-u fiat.n-u fiber.n-u +fibre.n-u fibreboard.n-u fibreglass.n-u -fibre.n-u fiction.n-u fiddle.n-u fieldwork.n-u fife.n-u -Fijian.n-u filbert.n-u filigree.n-u -Filipino.n-u film.n-u filth.n-u filtration.n-u financing.n-u finery.n-u finesse.n-u -Finnish.n-u -firebrick.n-u +fir.n-u fire.n-u +firebrick.n-u firewood.n-u firmness.n-u -fir.n-u -fishing.n-u fish.n-u +fishing.n-u fission.n-u fitness.n-u fixation.n-u @@ -1257,8 +1228,8 @@ flatness.n-u flattery.n-u flavor.n-u -flavouring.n-u flavour.n-u +flavouring.n-u flax.n-u fleece.n-u flesh.n-u @@ -1275,11 +1246,11 @@ flotsam.n-u flour.n-u flow.n-u +flu.n-u fluctuation.n-u fluff.n-u flugelhorn.n-u fluid.n-u -flu.n-u fluoridation.n-u fluoride.n-u fluoridization.n-u @@ -1308,10 +1279,10 @@ footwork.n-u forage.n-u force.n-u +fore.n-u foreboding.n-u foreclosure.n-u foreknowledge.n-u -fore.n-u foresight.n-u forest.n-u forestry.n-u @@ -1320,11 +1291,11 @@ forgery.n-u forgetfulness.n-u forgiveness.n-u +form.n-u formaldehyde.n-u formalism.n-u -formation.n-u format.n-u -form.n-u +formation.n-u formulation.n-u fornication.n-u fortepiano.n-u @@ -1343,7 +1314,6 @@ freedom.n-u freestyle.n-u freight.n-u -French.n-u frenzy.n-u fresco.n-u freshness.n-u @@ -1352,37 +1322,35 @@ friendship.n-u fright.n-u frijoles.n-u -frostbite.n-u frost.n-u +frostbite.n-u froth.n-u +fruit.n-u fruitcake.n-u fruition.n-u -fruit.n-u frustration.n-u fuel.n-u fuglung.n-u fulfilment.n-u fullness.n-u fumigation.n-u +fun.n-u functionalism.n-u fundamentalism.n-u fungicide.n-u fungus.n-u -fun.n-u +fur.n-u furlough.n-u furniture.n-u -fur.n-u fury.n-u fusion.n-u fuss.n-u futurism.n-u fuzz.n-u gab.n-u -Gabonian.n-u gadgetry.n-u -gallantry.n-u gall.n-u -Gambian.n-u +gallantry.n-u gambling.n-u gamboge.n-u game.n-u @@ -1391,22 +1359,22 @@ gammon.n-u gangrene.n-u gaol.n-u -garbage.n-u garb.n-u +garbage.n-u gardening.n-u garlic.n-u garnet.n-u -gaslight.n-u gas.n-u +gaslight.n-u gasoline.n-u gaspacho.n-u gastronomy.n-u 
gaudiness.n-u gauze.n-u gear.n-u -gelatine.n-u -gelatin.n-u gel.n-u +gelatin.n-u +gelatine.n-u gemshorn.n-u gender.n-u genealogy.n-u @@ -1420,18 +1388,15 @@ geography.n-u geology.n-u geometry.n-u -Georgian.n-u -German.n-u germination.n-u gerontology.n-u gestation.n-u -Ghanaian.n-u gibberish.n-u gilt.n-u -gingerbread.n-u +gin.n-u ginger.n-u +gingerbread.n-u gingham.n-u -gin.n-u girlhood.n-u girth.n-u glacier.n-u @@ -1439,8 +1404,8 @@ glamorization.n-u glamour.n-u glare.n-u -glasschord.n-u glass.n-u +glasschord.n-u glassware.n-u glaucoma.n-u glaze.n-u @@ -1459,25 +1424,27 @@ goatskin.n-u gobbledygook.n-u godspeed.n-u -goldenrod.n-u gold.n-u +goldenrod.n-u golf.n-u gong.n-u gonorrhea.n-u gonorrhoea.n-u +goo.n-u goodness.n-u goodwill.n-u goolash.n-u -goo.n-u gore.n-u gospel.n-u gossamer.n-u gossip.n-u +gouache.n-u goulash.n-u gout.n-u government.n-u grace.n-u graciousness.n-u +grade.n-u graduation.n-u graft.n-u grain.n-u @@ -1487,8 +1454,8 @@ grapefruit.n-u grapeshot.n-u graphite.n-u -grassland.n-u grass.n-u +grassland.n-u gratefulness.n-u gratification.n-u gratitude.n-u @@ -1498,15 +1465,13 @@ grease.n-u greasepaint.n-u greatness.n-u -greediness.n-u greed.n-u -Greek.n-u +greediness.n-u greenery.n-u -Grenadan.n-u grief.n-u grime.n-u -gristle.n-u grist.n-u +gristle.n-u grit.n-u ground.n-u groundwater.n-u @@ -1514,20 +1479,18 @@ growth.n-u grub.n-u gruel.n-u -guacamole.n-u guac.n-u +guacamole.n-u guano.n-u -Guatemalan.n-u guava.n-u guesswork.n-u guile.n-u guilt.n-u -Guinean.n-u guitar.n-u guitarrón.n-u guitjo.n-u -gumbo.n-u gum.n-u +gumbo.n-u gumption.n-u gumshoe.n-u gunfire.n-u @@ -1538,29 +1501,27 @@ gunrunning.n-u gustation.n-u gusto.n-u -Guyanese.n-u gynaecology.n-u gypsum.n-u gyration.n-u haberdashery.n-u -habitation.n-u -habitat.n-u habit.n-u +habitat.n-u +habitation.n-u habituation.n-u haddock.n-u haemoglobin.n-u haemophilia.n-u haemorrhage.n-u hail.n-u -hairdressing.n-u hair.n-u +hairdressing.n-u hairsplitting.n-u -Haitian.n-u half-time.n-u halibut.n-u 
hallucination.n-u -hamburger.n-u ham.n-u +hamburger.n-u handbells.n-u handicraft.n-u handiwork.n-u @@ -1584,8 +1545,8 @@ harpsichord.n-u harrassment.n-u harshness.n-u -hashish.n-u hash.n-u +hashish.n-u hatching.n-u hate.n-u hatred.n-u @@ -1593,22 +1554,22 @@ havoc.n-u hay.n-u hazard.n-u -hazel.n-u haze.n-u +hazel.n-u headache.n-u headgear.n-u headroom.n-u headway.n-u health.n-u hearsay.n-u +heart.n-u heartache.n-u heartbreak.n-u heartburn.n-u -heart.n-u -heather.n-u +heat.n-u heath.n-u +heather.n-u heating.n-u -heat.n-u heatstroke.n-u heaven.n-u heaviness.n-u @@ -1627,14 +1588,14 @@ hemp.n-u hepatitis.n-u heraldry.n-u -herbicide.n-u herb.n-u +herbicide.n-u heresy.n-u +hero.n-u heroin.n-u heroism.n-u -hero.n-u -herringbone.n-u herring.n-u +herringbone.n-u hesitation.n-u hessian.n-u heterodoxy.n-u @@ -1647,8 +1608,8 @@ hiring.n-u history.n-u hocchiku.n-u -hockey.n-u hock.n-u +hockey.n-u hogwash.n-u hold.n-u holiness.n-u @@ -1660,11 +1621,12 @@ homicide.n-u homily.n-u homoeopathy.n-u +homology.n-u homophobia.n-u -Honduran.n-u +homotopy.n-u +honey.n-u honeycomb.n-u honeydew.n-u -honey.n-u honeysuckle.n-u honor.n-u honour.n-u @@ -1681,8 +1643,8 @@ horseplay.n-u horsepower.n-u horseracing.n-u -horse-radish.n-u horseradish.n-u +horse-radish.n-u horticulture.n-u hose.n-u hosiery.n-u @@ -1702,7 +1664,6 @@ humor.n-u humour.n-u humus.n-u -Hungarian.n-u hunger.n-u hunting.n-u hurdy_gurdy @@ -1722,9 +1683,7 @@ hysteria.n-u jade.n-u jail.n-u -Jamaican.n-u jam.n-u -Japanese.n-u jargon.n-u jasmine.n-u jaundice.n-u @@ -1739,7 +1698,6 @@ jingoism.n-u jive.n-u jogging.n-u -Jordanian.n-u journalese.n-u journalism.n-u joy.n-u @@ -1762,12 +1720,9 @@ karate.n-u karma.n-u katzenklavier.n-u -Kazakhi.n-u -kazoo.n-u kedgeree.n-u kelp.n-u ken.n-u -Kenyan.n-u kerosene.n-u ketchup.n-u keytar.n-u @@ -1789,11 +1744,9 @@ knitwear.n-u know-how.n-u kora.n-u -Korean.n-u koto.n-u kouxian.n-u kudos.n-u -Kuwaiti.n-u labor.n-u labour.n-u lace.n-u @@ -1802,8 +1755,8 @@ lacrosse.n-u lager.n-u 
laissez-faire.n-u -lambda.n-u lamb.n-u +lambda.n-u lambskin.n-u lamellaphone.n-u lamentation.n-u @@ -1812,7 +1765,6 @@ landscape.n-u language.n-u languor.n-u -Laotian.n-u lap.n-u larceny.n-u larch.n-u @@ -1825,7 +1777,6 @@ latex.n-u lather.n-u latitude.n-u -Latvian.n-u laudanum.n-u laughter.n-u lava.n-u @@ -1838,7 +1789,6 @@ leapfrog.n-u leasing.n-u leather.n-u -Lebanese.n-u lechery.n-u lee.n-u leeway.n-u @@ -1846,8 +1796,8 @@ legend.n-u legislation.n-u leisure.n-u -lemonade.n-u lemon.n-u +lemonade.n-u lending.n-u length.n-u leprosy.n-u @@ -1866,24 +1816,22 @@ liberalism.n-u liberalization.n-u liberation.n-u -Liberian.n-u libido.n-u -Libyan.n-u licentiousness.n-u lichen.n-u lieu.n-u -lifeblood.n-u life.n-u +lifeblood.n-u lifestyle.n-u +light.n-u lightness.n-u lightning.n-u -light.n-u lignite.n-u likeness.n-u limbo.n-u +lime.n-u limejuice.n-u limelight.n-u -lime.n-u limestone.n-u limitation.n-u linage.n-u @@ -1896,12 +1844,11 @@ lipstick.n-u liquidation.n-u liquidizer.n-u -liquorice.n-u liquor.n-u +liquorice.n-u lirone.n-u literature.n-u lithography.n-u -Lithuanian.n-u litigation.n-u litmus.n-u litotes.n-u @@ -1932,8 +1879,8 @@ loss.n-u lotion.n-u loudness.n-u -loveliness.n-u love.n-u +loveliness.n-u lox.n-u lubricant.n-u lubrication.n-u @@ -1941,14 +1888,13 @@ luggage.n-u lumbago.n-u lumber.n-u -luncheon.n-u lunch.n-u +luncheon.n-u lunchtime.n-u -luster.n-u lust.n-u +luster.n-u lustre.n-u lute.n-u -Luxembourian.n-u luxury.n-u lye.n-u lymph.n-u @@ -1957,13 +1903,11 @@ lyricism.n-u macadam.n-u macaroni.n-u -Macedonian.n-u mace.n-u machination.n-u machinery.n-u machismo.n-u mackerel.n-u -Madagascan.n-u madness.n-u magazine.n-u magenta.n-u @@ -1982,15 +1926,11 @@ malachite.n-u maladjustment.n-u malaria.n-u -Malawian.n-u -Malaysian.n-u malformation.n-u malfunction.n-u -Malian.n-u malice.n-u malnutrition.n-u malpractice.n-u -Maltese.n-u malt.n-u maltreatment.n-u management.n-u @@ -2027,8 +1967,8 @@ marmalade.n-u marriage.n-u marrow.n-u -marshmallow.n-u marsh.n-u 
+marshmallow.n-u martyrdom.n-u maryjane.n-u marzipan.n-u @@ -2036,28 +1976,26 @@ mash.n-u masochism.n-u masonry.n-u +mast.n-u mastery.n-u mastication.n-u -mast.n-u masturbation.n-u +material.n-u materialism.n-u materialization.n-u -material.n-u -mathematics.n-u math.n-u +mathematics.n-u matricide.n-u matriculation.n-u matrimony.n-u matter.n-u matting.n-u maturation.n-u -Mauritanian.n-u -Mauritian.n-u mauve.n-u maximization.n-u mayhem.n-u -mayonnaise.n-u mayo.n-u +mayonnaise.n-u mbira.n-u mead.n-u meadow.n-u @@ -2102,13 +2040,13 @@ merrymaking.n-u mescaline.n-u mesh.n-u +mess.n-u messaging.n-u messiness.n-u -mess.n-u metabolism.n-u +metal.n-u metallophone.n-u metallurgy.n-u -metal.n-u metalware.n-u metaphor.n-u meteorology.n-u @@ -2119,13 +2057,11 @@ methodology.n-u methyl.n-u mettle.n-u -Mexican.n-u mezzo-soprano.n-u microbiology.n-u microfiche.n-u microfilm.n-u micromanagement.n-u -Micronesian.n-u microscopy.n-u midday.n-u midnight.n-u @@ -2170,8 +2106,8 @@ misquotation.n-u misrepresentation.n-u misrule.n-u -mistletoe.n-u mist.n-u +mistletoe.n-u mistranslation.n-u mistrust.n-u misuse.n-u @@ -2189,7 +2125,6 @@ mohair.n-u moisture.n-u molasses.n-u -Moldovan.n-u mole.n-u molestation.n-u molybdenum.n-u @@ -2198,7 +2133,6 @@ monarchy.n-u monasticism.n-u money.n-u -Mongolian.n-u monogamy.n-u monotheism.n-u monotony.n-u @@ -2208,7 +2142,6 @@ moorland.n-u morale.n-u morning.n-u -Moroccan.n-u morphine.n-u morphology.n-u morsing.n-u @@ -2222,7 +2155,7 @@ mourning.n-u mousse.n-u movement.n-u -Mozambican.n-u +mu.n-u muck.n-u muckraking.n-u mucus.n-u @@ -2230,16 +2163,15 @@ muesli.n-u multiplex.n-u multiplication.n-u -mu.n-u murder.n-u muscle.n-u musete.n-u mush.n-u mushroom.n-u -musicianship.n-u music.n-u -musketry.n-u +musicianship.n-u musk.n-u +musketry.n-u mustard.n-u mutation.n-u mutilation.n-u @@ -2254,7 +2186,6 @@ mythology.n-u nadaswaram.n-u naivete.n-u -Namibian.n-u napalm.n-u narcissism.n-u narration.n-u @@ -2265,7 +2196,6 @@ naturalization.n-u nature.n-u naught.n-u 
-Nauruan.n-u nausea.n-u navigation.n-u neatness.n-u @@ -2281,8 +2211,8 @@ nephritis.n-u nepotism.n-u nerve.n-u -netball.n-u net.n-u +netball.n-u neufchatel.n-u neuralgia.n-u neurasthenia.n-u @@ -2290,19 +2220,16 @@ neutralization.n-u newsprint.n-u ney.n-u -Nicaraguan.n-u nickel.n-u nicotine.n-u -Nigerian.n-u -Nigerien.n-u nightfall.n-u nihilism.n-u nil.n-u nirvana.n-u nitrate.n-u nitrogen.n-u -nitroglycerine.n-u nitroglycerin.n-u +nitroglycerine.n-u nix.n-u nohkan.n-u noise.n-u @@ -2315,7 +2242,6 @@ nonsense.n-u noon.n-u normalization.n-u -Norwegian.n-u nosh.n-u nostalgia.n-u notation.n-u @@ -2325,10 +2251,10 @@ nougat.n-u nourishment.n-u novella.n-u +nu.n-u nudism.n-u nullification.n-u numbness.n-u -nu.n-u nursing.n-u nurture.n-u nutmeg.n-u @@ -2347,19 +2273,17 @@ paint.n-u paisley.n-u paixiao.n-u -Pakistani.n-u palaeontology.n-u palendag.n-u paleness.n-u paleontology.n-u pallor.n-u palsy.n-u -Panamanian.n-u pancake.n-u pandemonium.n-u panelling.n-u panic.n-u -pan_pipen-u +pan_pipe pantheism.n-u pantomime.n-u papadam.n-u @@ -2367,9 +2291,9 @@ paperwork.n-u paprika.n-u papyrus.n-u +par.n-u paradise.n-u paraffin.n-u -Paraguayan.n-u parallelism.n-u paralysis.n-u paranoia.n-u @@ -2377,7 +2301,6 @@ parchment.n-u pardon.n-u parentage.n-u -par.n-u parochialism.n-u parody.n-u parole.n-u @@ -2398,9 +2321,9 @@ pasteurization.n-u pastry.n-u pasture.n-u +paté.n-u patchwork.n-u pate.n-u -paté.n-u paternalism.n-u pathology.n-u pathos.n-u @@ -2409,25 +2332,25 @@ patronage.n-u patter.n-u pause.n-u +pay.n-u paydirt.n-u payment.n-u -pay.n-u payout.n-u +pea.n-u peace.n-u peacetime.n-u peach.n-u -pea.n-u peanut.n-u -pearl.n-u pear.n-u +pearl.n-u peat.n-u pecan.n-u pectin.n-u pedagogy.n-u pedantry.n-u pedigree.n-u -peel.n-u pee.n-u +peel.n-u penetration.n-u penicillin.n-u penmanship.n-u @@ -2435,8 +2358,8 @@ penury.n-u peonage.n-u pep.n-u -peppermint.n-u pepper.n-u +peppermint.n-u perch.n-u percussion.n-u perfection.n-u @@ -2444,6 +2367,7 @@ perforation.n-u perfume.n-u peril.n-u 
+period.n-u periwinkle.n-u perjury.n-u permafrost.n-u @@ -2460,20 +2384,19 @@ perspiration.n-u persuasion.n-u perusal.n-u -Peruvian.n-u perversion.n-u pessimism.n-u pesticide.n-u -petroleum.n-u petrol.n-u +petroleum.n-u pettiness.n-u pewter.n-u peyote.n-u pharmacology.n-u pheasant.n-u +phi.n-u philanthropy.n-u philology.n-u -phi.n-u phlegm.n-u phonology.n-u phosphate.n-u @@ -2489,8 +2412,9 @@ physiology.n-u physiotherapy.n-u physique.n-u -pianoforte.n-u +pi.n-u piano.n-u +pianoforte.n-u pibgorn.n-u piccolo.n-u pickle.n-u @@ -2498,19 +2422,18 @@ pico_de_gallo pide.n-u pidgin.n-u -piecework.n-u pie.n-u +piecework.n-u piffle.n-u -pigmentation.n-u pigment.n-u +pigmentation.n-u pigskin.n-u pilaf.n-u pimento.n-u pinball.n-u -pineapple.n-u pine.n-u +pineapple.n-u pingpong.n-u -pi.n-u pipe.n-u piping.n-u pique.n-u @@ -2530,11 +2453,11 @@ plaque.n-u plasma.n-u plaster.n-u -plasticine.n-u plastic.n-u +plasticine.n-u platinum.n-u -playback.n-u play.n-u +playback.n-u playtime.n-u pleasure.n-u pluck.n-u @@ -2554,18 +2477,17 @@ poker.n-u polarization.n-u police.n-u -poliomyelitis.n-u polio.n-u +poliomyelitis.n-u polish.n-u -Polish.n-u politeness.n-u politics.n-u pollen.n-u pollination.n-u +pollo.n-u pollo_con_oregano pollo_con_queso pollo_frito -pollo.n-u pollution.n-u polo.n-u polska_kielbasa @@ -2574,10 +2496,10 @@ polytheism.n-u polythene.n-u pomp.n-u +pop.n-u popcorn.n-u popery.n-u poplar.n-u -pop.n-u poppycock.n-u popularization.n-u population.n-u @@ -2589,23 +2511,23 @@ porridge.n-u port.n-u portraiture.n-u -Portugese.n-u positivism.n-u possession.n-u -postage.n-u post.n-u +postage.n-u postponement.n-u posture.n-u +pot.n-u potash.n-u potassium.n-u potato.n-u potential.n-u -pot.n-u pottery.n-u poultry.n-u powder.n-u power.n-u practice.n-u +pragmatics.n-u pragmatism.n-u praise.n-u prattle.n-u @@ -2621,8 +2543,8 @@ prejudice.n-u premeditation.n-u preoccupation.n-u -preparation.n-u prep.n-u +preparation.n-u prescription.n-u presentation.n-u preservation.n-u @@ -2640,11 
+2562,12 @@ prime.n-u primogeniture.n-u principal.n-u -printing.n-u print.n-u +printing.n-u prison.n-u privatization.n-u privilege.n-u +prize.n-u probate.n-u probation.n-u procedure.n-u @@ -2655,13 +2578,13 @@ production.n-u professionalism.n-u profile.n-u -profiteering.n-u profit.n-u +profiteering.n-u profusion.n-u prognostication.n-u programming.n-u -progression.n-u progress.n-u +progression.n-u prohibition.n-u prolongation.n-u promptness.n-u @@ -2676,13 +2599,13 @@ proportion.n-u propulsion.n-u proscription.n-u -prosecution.n-u prose.n-u +prosecution.n-u prosody.n-u prostitution.n-u prostration.n-u -protectionism.n-u protection.n-u +protectionism.n-u protein.n-u protest.n-u protocol.n-u @@ -2723,15 +2646,14 @@ pursuit.n-u pus.n-u puzzlement.n-u -Qatari.n-u quackery.n-u qualification.n-u quarantine.n-u quartz.n-u quatro.n-u queasiness.n-u -queso_blanco queso.n-u +queso_blanco quiche.n-u quicksand.n-u quicksilver.n-u @@ -2744,21 +2666,21 @@ race.n-u racing.n-u racism.n-u -racketeering.n-u racket.n-u +racketeering.n-u radar.n-u radiation.n-u radicalism.n-u +radio.n-u radiography.n-u radiology.n-u -radio.n-u radiotherapy.n-u radium.n-u radius.n-u rage.n-u ragtime.n-u -rainfall.n-u rain.n-u +rainfall.n-u rainstick.n-u rainwater.n-u ranch.n-u @@ -2766,8 +2688,8 @@ range.n-u rank.n-u ransom.n-u -rape.n-u rap.n-u +rape.n-u rapport.n-u rapture.n-u rarebit.n-u @@ -2779,6 +2701,7 @@ rayon.n-u reach.n-u reaction.n-u +reader.n-u reading.n-u readjustment.n-u readmission.n-u @@ -2807,8 +2730,8 @@ reconciliation.n-u reconsideration.n-u reconstruction.n-u -recorder.n-u record.n-u +recorder.n-u recourse.n-u recovery.n-u recreation.n-u @@ -2817,6 +2740,7 @@ rectification.n-u rectitude.n-u recuperation.n-u +redeclaration.n-u redemption.n-u redeployment.n-u redevelopment.n-u @@ -2829,15 +2753,15 @@ redwood.n-u reed.n-u reek.n-u -re-election.n-u reelection.n-u +re-election.n-u referral.n-u refinancing.n-u refinement.n-u reflection.n-u reforestation.n-u -reformation.n-u reform.n-u 
+reformation.n-u refraction.n-u refreshment.n-u refrigeration.n-u @@ -2922,9 +2846,9 @@ respect.n-u respiration.n-u respite.n-u +rest.n-u restitution.n-u restlessness.n-u -rest.n-u restoration.n-u restraint.n-u restructuring.n-u @@ -2946,8 +2870,8 @@ reunification.n-u reunion.n-u revaluation.n-u -revelation.n-u revel.n-u +revelation.n-u revelry.n-u revenge.n-u revenue.n-u @@ -2957,8 +2881,8 @@ reverse.n-u reversion.n-u review.n-u -revisionism.n-u revision.n-u +revisionism.n-u revitalization.n-u revival.n-u revocation.n-u @@ -2978,13 +2902,13 @@ richness.n-u ridicule.n-u rigging.n-u +right.n-u righteousness.n-u rightness.n-u -right.n-u rigmarole.n-u rigour.n-u -rioting.n-u riot.n-u +rioting.n-u risotto.n-u ritual.n-u rivalry.n-u @@ -2992,7 +2916,6 @@ robbery.n-u rock.n-u roe.n-u -Romanian.n-u romanticism.n-u roofing.n-u room.n-u @@ -3000,10 +2923,10 @@ rosemary.n-u rosewood.n-u rosin.n-u +rot.n-u rotation.n-u rotgut.n-u rotini.n-u -rot.n-u rouge.n-u roughage.n-u roulette.n-u @@ -3018,23 +2941,21 @@ ruggedization.n-u rugger.n-u ruin.n-u -rummy.n-u rum.n-u +rummy.n-u rumour.n-u rupture.n-u rush.n-u russet.n-u -Russian.n-u -rustle.n-u rust.n-u -Rwandan.n-u +rustle.n-u rye.n-u ryuteki.n-u sable.n-u sabotage.n-u saccharin.n-u -sackbut.n-u sack.n-u +sackbut.n-u sacrifice.n-u sacrilege.n-u sadism.n-u @@ -3055,24 +2976,22 @@ saline.n-u saliva.n-u salmon.n-u -salsa_mexicana salsa.n-u +salsa_mexicana salsa_rojo salsa_verde salt.n-u saltpetre.n-u saltwater.n-u -Salvadoran.n-u salvage.n-u salvation.n-u salve.n-u sameness.n-u -Samoan.n-u samponia.n-u sanction.n-u sanctuary.n-u -sandalwood.n-u sand.n-u +sandalwood.n-u sandpaper.n-u sandstone.n-u sandwich.n-u @@ -3113,8 +3032,8 @@ schnitzel.n-u scholarship.n-u scholasticism.n-u -schooling.n-u school.n-u +schooling.n-u schooltime.n-u sciatica.n-u scintillation.n-u @@ -3123,7 +3042,6 @@ scope.n-u scorn.n-u scotch.n-u -Scottish.n-u scrabble.n-u scrap.n-u scruple.n-u @@ -3132,9 +3050,9 @@ sculpture.n-u scum.n-u scurvy.n-u +sea.n-u 
seafood.n-u seamanship.n-u -sea.n-u seasickness.n-u seasoning.n-u seaweed.n-u @@ -3169,13 +3087,11 @@ semaphore.n-u semen.n-u semolina.n-u -Senegalese.n-u -sensationalism.n-u sensation.n-u +sensationalism.n-u sentiment.n-u separation.n-u sepia.n-u -Serbian.n-u serfdom.n-u serge.n-u seriousness.n-u @@ -3186,8 +3102,8 @@ set.n-u settlement.n-u sewage.n-u -sexism.n-u sex.n-u +sexism.n-u shade.n-u shading.n-u shag.n-u @@ -3234,19 +3150,18 @@ shrinkage.n-u shuteye.n-u shyness.n-u -Sicilian.n-u sickness.n-u siege.n-u sienna.n-u sight.n-u -sight-seeing.n-u sightseeing.n-u +sight-seeing.n-u sigma.n-u signification.n-u siku.n-u silica.n-u -silicone.n-u silicon.n-u +silicone.n-u silk.n-u silliness.n-u silt.n-u @@ -3284,8 +3199,6 @@ sloe.n-u slope.n-u sloth.n-u -Slovakian.n-u -Slovenian.n-u sludge.n-u slush.n-u smallpox.n-u @@ -3309,9 +3222,9 @@ socialism.n-u socialization.n-u sociology.n-u +sod.n-u soda.n-u sodium.n-u -sod.n-u sodomy.n-u softness.n-u software.n-u @@ -3324,7 +3237,6 @@ solipsism.n-u solitaire.n-u solitude.n-u -Somalian.n-u soot.n-u sophistication.n-u sophistry.n-u @@ -3343,7 +3255,6 @@ space.n-u spaghetti.n-u spam.n-u -Spanish.n-u spawn.n-u spearmint.n-u specialization.n-u @@ -3354,13 +3265,13 @@ speed.n-u spelling.n-u spending.n-u -spermicide.n-u sperm.n-u +spermicide.n-u spice.n-u spillage.n-u spillover.n-u -spinach.n-u spin.n-u +spinach.n-u spinsterhood.n-u spirit.n-u spite.n-u @@ -3406,15 +3317,15 @@ steak.n-u stealth.n-u steam.n-u -steelmaking.n-u steel.n-u +steelmaking.n-u steelpan.n-u stenography.n-u stereotype.n-u sterilization.n-u sterling.n-u -stewardship.n-u stew.n-u +stewardship.n-u stiffness.n-u stillness.n-u stimulation.n-u @@ -3431,8 +3342,10 @@ strength.n-u stress.n-u stretch.n-u +stretchiness.n-u strife.n-u string.n-u +stringification.n-u striptease.n-u strontium.n-u structure.n-u @@ -3443,8 +3356,8 @@ stubbornness.n-u stucco.n-u study.n-u -stuffing.n-u stuff.n-u +stuffing.n-u stupefaction.n-u stupor.n-u sturgeon.n-u @@ -3469,10 +3382,9 @@ 
subtraction.n-u suburbia.n-u subversion.n-u -succession.n-u success.n-u +succession.n-u suction.n-u -Sudanese.n-u suede.n-u suet.n-u suffering.n-u @@ -3485,10 +3397,10 @@ sulphur.n-u summer.n-u summertime.n-u +sun.n-u sunburn.n-u sundown.n-u sunlight.n-u -sun.n-u sunrise.n-u sunset.n-u sunshine.n-u @@ -3502,7 +3414,6 @@ support.n-u suppression.n-u surgery.n-u -Surinamese.n-u surprise.n-u surrealism.n-u surveying.n-u @@ -3513,7 +3424,6 @@ swamp.n-u sway.n-u sweat.n-u -Swedish.n-u sweetbread.n-u sweetening.n-u sweetmeat.n-u @@ -3521,7 +3431,6 @@ swelling.n-u swill.n-u swimsuit.n-u -Swiss.n-u swordplay.n-u swordsmanship.n-u sycamore.n-u @@ -3538,7 +3447,6 @@ syntax.n-u synthesizer.n-u syphilis.n-u -Syrian.n-u syrup.n-u systematization.n-u tabbouleh.n-u @@ -3548,13 +3456,12 @@ taboulleh.n-u taboulli.n-u tabulation.n-u -tackle.n-u tack.n-u +tackle.n-u tact.n-u taffeta.n-u taffy.n-u tag.n-u -Taiwanese.n-u takeoff.n-u take-up.n-u talcum.n-u @@ -3567,12 +3474,11 @@ tamburitza.n-u tandem.n-u tangerine.n-u -Tanzanian.n-u tape.n-u tapestry.n-u tapioca.n-u -tarmac.n-u tar.n-u +tarmac.n-u tarogato.n-u tarragon.n-u tartar.n-u @@ -3583,11 +3489,11 @@ taxation.n-u taxidermy.n-u taxonomy.n-u +tea.n-u teaching.n-u teak.n-u teal.n-u teamwork.n-u -tea.n-u teatime.n-u technique.n-u technology.n-u @@ -3599,9 +3505,9 @@ telepathy.n-u teletype.n-u television.n-u +temper.n-u temperament.n-u temperature.n-u -temper.n-u tenderloin.n-u tenderness.n-u tenné.n-u @@ -3615,15 +3521,13 @@ terra_cotta terrain.n-u territory.n-u -terrorism.n-u terror.n-u +terrorism.n-u testimony.n-u testing.n-u tetanus.n-u -Texan.n-u text.n-u texture.n-u -Thai.n-u thankfulness.n-u thanksgiving.n-u theater.n-u @@ -3642,8 +3546,8 @@ thrombosis.n-u throttle.n-u thrust.n-u -thunderclap.n-u thunder.n-u +thunderclap.n-u thyme.n-u tide.n-u tightness.n-u @@ -3653,9 +3557,9 @@ timeliness.n-u timing.n-u timpani.n-u +tin.n-u tinder.n-u tinfoil.n-u -tin.n-u tinsel.n-u tiredness.n-u tissue.n-u @@ -3669,8 +3573,8 @@ tomatillo.n-u 
tomato.n-u tomfoolery.n-u -tom-tom.n-u tomtom.n-u +tom-tom.n-u tone.n-u tonette.n-u tonnage.n-u @@ -3678,6 +3582,7 @@ toothache.n-u toothpaste.n-u topography.n-u +topology.n-u topsoil.n-u torment.n-u torpor.n-u @@ -3688,16 +3593,16 @@ totalitarianism.n-u touch.n-u tourism.n-u -town.n-u tow.n-u +town.n-u toxicology.n-u toxin.n-u trace.n-u traction.n-u trade.n-u trading.n-u -traditionalism.n-u tradition.n-u +traditionalism.n-u traffic.n-u tragedy.n-u training.n-u @@ -3709,16 +3614,16 @@ transformation.n-u transfusion.n-u transgression.n-u -transition.n-u transit.n-u +transition.n-u translation.n-u transmigration.n-u transmission.n-u transmutation.n-u transpiration.n-u transplantation.n-u -transportation.n-u transport.n-u +transportation.n-u transposition.n-u transubstantiation.n-u trash.n-u @@ -3739,7 +3644,6 @@ trifle.n-u trigonometry.n-u trim.n-u -Trinidadian.n-u tripe.n-u triplicate.n-u triumph.n-u @@ -3765,11 +3669,9 @@ tuna.n-u tundra.n-u tungsten.n-u -Tunisian.n-u turbot.n-u turf.n-u turkey.n-u -Turkish.n-u turmeric.n-u turmoil.n-u turnaround.n-u @@ -3812,7 +3714,6 @@ vending.n-u veneer.n-u veneration.n-u -Venezuelan.n-u venison.n-u venom.n-u ventilation.n-u @@ -3840,7 +3741,6 @@ victory.n-u video.n-u vielle.n-u -Vietnamese.n-u vigor.n-u vigour.n-u vihuela.n-u @@ -3851,10 +3751,10 @@ vinegar.n-u vintage.n-u vinyl.n-u +viol.n-u viola.n-u violation.n-u violin.n-u -viol.n-u violoncello.n-u violotta.n-u viridian.n-u @@ -3876,18 +3776,18 @@ wallpaper.n-u walnut.n-u wanderlust.n-u +war.n-u warfare.n-u warmth.n-u -war.n-u warpaint.n-u wartime.n-u washboard.n-u washing.n-u waste.n-u +water.n-u watercolour.n-u watercress.n-u watermelon.n-u -water.n-u wattage.n-u wax.n-u weakness.n-u @@ -3895,13 +3795,12 @@ weariness.n-u weather.n-u wedlock.n-u -weedkiller.n-u weed.n-u -weightlifting.n-u +weedkiller.n-u weight.n-u +weightlifting.n-u welfare.n-u well-being.n-u -Welsh.n-u westernization.n-u whalebone.n-u wheat.n-u @@ -3913,15 +3812,15 @@ whiskey.n-u whisky.n-u whistle.n-u 
-whiteness.n-u white.n-u +whiteness.n-u whitewash.n-u whiting.n-u wholemeal.n-u wholesale.n-u +wick.n-u wickedness.n-u wicker.n-u -wick.n-u width.n-u wilderness.n-u wildfire.n-u @@ -3931,32 +3830,32 @@ wind.n-u wine.n-u winter.n-u -wireless.n-u wire.n-u +wireless.n-u wiring.n-u wisdom.n-u wisteria.n-u +wit.n-u witchcraft.n-u withdrawal.n-u -wit.n-u wizardry.n-u woe.n-u womanhood.n-u womankind.n-u wonder.n-u wont.n-u +wood.n-u woodcraft.n-u woodland.n-u -wood.n-u woodwork.n-u wool.n-u -workmanship.n-u work.n-u +workmanship.n-u worry.n-u worship.n-u worsted.n-u -worthiness.n-u worth.n-u +worthiness.n-u wrath.n-u wreckage.n-u wrestling.n-u @@ -3971,7 +3870,6 @@ yarn.n-u yarrow.n-u yeast.n-u -Yemeni.n-u yew.n-u yield.n-u yoga.n-u @@ -3980,10 +3878,8 @@ yogurt.n-u yolk.n-u yore.n-u -youthfulness.n-u youth.n-u -Yugoslavian.n-u -Yugoslav.n-u +youthfulness.n-u yuletide.n-u zeal.n-u zealotry.n-u diff -Nru link-grammar-5.3.16/data/en/words/words.n.3-vowel link-grammar-5.5.0/data/en/words/words.n.3-vowel --- link-grammar-5.3.16/data/en/words/words.n.3-vowel 2016-09-20 22:37:21.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.n.3-vowel 2018-03-10 05:17:57.000000000 +0000 @@ -1,7 +1,5 @@ -a/c.n-u -A/C.n-u -abandonment.n-u abandon.n-u +abandonment.n-u abasement.n-u abdication.n-u aberration.n-u @@ -22,18 +20,20 @@ abstention.n-u abstraction.n-u abuse.n-u +a/c.n-u +A/C.n-u academia.n-u acceleration.n-u accentuation.n-u -accession.n-u access.n-u +accession.n-u acclaim.n-u acclamation.n-u acclimatization.n-u accommodation.n-u accomplishment.n-u -accordion.n-u accord.n-u +accordion.n-u accretion.n-u accumulation.n-u acetylene.n-u @@ -60,14 +60,14 @@ administration.n-u admiration.n-u admission.n-u +ado.n-u adobe.n-u adobo.n-u -ado.n-u adoption.n-u adoration.n-u adornment.n-u -adrenaline.n-u adrenalin.n-u +adrenaline.n-u adroitness.n-u adulation.n-u adulteration.n-u @@ -86,14 +86,12 @@ affection.n-u affiliation.n-u affirmation.n-u -Afghan.n-u -African.n-u aftercare.n-u 
aftermath.n-u afternoon.n-u agar.n-u -ageing.n-u age.n-u +ageing.n-u agglomeration.n-u aggravation.n-u aggregation.n-u @@ -102,7 +100,6 @@ aggro.n-u agidigbo.n-u agitation.n-u -ag.n-u agnosticism.n-u agony.n-u agoraphobia.n-u @@ -112,22 +109,19 @@ agronomy.n-u aid.n-u aim.n-u +air.n-u aircraft.n-u airmail.n-u -air.n-u alabaster.n-u alarm.n-u -Albanian.n-u -Alberian.n-u albumen.n-u alchemy.n-u -alcoholism.n-u alcohol.n-u +alcoholism.n-u ale.n-u alertness.n-u alfalfa.n-u algebra.n-u -Algerian.n-u alienation.n-u alignment.n-u alimony.n-u @@ -149,9 +143,9 @@ altitude.n-u alto.n-u altruism.n-u +alum.n-u aluminium.n-u aluminum.n-u -alum.n-u amalgamation.n-u amaranth.n-u amateurism.n-u @@ -161,7 +155,6 @@ ambush.n-u amelioration.n-u amendment.n-u -American.n-u amethyst.n-u ammo.n-u ammonia.n-u @@ -176,7 +169,6 @@ anaemia.n-u anaesthesia.n-u anaesthetic.n-u -anesthetic.n-u analgesia.n-u analogy.n-u analysis.n-u @@ -184,11 +176,9 @@ anarchy.n-u anathema.n-u anatomy.n-u -Andorran.n-u -Angentinian.n-u +anesthetic.n-u anger.n-u angina.n-u -Angolan.n-u angora.n-u angst.n-u anguish.n-u @@ -208,7 +198,6 @@ anthropology.n-u anticipation.n-u antifreeze.n-u -Antiguan.n-u antimony.n-u antipathy.n-u apartheid.n-u @@ -239,9 +228,8 @@ apricot.n-u aptitude.n-u aptness.n-u -aquamarine.n-u aqua.n-u -Arab.n-u +aquamarine.n-u arbitration.n-u archaeology.n-u archeology.n-u @@ -256,17 +244,17 @@ argumentation.n-u arithmetic.n-u armament.n-u -Armenian.n-u armor.n-u armour.n-u arpeggione.n-u arrangement.n-u arrest.n-u +arroz.n-u arroz_blanco arroz_con_pollo -arroz.n-u arsenic.n-u arson.n-u +art.n-u arteriosclerosis.n-u arthritis.n-u artichoke.n-u @@ -274,13 +262,10 @@ artifice.n-u artillery.n-u artistry.n-u -art.n-u artwork.n-u asbestos.n-u asceticism.n-u -Aserbaijani.n-u ash.n-u -Asian.n-u asparagus.n-u asphalt.n-u asphyxia.n-u @@ -321,8 +306,6 @@ audition.n-u aught.n-u augmentation.n-u -Australian.n-u -Austrian.n-u autarchy.n-u autarky.n-u authentication.n-u @@ -345,8 +328,8 @@ azure.n-u 
earnest.n-u earshot.n-u -earthenware.n-u earth.n-u +earthenware.n-u earwax.n-u ease.n-u ebony.n-u @@ -357,12 +340,11 @@ ecru.n-u ecstasy.n-u ectoplasm.n-u -Ecuadorian.n-u edification.n-u education.n-u effacement.n-u -effectiveness.n-u effect.n-u +effectiveness.n-u effluent.n-u effrontery.n-u effusiveness.n-u @@ -372,15 +354,14 @@ eggshell.n-u egoism.n-u egotism.n-u -Egyptian.n-u ejaculation.n-u ejection.n-u elaboration.n-u elan.n-u elastic.n-u elation.n-u -electioneering.n-u election.n-u +electioneering.n-u electrification.n-u electrochemistry.n-u electrocution.n-u @@ -434,12 +415,11 @@ endorsement.n-u endowment.n-u energy.n-u -enervation.v +enervation.n-u enforcement.n-u enfranchisement.n-u engagement.n-u engineering.n-u -English.n-u engraving.n-u enhancement.n-u enjoyment.n-u @@ -478,7 +458,6 @@ equivocation.n-u erasure.n-u erection.n-u -Eritrean.n-u erosion.n-u eroticism.n-u error.n-u @@ -496,12 +475,11 @@ estate.n-u esteem.n-u estimation.n-u -Estonian.n-u estrangement.n-u eta.n-u +ethanol.n-u ether.n-u ethics.n-u -Ethiopian.n-u ethnography.n-u ethnology.n-u ethyl.n-u @@ -510,7 +488,6 @@ etymology.n-u euphonium.n-u euphoria.n-u -European.n-u euthanasia.n-u evacuation.n-u evangelism.n-u @@ -550,10 +527,10 @@ exercise.n-u exertion.n-u exhalation.n-u -exhaustion.n-u exhaust.n-u -exhibitionism.n-u +exhaustion.n-u exhibition.n-u +exhibitionism.n-u exhilaration.n-u exhortation.n-u exile.n-u @@ -574,9 +551,9 @@ exportation.n-u exposition.n-u exposure.n-u -expressionism.n-u -expression.n-u express.n-u +expression.n-u +expressionism.n-u expropriation.n-u expulsion.n-u expurgation.n-u @@ -587,8 +564,8 @@ extinction.n-u extirpation.n-u extortion.n-u -extraction.n-u extract.n-u +extraction.n-u extradition.n-u extrapolation.n-u extrication.n-u @@ -597,9 +574,8 @@ eyesight.n-u eyestrain.n-u eyewash.n-u -iceberg.n-u -Icelandish.n-u ice.n-u +iceberg.n-u idealism.n-u idealization.n-u identification.n-u @@ -660,7 +636,6 @@ incursion.n-u indecision.n-u indemnification.n-u 
-Indian.n-u indictment.n-u indigestion.n-u indignation.n-u @@ -669,7 +644,6 @@ indisposition.n-u individualism.n-u indoctrination.n-u -Indonesian.n-u inducement.n-u induction.n-u industrialization.n-u @@ -784,30 +758,24 @@ iodine.n-u ionization.n-u iota.n-u -Iranian.n-u -Iraqi.n-u ire.n-u iridium.n-u -Irish.n-u -ironmongery.n-u iron.n-u +ironmongery.n-u ironware.n-u irony.n-u irrigation.n-u irritation.n-u -isolationism.n-u isolation.n-u -Israeli.n-u -Italian.n-u -Ivorian.n-u +isolationism.n-u ivory.n-u ivy.n-u oak.n-u oarsmanship.n-u +oat.n-u oatcake.n-u oatmeal.n-u -oatn-u -oat.n-u +oats.n-u obliteration.n-u oblivion.n-u obnoxiousness.n-u @@ -815,8 +783,8 @@ obscurantism.n-u observation.n-u obsession.n-u -obstructionism.n-u obstruction.n-u +obstructionism.n-u ocarina.n-u ocean.n-u ochre.n-u @@ -828,15 +796,14 @@ officialdom.n-u officialese.n-u offset.n-u -oilcloth.n-u oil.n-u +oilcloth.n-u ointment.n-u okra.n-u olestra.n-u oligarchy.n-u olive.n-u olivine.n-u -Omani.n-u omega.n-u omelet.n-u omelette.n-u @@ -861,18 +828,18 @@ optics.n-u optimism.n-u opus.n-u -orangeade.n-u orange.n-u +orangeade.n-u oratory.n-u orchestration.n-u ordination.n-u ore.n-u -organization.n-u organ.n-u +organization.n-u orientation.n-u origin.n-u -ornamentation.n-u ornament.n-u +ornamentation.n-u ornithology.n-u orthodoxy.n-u orthography.n-u @@ -912,9 +879,7 @@ oyster.n-u oystershell.n-u ozone.n-u -Ugandan.n-u ugliness.n-u -Ukrainian.n-u ukulele.n-u umbrage.n-u unconsciousness.n-u @@ -950,10 +915,9 @@ uranium.n-u urbanization.n-u urine.n-u -Uruguayan.n-u usage.n-u -usefulness.n-u use.n-u +usefulness.n-u usurpation.n-u usury.n-u utilitarianism.n-u diff -Nru link-grammar-5.3.16/data/en/words/words.n.3.y link-grammar-5.5.0/data/en/words/words.n.3.y --- link-grammar-5.3.16/data/en/words/words.n.3.y 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.n.3.y 2018-02-22 06:44:54.000000000 +0000 @@ -36,6 +36,7 @@ amiability.n-u amicability.n-u amity.n-u +amorality.n-u 
animosity.n-u annoyance.n-u anonymity.n-u @@ -81,8 +82,8 @@ brutality.n-u buoyancy.n-u bureaucracy.n-u -calm.n-u calmness.n-u +calm.n-u capability.n-u captivity.n-u catholicity.n-u @@ -178,6 +179,7 @@ dignity.n-u diligence.n-u diplomacy.n-u +directionality.n-u disability.n-u discontinuity.n-u discrepancy.n-u @@ -292,6 +294,7 @@ humanity.n-u humidity.n-u humility.n-u +idempotence.n-u identity.n-u idiocy.n-u ignorance.n-u @@ -402,6 +405,7 @@ intelligibility.n-u intemperance.n-u interference.n-u +interoperability.n-u intimacy.n-u intolerance.n-u intractability.n-u @@ -513,6 +517,7 @@ passivity.n-u paternity.n-u patience.n-u +patternicity.n-u paucity.n-u peculiarity.n-u pederasty.n-u @@ -541,6 +546,7 @@ piracy.n-u pity.n-u placidity.n-u +planarity.n-u plasticity.n-u plausibility.n-u pliancy.n-u @@ -626,6 +632,7 @@ reminiscence.n-u remittance.n-u remonstrance.n-u +repeatability.n-u repentance.n-u repugnance.n-u resemblance.n-u @@ -651,6 +658,7 @@ sanctity.n-u sanity.n-u satiety.n-u +satisfiability.n-u scarcity.n-u science.n-u secrecy.n-u diff -Nru link-grammar-5.3.16/data/en/words/words.v.10.1 link-grammar-5.5.0/data/en/words/words.v.10.1 --- link-grammar-5.3.16/data/en/words/words.v.10.1 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.10.1 2018-01-25 03:30:03.000000000 +0000 @@ -124,5 +124,6 @@ waver.q whisper.q wonder.q +word.q write.q yell.q diff -Nru link-grammar-5.3.16/data/en/words/words.v.10.2 link-grammar-5.5.0/data/en/words/words.v.10.2 --- link-grammar-5.3.16/data/en/words/words.v.10.2 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.10.2 2018-01-25 03:30:03.000000000 +0000 @@ -124,5 +124,6 @@ wavers.q whispers.q wonders.q +words.q writes.q yells.q diff -Nru link-grammar-5.3.16/data/en/words/words.v.10.3 link-grammar-5.5.0/data/en/words/words.v.10.3 --- link-grammar-5.3.16/data/en/words/words.v.10.3 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.10.3 2018-02-10 
23:03:13.000000000 +0000 @@ -122,4 +122,5 @@ wavered.q-d whispered.q-d wondered.q-d +worded.q-d yelled.q-d diff -Nru link-grammar-5.3.16/data/en/words/words.v.10.4 link-grammar-5.5.0/data/en/words/words.v.10.4 --- link-grammar-5.3.16/data/en/words/words.v.10.4 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.10.4 2018-01-25 03:30:03.000000000 +0000 @@ -124,5 +124,6 @@ wavering.q whispering.q wondering.q +wording.q writing.q yelling.q diff -Nru link-grammar-5.3.16/data/en/words/words.v.1.p link-grammar-5.5.0/data/en/words/words.v.1.p --- link-grammar-5.3.16/data/en/words/words.v.1.p 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.1.p 2018-01-24 20:17:28.000000000 +0000 @@ -1,17 +1,86 @@ -done_for accounted_for adhered_to arrived_at barked_at belched_at catered_to -clung_to complied_with coped_with depended_on disapproved_of discriminated_against -embarked_on frowned_on gazed_upon glared_at gossiped_about interfered_with -laughed_at lived_in longed_for monkeyed_with objected_to presided_over -ratted_on relied_on resorted_to talked_to toyed_with trifled_with voted_on -voted_for jumped_on lied_to listened_to screamed_at sat_on -spoken_to stepped_on appealed_to attended_to conceived_of dealt_with -dispensed_with entered_into improved_on improved_upon negotiated_with -operated_on referred_to searched_for stared_at tended_to yelled_at -counted_on pointed_to pointed_at -fought_with planned_for arranged_for tried_for added_to reckoned_with -approved_of ruled_on wished_for hoped_for decided_on cared_for shouted_at -dreamed_of dreamt_of thought_of heard_from looked_at called_for uncalled_for -asked_for undreamed_of undreamt_of paid_for slept_in -agreed_upon talked_about - - +accounted_for +added_to +adhered_to +agreed_upon +appealed_to +approved_of +arranged_for +arrived_at +asked_for +attended_to +barked_at +belched_at +called_for +cared_for +catered_to +clung_to +complied_with +conceived_of +coped_with 
+counted_on +dealt_with +decided_on +depended_on +disapproved_of +discriminated_against +dispensed_with +done_for +dreamed_of +dreamt_of +embarked_on +entered_into +fought_with +frowned_on +gazed_upon +glared_at +gossiped_about +heard_from +hoped_for +improved_on +improved_upon +interfered_with +jumped_on +laughed_at +lied_to +listened_to +lived_in +longed_for +looked_at +monkeyed_with +negotiated_with +objected_to +operated_on +paid_for +planned_for +pointed_at +pointed_to +prayed_for +presided_over +ratted_on +reckoned_with +referred_to +relied_on +resorted_to +ruled_on +sat_on +screamed_at +searched_for +shouted_at +slept_in +spoken_to +stared_at +stepped_on +talked_about +talked_to +tended_to +thought_of +toyed_with +tried_for +trifled_with +uncalled_for +undreamed_of +undreamt_of +voted_for +voted_on +wished_for +yelled_at diff -Nru link-grammar-5.3.16/data/en/words/words.v.2.1 link-grammar-5.5.0/data/en/words/words.v.2.1 --- link-grammar-5.3.16/data/en/words/words.v.2.1 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.2.1 2018-02-22 06:44:54.000000000 +0000 @@ -219,6 +219,7 @@ coprecipitate.v coprocess.v copurify.v +copy/paste.v coregister.v correct.v correlate.v @@ -248,6 +249,8 @@ cross-check.v crosscheck.v cross_fertilize +cross-fertilize.v +cross-link.v crosslink.v cryosection.v crystallize.v @@ -342,6 +345,7 @@ de-quench.v dequench.v derail.v +dereference.v derive.v derogate.v de-saturate.v @@ -542,10 +546,12 @@ glucosylate.v gnash.v gnaw.v +google.v govern.v granulate.v grasp.v grate.v +grep.v grieve.v groan.v grope.v @@ -573,6 +579,7 @@ hemolyze.v hem.v heterotransplant.v +histogram.v hoard.v hoe.v honk.v @@ -692,6 +699,7 @@ localize.v locate.v lodge.v +lowercase.v luminesce.v lunch.v luteinize.v @@ -712,6 +720,7 @@ meet.v meld.v meliorate.v +memoize.v mend.v merge.v mesh.v @@ -775,6 +784,7 @@ obsess.v occlude.v off-load.v +offset.v ogle.v opacify.v operate.v @@ -862,18 +872,20 @@ plate.v platt.v plat.v -please.v 
plunder.v poach.v poise.v poll.v pose.v +postpose.v postprocess.v power-dive.v practise.v preach.v preactivate.v +preallocate.v precompile.v +precompute.v precontract.v preincubate.v premiere.v @@ -931,6 +943,7 @@ rear.v reassemble.v reawaken.v +rebalance.v rebate.v rebleed.v rebreathe.v @@ -948,6 +961,7 @@ reciprocate.v recolonize.v recommence.v +recompile.v recompute.v reconfirm.v reconnect.v @@ -958,6 +972,7 @@ recross.v recruit.v recrystallize.v +recurse.v recurve.v redden.v redifferentiate.v @@ -991,6 +1006,7 @@ rehearse.v rehydrate.v re-ignite.v +reimplement.v reincorporate.v reincubate.v reinduce.v @@ -1019,6 +1035,7 @@ reoxidize.v repack.v repair.v +reparse.v rephosphorylate.v repolarize.v repossess.v @@ -1027,6 +1044,7 @@ reproduce.v re-read.v rerun.v +rescale.v reseed.v resensitize.v resettle.v @@ -1072,7 +1090,9 @@ saponify.v scan.v scar_cicatrise +scar-cicatrise.v scar_cicatrize +scar-cicatrize.v scatter.v scissor.v sclerose.v @@ -1225,6 +1245,7 @@ thrum.v thump.v thunder.v +thunk.v tickle.v tilt.v tine.v @@ -1232,7 +1253,7 @@ titivate.v titrate.v toast.v -tokenise.n +tokenise.v tokenize.v toot.v trace.v @@ -1260,10 +1281,15 @@ twirl.v ulcerate.v umpire.v +unapply.v unbend.v +unbundle.v +uncapitalize.v uncoat.v uncoil.v +uncomment.v uncurl.v +undefine.v underbid.v undercorrect.v underfeed.v @@ -1271,6 +1297,7 @@ underreport.v underspend.v undress.v +unexpand.v unfasten.v unfold.v unfreeze.v @@ -1287,13 +1314,16 @@ unroll.v unscrew.v unseal.v +unselect.v unstress.v untangle.v +unterminate.v untie.v untwist.v unwind.v unwrap.v upload.v +uppercase.v urbanize.v vaporize.v varicellize.v @@ -1342,3 +1372,4 @@ yell.v yield.v yodel.v +zeroize.v diff -Nru link-grammar-5.3.16/data/en/words/words.v.2.2 link-grammar-5.5.0/data/en/words/words.v.2.2 --- link-grammar-5.3.16/data/en/words/words.v.2.2 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.2.2 2018-02-22 06:44:54.000000000 +0000 @@ -230,6 +230,7 @@ coprecipitates.v coprocesses.v 
copurifies.v +copy/pastes.v coregisters.v corrects.v correlates.v @@ -259,6 +260,8 @@ cross-checks.v crosschecks.v cross_fertilizes +cross-fertilizes.v +cross-links.v crosslinks.v cryosections.v crystallizes.v @@ -353,6 +356,7 @@ de-quenchs.v dequenchs.v derails.v +dereferences.v derives.v derogates.v de-saturates.v @@ -563,10 +567,12 @@ gnashes.v gnashs.v gnaws.v +googles.v governs.v granulates.v grasps.v grates.v +greps.v grieves.v groans.v gropes.v @@ -595,6 +601,7 @@ hemolyzes.v hems.v heterotransplants.v +histograms.v hoards.v hoes.v honks.v @@ -719,6 +726,7 @@ localizes.v locates.v lodges.v +lowercases.v luminesces.v lunches.v lunchs.v @@ -740,6 +748,7 @@ meets.v melds.v meliorates.v +memoizes.v mends.v merges.v meshes.v @@ -805,6 +814,7 @@ obsesses.v occludes.v off-loads.v +offsets.v ogles.v opacifies.v operates.v @@ -899,20 +909,22 @@ plates.v plats.v platts.v -pleases.v plunders.v poaches.v poachs.v poises.v polls.v poses.v +postposes.v postprocesses.v power-dives.v practises.v preaches.v preachs.v preactivates.v +preallocates.v precompiles.v +precomputes.v precontracts.v preincubates.v premieres.v @@ -969,6 +981,7 @@ rears.v reassembles.v reawakens.v +rebalances.v rebates.v rebleeds.v rebreathes.v @@ -986,6 +999,7 @@ reciprocates.v recolonizes.v recommences.v +recompiles.v recomputes.v reconfirms.v reconnects.v @@ -996,6 +1010,7 @@ recrosses.v recruits.v recrystallizes.v +recurses.v recurves.v reddens.v redifferentiates.v @@ -1031,6 +1046,7 @@ rehearses.v rehydrates.v re-ignites.v +reimplements.v reincorporates.v reincubates.v reinduces.v @@ -1060,6 +1076,7 @@ reoxidizes.v repacks.v repairs.v +reparses.v rephosphorylates.v repolarizes.v repossesss.v @@ -1068,6 +1085,7 @@ reproduces.v re-reads.v reruns.v +rescales.v reseeds.v resensitizes.v resettles.v @@ -1113,7 +1131,9 @@ saponifies.v scans.v scar_cicatrises +scar-cicatrises.v scar_cicatrizes +scar-cicatrizes.v scatters.v scissors.v scleroses.v @@ -1273,6 +1293,7 @@ thrums.v thumps.v thunders.v +thunks.v 
tickles.v tilts.v tines.v @@ -1308,10 +1329,15 @@ twirls.v ulcerates.v umpires.v +unapplys.v unbends.v +unbundles.v +uncapitalizes.v uncoats.v uncoils.v +uncomments.v uncurls.v +undefines.v underbids.v undercorrects.v underfeeds.v @@ -1319,6 +1345,7 @@ underreports.v underspends.v undresses.v +unexpands.v unfastens.v unfolds.v unfreezes.v @@ -1336,13 +1363,16 @@ unrolls.v unscrews.v unseals.v +unselects.v unstresses.v untangles.v +unterminates.v unties.v untwists.v unwinds.v unwraps.v uploads.v +uppercases.v urbanizes.v vaporizes.v varicellizes.v @@ -1392,3 +1422,4 @@ yells.v yields.v yodels.v +zeroizes.v diff -Nru link-grammar-5.3.16/data/en/words/words.v.2.3 link-grammar-5.5.0/data/en/words/words.v.2.3 --- link-grammar-5.3.16/data/en/words/words.v.2.3 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.2.3 2018-02-22 06:44:54.000000000 +0000 @@ -229,6 +229,7 @@ coprecipitated.v-d coprocessed.v-d copurified.v-d +copy/pasted.v-d coregistered.v-d corrected.v-d correlated.v-d @@ -256,6 +257,8 @@ cross-checked.v-d crosschecked.v-d cross_fertilized +cross-fertilized.v-d +cross-linked.v-d crosslinked.v-d cryosectioned.v-d crystallized.v-d @@ -351,6 +354,7 @@ de-quenched.v-d dequenched.v-d derailed.v-d +dereferenced.v-d derived.v-d derogated.v-d de-saturated.v-d @@ -555,10 +559,12 @@ glucosylated.v-d gnashed.v-d gnawed.v-d +googled.v-d governed.v-d granulated.v-d grasped.v-d grated.v-d +grepd.v-d grieved.v-d groaned.v-d groped.v-d @@ -571,8 +577,8 @@ hailed.v-d haploidized.v-d hard_coded -hard-coded.v -hardcoded.v +hard-coded.v-d +hardcoded.v-d hardened.v-d harmonized.v-d hatched.v-d @@ -587,6 +593,7 @@ hemocoagulated.v-d hemolyzed.v-d heterotransplanted.v-d +histogramd.v-d hoarded.v-d hoed.v-d honked.v-d @@ -706,6 +713,7 @@ localized.v-d located.v-d lodged.v-d +lowercased.v-d luminesced.v-d lunched.v-d luteinized.v-d @@ -725,6 +733,7 @@ mediated.v-d melded.v-d meliorated.v-d +memoized.v-d mended.v-d merged.v-d meshed.v-d @@ -790,6 +799,7 @@ 
obsessed.v-d occluded.v-d off-loaded.v-d +offset.v-d ogled.v-d opacified.v-d operated.v-d @@ -809,8 +819,8 @@ overclocked.v-d overclouded.v-d overdressed.v-d -overdrove.v-d overdriven.v-d +overdrove.v-d overfeeded.v-d overfilled.v-d overflew.v-d @@ -880,18 +890,20 @@ plasticized.v-d plated.v-d platted.v-d -pleased.v-d plundered.v-d poached.v-d poised.v-d polled.v-d posed.v-d +postposed.v-d postprocessed.v-d power-dived.v-d practised.v-d preached.v-d preactivated.v-d +preallocated.v-d precompiled.v-d +precomputed.v-d precontracted.v-d preincubated.v-d premiered.v-d @@ -949,6 +961,7 @@ rearrested.v-d reassembled.v-d reawakened.v-d +rebalanced.v-d rebated.v-d rebleeded.v-d rebreathed.v-d @@ -967,6 +980,7 @@ reciprocated.v-d recolonized.v-d recommenced.v-d +recompiled.v-d recomputed.v-d reconfirmed.v-d reconnected.v-d @@ -977,6 +991,7 @@ recrossed.v-d recruited.v-d recrystallized.v-d +recursed.v-d recurved.v-d reddened.v-d redifferentiated.v-d @@ -1013,6 +1028,7 @@ rehearsed.v-d rehydrated.v-d re-ignited.v-d +reimplemented.v-d reincorporated.v-d reincubated.v-d reinduced.v-d @@ -1041,6 +1057,7 @@ reoxidized.v-d repacked.v-d repaired.v-d +reparsed.v-d rephosphorylated.v-d repolarized.v-d repossessed.v-d @@ -1049,6 +1066,7 @@ reproduced.v-d re-readed.v-d reruned.v-d +rescaled.v-d reseeded.v-d resensitized.v-d resettled.v-d @@ -1095,7 +1113,9 @@ saponified.v-d scanned.v-d scar_cicatrised +scar-cicatrised.v-d scar_cicatrized +scar-cicatrized.v-d scattered.v-d scissored.v-d sclerosed.v-d @@ -1249,6 +1269,7 @@ thrummed.v-d thumped.v-d thundered.v-d +thunked.v-d tickled.v-d tilted.v-d tined.v-d @@ -1285,16 +1306,22 @@ twirled.v-d ulcerated.v-d umpired.v-d +unapplied.v-d unbended.v-d +unbundled.v-d +uncapitalized.v-d uncoated.v-d uncoiled.v-d +uncommented.v-d uncurled.v-d +undefined.v-d undercorrected.v-d underfeeded.v-d underprescribed.v-d underreported.v-d underspent.v-d undressed.v-d +unexpanded.v-d unfastened.v-d unfolded.v-d unfreezed.v-d @@ -1311,13 +1338,16 @@ 
unrolled.v-d unscrewed.v-d unsealed.v-d +unselected.v-d unstressed.v-d untangled.v-d +unterminated.v-d untied.v-d untwisted.v-d unwound.v-d unwrapped.v-d uploaded.v-d +uppercased.v-d urbanized.v-d vaporized.v-d varicellized.v-d @@ -1366,3 +1396,4 @@ yelled.v-d yielded.v-d yodelled.v-d +zeroized.v-d diff -Nru link-grammar-5.3.16/data/en/words/words.v.2.4 link-grammar-5.5.0/data/en/words/words.v.2.4 --- link-grammar-5.3.16/data/en/words/words.v.2.4 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.2.4 2018-02-22 06:44:54.000000000 +0000 @@ -226,6 +226,7 @@ coprecipitating.v coprocessing.v copurifying.v +copy/pasting.v coregistering.v correcting.v correlating.v @@ -255,6 +256,8 @@ cross-checking.v crosschecking.v cross_fertilizing +cross-fertilizing.v +cross-linking.v crosslinking.v cryosectioning.v crystallizing.v @@ -350,6 +353,7 @@ de-quenching.v dequenching.v derailing.v +dereferencing.v deriving.v derogating.v de-saturating.v @@ -556,10 +560,13 @@ glucosylating.v gnashing.v gnawing.v +googling.v governing.v granulating.v grasping.v grating.v +greping.v +grepping.v grieving.v groaning.v groping.v @@ -587,6 +594,7 @@ hemocoagulating.v hemolyzing.v heterotransplanting.v +histograming.v hoarding.v hoeing.v honking.v @@ -706,6 +714,7 @@ localizing.v locating.v lodging.v +lowercasing.v luminescing.v lunching.v luteinizing.v @@ -726,6 +735,7 @@ meeting.v melding.v meliorating.v +memoizing.v mending.v merging.v meshing.v @@ -754,7 +764,6 @@ mobilizing.v modelling.v moderating.v -modernizing.g modernizing.v moistening.v monospacing.v @@ -792,6 +801,7 @@ obsessing.v occluding.v off-loading.v +offsetting.v ogling.v opacifying.v operating.v @@ -883,18 +893,20 @@ plasticizing.v plating.v platting.v -pleasing.v plundering.v poaching.v poising.v polling.v posing.v +postposing.v postprocessing.v power-diving.v practising.v preaching.v preactivating.v +preallocating.v precompiling.v +precomputing.v precontracting.v preincubating.v premiering.v @@ 
-950,6 +962,7 @@ rearresting.v reassembling.v reawakening.v +rebalancing.v rebateing.v rebleeding.v rebreathing.v @@ -968,6 +981,7 @@ reciprocating.v recolonizing.v recommencing.v +recompiling.v recomputing.v reconfirming.v reconnecting.v @@ -977,6 +991,7 @@ recrossing.v recruiting.v recrystallizing.v +recursing.v recurving.v reddening.v redifferentiating.v @@ -1013,6 +1028,7 @@ rehearsing.v rehydrating.v re-igniting.v +reimplementing.v reincorporating.v reincubating.v reinducing.v @@ -1041,6 +1057,7 @@ reoxidizing.v repacking.v repairing.v +reparsing.v rephosphorylating.v repolarizing.v repossessing.v @@ -1049,6 +1066,7 @@ reproducing.v re-reading.v reruning.v +rescaling.v reseeding.v resensitizing.v resettling.v @@ -1095,7 +1113,9 @@ saponifying.v scanning.v scar_cicatrising +scar-cicatrising.v scar_cicatrizing +scar-cicatrizing.v scattering.v scissoring.v sclerosing.v @@ -1253,6 +1273,7 @@ thrumming.v thumping.v thundering.v +thunking.v tickling.v tilting.v tinging.v @@ -1289,10 +1310,15 @@ twirling.v ulcerating.v umpiring.v +unapplying.v unbending.v +unbundling.v +uncapitalizing.v uncoating.v uncoiling.v +uncommenting.v uncurling.v +undefining.v underbidding.v undercorrecting.v underfeeding.v @@ -1300,6 +1326,7 @@ underreporting.v underspending.v undressing.v +unexpanding.v unfastening.v unfolding.v unfreezing.v @@ -1316,13 +1343,16 @@ unrolling.v unscrewing.v unsealing.v +unselecting.v unstressing.v untangling.v +unterminating.v untwisting.v untying.v unwinding.v unwrapping.v uploading.v +uppercasing.v urbanizing.v vaporizing.v varicellizing.v @@ -1372,3 +1402,4 @@ yelling.v yielding.v yodelling.v +zeroizing.v diff -Nru link-grammar-5.3.16/data/en/words/words.v.2.5 link-grammar-5.5.0/data/en/words/words.v.2.5 --- link-grammar-5.3.16/data/en/words/words.v.2.5 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.2.5 2018-02-22 06:44:54.000000000 +0000 @@ -226,6 +226,7 @@ coprecipitating.g coprocessing.g copurifying.g +copy/pasting.g 
coregistering.g correcting.g correlating.g @@ -255,6 +256,8 @@ cross-checking.g crosschecking.g cross_fertilizing +cross-fertilizing.g +cross-linking.g crosslinking.g cryosectioning.g crystallizing.g @@ -350,6 +353,7 @@ de-quenching.g dequenching.g derailing.g +dereferencing.g deriving.g derogating.g de-saturating.g @@ -557,10 +561,13 @@ glucosylating.g gnashing.g gnawing.g +googling.g governing.g granulating.g grasping.g grating.g +greping.g +grepping.g grieving.g groaning.g groping.g @@ -588,6 +595,7 @@ hemocoagulating.g hemolyzing.g heterotransplanting.g +histograming.g hoarding.g hoeing.g honking.g @@ -709,6 +717,7 @@ locating.g lodging.g logging.g +lowercasing.g luminescing.g lunching.g luteinizing.g @@ -729,6 +738,7 @@ meeting.g melding.g meliorating.g +memoizing.g mending.g merging.g meshing.g @@ -792,6 +802,7 @@ obsessing.g occluding.g off-loading.g +offsetting.g ogling.g opacifying.g operating.g @@ -883,18 +894,20 @@ plasticizing.g plating.g platting.g -pleasing.g plundering.g poaching.g poising.g polling.g posing.g +postposing.g postprocessing.g power-diving.g practising.g preaching.g preactivating.g +preallocating.g precompiling.g +precomputing.g precontracting.g preincubating.g premiering.g @@ -950,6 +963,7 @@ rearresting.g reassembling.g reawakening.g +rebalancing.g rebateing.g rebleeding.g rebreathing.g @@ -968,6 +982,7 @@ reciprocating.g recolonizing.g recommencing.g +recompiling.g recomputing.g reconfirming.g reconnecting.g @@ -977,6 +992,7 @@ recrossing.g recruiting.g recrystallizing.g +recursing.g recurving.g reddening.g redifferentiating.g @@ -1013,6 +1029,7 @@ rehearsing.g rehydrating.g re-igniting.g +reimplementing.g reincorporating.g reincubating.g reinducing.g @@ -1041,6 +1058,7 @@ reoxidizing.g repacking.g repairing.g +reparsing.g rephosphorylating.g repolarizing.g repossessing.g @@ -1049,6 +1067,7 @@ reproducing.g re-reading.g reruning.g +rescaling.g reseeding.g resensitizing.g resettling.g @@ -1095,7 +1114,9 @@ saponifying.g scanning.g 
scar_cicatrising +scar-cicatrising.g scar_cicatrizing +scar-cicatrizing.g scattering.g scissoring.g sclerosing.g @@ -1255,6 +1276,7 @@ thrumming.g thumping.g thundering.g +thunking.g tickling.g tilting.g tinging.g @@ -1291,10 +1313,15 @@ twirling.g ulcerating.g umpiring.g +unapplying.g unbending.g +unbundling.g +uncapitalizing.g uncoating.g uncoiling.g +uncommenting.g uncurling.g +undefining.g underbidding.g undercorrecting.g underfeeding.g @@ -1302,6 +1329,7 @@ underreporting.g underspending.g undressing.g +unexpanding.g unfastening.g unfolding.g unfreezing.g @@ -1318,13 +1346,16 @@ unrolling.g unscrewing.g unsealing.g +unselecting.g unstressing.g untangling.g +unterminating.g untwisting.g untying.g unwinding.g unwrapping.g uploading.g +uppercasing.g urbanizing.g vaporizing.g varicellizing.g @@ -1374,3 +1405,4 @@ yelling.g yielding.g yodelling.g +zeroizing.g diff -Nru link-grammar-5.3.16/data/en/words/words.v.4.1 link-grammar-5.5.0/data/en/words/words.v.4.1 --- link-grammar-5.3.16/data/en/words/words.v.4.1 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.4.1 2018-02-12 21:44:27.000000000 +0000 @@ -458,7 +458,6 @@ criticize.v crossbreed.v cross-examine.v -cross-fertilize.v cross-index.v cross-question.v crucify.v @@ -693,7 +692,6 @@ divert.v divest.v divorce.v -divulge.v dizzy.v doctor.v doff.v @@ -1163,7 +1161,7 @@ indenture.v index.v indict.v -individuate.n +individuate.v indoctrinate.v induct.v infatuate.v diff -Nru link-grammar-5.3.16/data/en/words/words.v.4.2 link-grammar-5.5.0/data/en/words/words.v.4.2 --- link-grammar-5.3.16/data/en/words/words.v.4.2 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.4.2 2018-02-22 06:44:54.000000000 +0000 @@ -457,7 +457,6 @@ criticizes.v crossbreeds.v cross-examines.v -cross-fertilizes.v cross-indexes.v cross-questions.v crucifies.v @@ -691,7 +690,6 @@ diverts.v divests.v divorces.v -divulges.v dizzies.v doctors.v doffs.v @@ -1459,7 +1457,6 @@ occupies.v 
offends.v offloads.v -offsets.v oils.v okays.v O.K.'s.v diff -Nru link-grammar-5.3.16/data/en/words/words.v.4.3 link-grammar-5.5.0/data/en/words/words.v.4.3 --- link-grammar-5.3.16/data/en/words/words.v.4.3 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.4.3 2018-02-12 21:36:25.000000000 +0000 @@ -455,7 +455,6 @@ criticized.v-d crossbred.v-d cross-examined.v-d -cross-fertilized.v-d cross-indexed.v-d cross-questioned.v-d crucified.v-d @@ -687,7 +686,6 @@ diverted.v-d divested.v-d divorced.v-d -divulged.v-d dizzied.v-d doctored.v-d doffed.v-d @@ -1905,7 +1903,7 @@ reproved.v-d repudiated.v-d repulsed.v-d -repurchased.vv-d +repurchased.v-d repurposed.v-d reputed.v-d requisitioned.v-d diff -Nru link-grammar-5.3.16/data/en/words/words.v.4.4 link-grammar-5.5.0/data/en/words/words.v.4.4 --- link-grammar-5.3.16/data/en/words/words.v.4.4 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.4.4 2018-02-22 06:44:54.000000000 +0000 @@ -458,7 +458,6 @@ criticizing.v crossbreeding.v cross-examining.v -cross-fertilizing.v cross-indexing.v cross-questioning.v crucifying.v @@ -569,6 +568,7 @@ denominating.v denoting.v denouncing.v +denting.v denuding.v deodorizing.v depleting.v @@ -690,7 +690,6 @@ diverting.v divesting.v divorcing.v -divulging.v dizzying.v doctoring.v doffing.v @@ -1450,7 +1449,6 @@ occupying.v offending.v offloading.v -offsetting.v oiling.v okaying.v O.K.'ing.v diff -Nru link-grammar-5.3.16/data/en/words/words.v.4.5 link-grammar-5.5.0/data/en/words/words.v.4.5 --- link-grammar-5.3.16/data/en/words/words.v.4.5 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.4.5 2018-02-22 06:44:54.000000000 +0000 @@ -458,7 +458,6 @@ criticizing.g crossbreeding.g cross-examining.g -cross-fertilizing.g cross-indexing.g cross-questioning.g crucifying.g @@ -570,7 +569,6 @@ denoting.g denouncing.g denting.g -denting.v denuding.g deodorizing.g depleting.g @@ -692,7 +690,6 @@ diverting.g 
divesting.g divorcing.g -divulging.g dizzying.g doctoring.g doffing.g @@ -919,7 +916,7 @@ filleting.g finalizing.g financing.g -fine-tuning. +fine-tuning.g fingering.g firebombing.g flagellating.g @@ -1453,7 +1450,6 @@ occupying.g offending.g offloading.g -offsetting.g oiling.g okaying.g O.K.'ing.g diff -Nru link-grammar-5.3.16/data/en/words/words.v.5.4 link-grammar-5.5.0/data/en/words/words.v.5.4 --- link-grammar-5.3.16/data/en/words/words.v.5.4 2015-01-12 03:55:03.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.5.4 2018-02-10 21:26:48.000000000 +0000 @@ -10,7 +10,6 @@ cawwing.v clamming.v clowning.v -coming.v crapping.v crawling.v creeping.v @@ -26,7 +25,7 @@ fading.v fizzling.v flocking.v -flowing +flowing.v foaming.v ganging.v gearing.v @@ -45,7 +44,7 @@ looming.v lunging.v lurching.v -lying +lying.v mellowing.v nestling.v opting.v @@ -53,7 +52,7 @@ peering.v perking.v petering.v -pigging.g +pigging.v pining.v plonking.v plopping.v diff -Nru link-grammar-5.3.16/data/en/words/words.v.6.2 link-grammar-5.5.0/data/en/words/words.v.6.2 --- link-grammar-5.3.16/data/en/words/words.v.6.2 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.6.2 2018-02-12 21:45:58.000000000 +0000 @@ -239,7 +239,7 @@ mucks.v nods.v oozes.v -opens +opens.v options.v overthrows.v packs.v diff -Nru link-grammar-5.3.16/data/en/words/words.v.6.3 link-grammar-5.5.0/data/en/words/words.v.6.3 --- link-grammar-5.3.16/data/en/words/words.v.6.3 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.6.3 2018-02-10 23:23:46.000000000 +0000 @@ -34,7 +34,7 @@ bored.v-d bounced.v-d bounded.v-d -bound.w +bound.v-d bowled.v-d boxed.v-d branched.v-d @@ -51,6 +51,7 @@ bumped.v-d bunched.v-d burned.v-d +burnt.v-d busted.v-d butted.v-d buttoned.v-d @@ -66,7 +67,7 @@ changed.v-d chatted.v-d cheated.v-d -checked +checked.v-d cheered.v-d cheesed.v-d chewed.v-d @@ -196,7 +197,7 @@ ironed.v-d jammed.v-d jerked.v-d -joined +joined.v-d jumbled.v-d 
jumped.v-d kayaked.v-d @@ -247,7 +248,7 @@ partied.v-d peeed.v-d peeled.v-d -piled +piled.v-d pissed.v-d played.v-d ploughed.v-d @@ -419,6 +420,6 @@ wiped.v-d won.v-d worked.v-d -wound.w +wound.v-d wrapped.v-d zeroed.v-d diff -Nru link-grammar-5.3.16/data/en/words/words.v.8.2 link-grammar-5.5.0/data/en/words/words.v.8.2 --- link-grammar-5.3.16/data/en/words/words.v.8.2 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/en/words/words.v.8.2 2018-02-12 21:46:31.000000000 +0000 @@ -10,7 +10,7 @@ bottles.v bundles.v cages.v -carries +carries.v carts.v chalks.v chases.v @@ -21,7 +21,7 @@ cuts.v dams.v dishes.v -divides +divides.v doles.v dredges.v dumps.v @@ -57,7 +57,7 @@ nails.v palms.v patches.v -paves +paves.v pens.v phases.v phones.v diff -Nru link-grammar-5.3.16/data/fa/4.0.dict link-grammar-5.5.0/data/fa/4.0.dict --- link-grammar-5.3.16/data/fa/4.0.dict 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/fa/4.0.dict 2018-04-27 00:49:11.000000000 +0000 @@ -167,7 +167,7 @@ : {Wd-}; -/fa/words/words.n /fa/words/words.proper UNKNOWN-WORD.n : % Nouns, normal +/fa/words/words.n /fa/words/words.proper .n : % Nouns, normal ( % Nouns in prepositional phrases {} & {} @@ -234,7 +234,7 @@ "i.nms" "|i.nms": % Noun morphemes, indefinite suffix NMSi-; -/fa/words/words.n.compound /fa/words/words.adj.compound UNKNOWN-WORD.nk: % Noun component of compound verbs +/fa/words/words.n.compound /fa/words/words.adj.compound .nk: % Noun component of compound verbs %? 
( % Rule out non-compounding possiblities, like 'yek jAru zad' or 'jAru rA zad' % % & @@ -397,7 +397,7 @@ & {[(Wi- or C- or CC-)]} ; -/fa/words/words.v UNKNOWN-WORD.v: +/fa/words/words.v .v: ( % For Non-Future tense (Normal) & {@AV-} % Adverbs can occur almost anywhere, at a phrasal level @@ -677,7 +677,7 @@ : (CCAPR- or CCAPl+); -%/fa/words/words.adj UNKNOWN-WORD.adj: % Adjectives, normal +%/fa/words/words.adj .adj: % Adjectives, normal /fa/words/words.adj: % Adjectives, normal ( { AJM+ } diff -Nru link-grammar-5.3.16/data/fa/Makefile.in link-grammar-5.5.0/data/fa/Makefile.in --- link-grammar-5.3.16/data/fa/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/data/fa/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -217,6 +217,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -245,6 +246,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -253,13 +255,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -326,12 +331,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru 
link-grammar-5.3.16/data/fa/words/Makefile.in link-grammar-5.5.0/data/fa/words/Makefile.in --- link-grammar-5.3.16/data/fa/words/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/data/fa/words/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/he/4.0.dict link-grammar-5.5.0/data/he/4.0.dict --- link-grammar-5.3.16/data/he/4.0.dict 2017-01-27 22:04:07.000000000 +0000 +++ link-grammar-5.5.0/data/he/4.0.dict 2018-04-27 00:49:17.000000000 +0000 @@ -562,4 +562,4 @@ % With the following line in the dictionary, the parser will simply % skip over (null-link) unknown words. If you remove it, the parser % will output an error for any unknown words. 
-UNKNOWN-WORD: XXX+; +: XXX+; diff -Nru link-grammar-5.3.16/data/he/Makefile.in link-grammar-5.5.0/data/he/Makefile.in --- link-grammar-5.3.16/data/he/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/data/he/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/id/4.0.dict link-grammar-5.5.0/data/id/4.0.dict --- link-grammar-5.3.16/data/id/4.0.dict 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/id/4.0.dict 2018-04-27 00:49:21.000000000 +0000 @@ -7,14 +7,14 @@ % % %***************************************************************************% -% Dictionary version number is 5.2.1 (formatted as V5v2v1+) -: V5v2v1+; +% Dictionary version number is 5.4.5 (formatted as V5v4v5+) +: V5v4v5+; : ID4id+; anjing 
kucing wanita cewek pria cowok lelaki laki-laki taman lapangan tulang tetangga toko jalan burung palu hidung pesta teman rumah film sinetron kakak adik saudara saudari warung murid tes ujian: - {@A-} & Ds- & {@M+ or (R+ & Bs+)} & + {@A-} & Ds- & {@M+ or (R+ & Bs+)} & (J- or Os- or (Ss+ & (({@CO-} & {C-}) or R-)) or SIs-); anjing-anjing kucing-kucing cewek-cewek wanita-wanita pria-pria cowok-cowok @@ -24,100 +24,100 @@ saudara-saudara saudari-saudari saudara-saudari warung-warung murid-murid tes-tes ujian-ujian perang-perang "musim-dingin-musim-dingin" aksi-aksi hukum-hukum kesuksesan-kesuksesan: -{@A-} & {Dmc-} & {@M+ or (R+ & Bp+)} & +{@A-} & {Dmc-} & {@M+ or (R+ & Bp+)} & (J- or Op- or (Sp+ & (({@CO-} & {C-}) or R-)) or SIp-); air kemarahan uang politik masalah: -{@A-} & {Dmu-} & {@M+ or (R+ & Bs+)} & +{@A-} & {Dmu-} & {@M+ or (R+ & Bs+)} & (J- or Os- or (Ss+ & (({@CO-} & {C-}) or R-)) or SIs-); hukum "musim-dingin" aksi perang sukses: -{@A-} & {D*u-} & {@M+ or (R+ & Bs+)} & +{@A-} & {D*u-} & {@M+ or (R+ & Bs+)} & (J- or Os- or (Ss+ & (({@CO-} & {C-}) or R-)) or SIs-); CAPITALIZED-WORDS: J- or O- or (S+ & (({@CO-} & {C-}) or R-)) or SI-; -dia ia: (Ss+ & (({@CO-} & {C-}) or R-)) or SIs-; -aku saya dia mereka kami kita: J- or O-; -dia ia: D+ or J- or O-; -saya aku kamu mereka kami kita: D+; -dia ia: D+; - -kamu mereka kami kita saya aku: J- or O- or (Sp+ & (({@CO-} & {C-}) or R-)) or SIp-; -dia ia: J- or O- or (Ss+ & (({@CO-} & {C-}) or R-)) or SIs-; - -ini: (J- or O- or (Ss+ & (({@CO-} & {C-}) or R-)) or SIs-) or D*u+; -ini: (J- or O- or (Sp+ & (({@CO-} & {C-}) or R-)) or SIp-) or Dmc+; -itu: (Dmc+) or (({P+} or {{C+} & Bp+}) & +dia.1 ia.1: (Ss+ & (({@CO-} & {C-}) or R-)) or SIs-; +aku.1 saya.1 dia.2 mereka.1 kami.1 kita.1: J- or O-; +dia.3 ia.2: D+ or J- or O-; +saya.2 aku.2 kamu.3 mereka.2 kami.2 kita.2: D+; +dia.4 ia.3: D+; + +kamu.4 mereka.3 kami.3 kita.3 saya.3 aku.3: J- or O- or (Sp+ & (({@CO-} & {C-}) or R-)) or SIp-; +dia.5 ia: J- or O- or (Ss+ & (({@CO-} 
& {C-}) or R-)) or SIs-; + +ini.1: (J- or O- or (Ss+ & (({@CO-} & {C-}) or R-)) or SIs-) or D*u+; +ini.2: (J- or O- or (Sp+ & (({@CO-} & {C-}) or R-)) or SIp-) or Dmc+; +itu.2: (Dmc+) or (({P+} or {{C+} & Bp+}) & (J- or O- or (Sp+ & (({@CO-} & {C-}) or R-)) or SIp- or Xb-)); -sebuah si sang: D+; -sebuah: Ds+; +sebuah.1 si sang: D+; +sebuah.2: Ds+; -melakukan berbuat: ({Q-} & SI+ & I+) or ({@E-} & (S- or +melakukan.1 berbuat.1: ({Q-} & SI+ & I+) or ({@E-} & (S- or (RS- & B-)) & (((B- or O+) & {@MV+}) or I+)); -melakukan berbuat: (SIp+ & I+) or ({@E-} & (Sp- or +melakukan.2 berbuat.2: (SIp+ & I+) or ({@E-} & (Sp- or (RS- & Bp-) or I-) & (((B- or O+) & {@MV+}) or I+)); -melakukan berbuat: ({Q-} & SIs+ & I+) or ({@E-} & (Ss- or (RS- & Bs-)) & +melakukan.3 berbuat.3: ({Q-} & SIs+ & I+) or ({@E-} & (Ss- or (RS- & Bs-)) & (((B- or O+) & {@MV+}) or I+)); -"telah-melakukan": {@E-} & (Pv- or M- or (PP- & (B- or O+) & {@MV+})); -"sedang-melakukan": {@E-} & (Pg- or Mg-) & (O+ or B-) & {@MV+}; +"telah-melakukan.1": {@E-} & (Pv- or M- or (PP- & (B- or O+) & {@MV+})); +"sedang-melakukan.2": {@E-} & (Pg- or Mg-) & (O+ or B-) & {@MV+}; -telah sudah: ({Q-} & SIs+ & PP+) or ({@E-} & (Ss- or (RS- & B-)) & +telah.1 sudah.1: ({Q-} & SIs+ & PP+) or ({@E-} & (Ss- or (RS- & B-)) & (TO+ or ((B- or O+) & {@MV+}) or PP+)); -telah sudah: ({Q-} & SIp+ & PP+) or ({@E-} & (Sp- or +telah.2 sudah.2: ({Q-} & SIp+ & PP+) or ({@E-} & (Sp- or (RS- & Bp-) or I-) & (TO+ or ((B- or O+) & {@MV+}))); -telah sudah: ({Q-} & SI+ & PP+) or ({@E-} & (S- or (RS- & B-) or PP-) & +telah.3 sudah.3: ({Q-} & SI+ & PP+) or ({@E-} & (S- or (RS- & B-) or PP-) & (TO+ or ((B- or O+) & {@MV+}) or PP+)); -telah sudah: {@E-} & (M- or Pg-) & (TO+ or ((B- or O+) & {@MV+}) or PP+); +telah.4 sudah.4: {@E-} & (M- or Pg-) & (TO+ or ((B- or O+) & {@MV+}) or PP+); -adalah ialah yaitu: ((Ss- or (RS- & Bs-) or ({Q-} & SIs+)) +adalah.1 ialah.1 yaitu.1: ((Ss- or (RS- & Bs-) or ({Q-} & SIs+)) & (((O+ or B-) & {@MV+}) or P+ or AF-)); 
-adalah ialah yaitu: ((Sp- or (RS- & Bp-) or ({Q-} & +adalah.2 ialah.2 yaitu.2: ((Sp- or (RS- & Bp-) or ({Q-} & SIp+)) & (((O+ or B-) & {@MV+}) or P+ or AF-)); -adalah ialah yaitu: I- & (((O+ or B-) & {@MV+}) or P+ or AF-); -adalah ialah yaitu: PP- & (((O+ or B-) & {@MV+}) or P+ or AF-); -adalah ialah yaitu: {@E-} & (M- or Pg-) & (((O+ or B-) & {@MV+}) or P+ or AF-); +adalah.3 ialah.3 yaitu.3: I- & (((O+ or B-) & {@MV+}) or P+ or AF-); +adalah.4 ialah.4 yaitu.4: PP- & (((O+ or B-) & {@MV+}) or P+ or AF-); +adalah.5 ialah.5 yaitu.5: {@E-} & (M- or Pg-) & (((O+ or B-) & {@MV+}) or P+ or AF-); akan dapat.v bisa.v harus seharusnya sebaiknya mungkin: (({Q-} & SI+) or S- or (RS- & B-)) & I+; -lari datang: {@E-} & (Sp- or (RS- & Bp-) or I- or W- or PP-) & {@MV+}; -lari datang pergi: {@E-} & (Ss- or (RS- & Bs-)) & {@MV+}; -lari datang pergi: {@E-} & (S- or (RS- & B-)) & {@MV+}; -pergi: {@E-} & (Sp- or (RS- & Bp-) or I-) & {@MV+}; -pergi: {@E-} & PP- & {@MV+}; -pergi: {@E-} & (Pg- or M-) & {TO+} & {@MV+}; -lari datang: {@E-} & (Pg- or M-) & {@MV+}; +lari.1 datang.1: {@E-} & (Sp- or (RS- & Bp-) or I- or W- or PP-) & {@MV+}; +lari.2 datang.2 pergi.1: {@E-} & (Ss- or (RS- & Bs-)) & {@MV+}; +lari.3 datang.3 pergi.2: {@E-} & (S- or (RS- & B-)) & {@MV+}; +pergi.3: {@E-} & (Sp- or (RS- & Bp-) or I-) & {@MV+}; +pergi.4: {@E-} & PP- & {@MV+}; +pergi.5: {@E-} & (Pg- or M-) & {TO+} & {@MV+}; +lari.4 datang.4: {@E-} & (Pg- or M-) & {@MV+}; -bicara.v berbicara.v ngomong.v datang mati meninggal "meninggal-dunia" wafat tewas: +bicara.1.v.1 berbicara.1.v.1 ngomong.1.v.1 datang.5 mati.1 meninggal.1 "meninggal-dunia.1" wafat.1 tewas.1: {@E-} & (Sp- or (RS- & Bp-) or I-) & {@MV+}; -bicara.v berbicara.v ngomong.v datang mati meninggal "meninggal-dunia" wafat tewas: +bicara.2v berbicara.2v ngomong.2v datang.6 mati.2 meninggal.2 "meninggal-dunia.2" wafat.2 tewas.2: {@E-} & (Ss- or (RS- & Bs-)) & {@MV+}; -bicara.v berbicara.v ngomong.v datang mati meninggal "meninggal-dunia" wafat tewas: 
+bicara.3v berbicara.4v ngomong.3v datang.7 mati.3 meninggal.3 "meninggal-dunia.3" wafat.3 tewas.3: {@E-} & (S- or (RS- & B-) or PP-) & {@MV+}; -bicara.v berbicara.v ngomong.v datang berdatangan mati meninggal "meninggal-dunia" wafat tewas: +bicara.4v berbicara.4. ngomong.4v datang.8 berdatangan mati.4 meninggal.4 "meninggal-dunia.4" wafat.4 tewas.4: {@E-} & (Pg- or M-) & {@MV+}; -lihat melihat ketemu bertemu menemui kejar mengejar undang mengundang menangkap: +lihat.1 melihat.1 ketemu.1 bertemu.1 menemui.1 kejar.1 mengejar.1 undang.1 mengundang.1 menangkap.1: {@E-} & (Sp- or (RS- & Bp-) or I-) & (O+ or B-) & {@MV+}; -lihat melihat ketemu bertemu menemui kejar mengejar undang mengundang menangkap: +lihat.2 melihat.2 ketemu.2 bertemu.2 menemui.2 kejar.2 mengejar.2 undang.2 mengundang.2 menangkap.2: {@E-} & (Ss- or (RS- & Bs-)) & (O+ or B-) & {@MV+}; -lihat melihat ketemu bertemu menemui kejar mengejar undang mengundang menangkap: +lihat.3 melihat.3 ketemu.3 bertemu.3 menemui.3 kejar.3 mengejar.3 undang.3 mengundang.3 menangkap.3: {@E-} & (M- or Pv- or ((S- or (RS- & B-) or PP-) & (B- or O+))) & {@MV+}; -lihat melihat: {@E-} & (S- or (RS- & B-)) & (B- or O+) & {@MV+}; -lihat melihat: {@E-} & (Pv- or M- or (PP- & (B- or O+))) & {@MV+}; -lihat melihat ketemu bertemu menemui kejar mengejar undang mengundang menangkap: +lihat.4 melihat.4: {@E-} & (S- or (RS- & B-)) & (B- or O+) & {@MV+}; +lihat.5 melihat.5: {@E-} & (Pv- or M- or (PP- & (B- or O+))) & {@MV+}; +lihat.6 melihat.6 ketemu.4 bertemu.4 menemui.4 kejar.4 mengejar.4 undang.4 mengundang.4 menangkap.4: {@E-} & (Pg- or M-) & (O+ or B-) & {@MV+}; -bilang memberitahu menyampaikan: {@E-} & (Sp- or (RS- & Bp-) or I-) & ((O+ or B-) & +bilang.1 memberitahu.1 menyampaikan.1: {@E-} & (Sp- or (RS- & Bp-) or I-) & ((O+ or B-) & {TH+ or C+ or QI+ or @MV+}); -bilang memberitahu menyampaikan: {@E-} & (Ss- or (RS- & Bs-)) & ((O+ or B-) & {TH+ or C+ +bilang.2 memberitahu.2 menyampaikan.2: {@E-} & (Ss- or (RS- & Bs-)) & ((O+ or B-) 
& {TH+ or C+ or QI+ or @MV+}); -bilang memberitahu menyampaikan: {@E-} & (M- or Pv- or ((S- or (RS- & B-) or PP-) & (O+ or B-))) & +bilang.3 memberitahu.3 menyampaikan.3: {@E-} & (M- or Pv- or ((S- or (RS- & B-) or PP-) & (O+ or B-))) & {TH+ or C+ or QI+ or @MV+}; -bilang memberitahu menyampaikan: {@E-} & (Pg- or M-) & ((O+ or B-) & {TH+ or C+ or QI+ or +bilang.4 memberitahu.4 menyampaikan.4: {@E-} & (Pg- or M-) & ((O+ or B-) & {TH+ or C+ or QI+ or @MV+}); "akhir-akhir-ini" "barusan" "baru-saja" kadang-kadang "sebentar-lagi" "sesaat-lagi" @@ -125,30 +125,30 @@ sekarang "sedihnya" umumnya: E+ or MV-; -dari sambil di ke "di-belakang" antara "di-antara" "di-bawah" "di-atas" +dari sambil di ke.1 "di-belakang" antara "di-antara" "di-bawah" "di-atas" tanpa untuk "di-seberang" "ke-atas" "ke-bawah" sembari sekitar -"ke-dalam" "di-dalam" sekira oleh: +"ke-dalam" "di-dalam" sekira oleh.1: J+ & (Mp- or MV- or Pp-); -oleh: J+ & Mp-; +oleh.2: J+ & Mp-; "di-sini" "di-sana": MV- or Mp- or Pp-; -yang: (C+ & TH-) or Ds+ or (R- & (C+ or RS+)) or SIs- or (Ss+ & {{@CO-} & {C-}}) or J- or O-; +yang.1: (C+ & TH-) or Ds+ or (R- & (C+ or RS+)) or SIs- or (Ss+ & {{@CO-} & {C-}}) or J- or O-; -ke: (I+ & TO-) or ((MV- or Mp- or Pp-) & J+); +ke.2: (I+ & TO-) or ((MV- or Mp- or Pp-) & J+); siapa: (R- & (C+ or RS+)) or S+ or B+; apa: S+ or B+; -yang "di-mana": (R- & (C+ or RS+)) or S+ or B-; +yang.2 "di-mana.1": (R- & (C+ or RS+)) or S+ or B-; -karena kecuali meski tapi tetapi namun meskipun "meski-pun" walau walaupun "walau-pun": +karena kecuali meski.1 tapi.1 tetapi.1 namun.1 meskipun.1 "meski-pun.1" walau.1 walaupun.1 "walau-pun.1": (C+ & (({Xc+} & CO+) or MV-)); setelah sebelum sesudah sejak sampai: (C+ or J+) & (({Xc+} & CO+) or MV- or Mp-); jika bila andai: C+ & (({Xc+} & CO+) or MV-); kapan: (QI- & C+) or Q+ or (C+ & (({Xc+} & CO+) or MV-)); -"di-mana":(QI- & C+) or Q+; +"di-mana.2":(QI- & C+) or Q+; bagaimana gimana: (QI- & (C+ or EA+)) or Q+ or EA+; cepat lambat tangkas pendek panjang 
hitam putih besar kecil cantik tampan rupawan @@ -161,7 +161,7 @@ sangat amat: EA+; sekali banget bingit: EA-; -tapi tetapi namun meski meskipun "meski-pun" walau walaupun "walau-pun" dan: MV- & C+; +tapi.2 tetapi.2 namun.2 meski.2 meskipun.2 "meski-pun.2" walau.2 walaupun.2 "walau-pun.2" dan: MV- & C+; ",": Xc-; diff -Nru link-grammar-5.3.16/data/id/4.0.regex link-grammar-5.5.0/data/id/4.0.regex --- link-grammar-5.3.16/data/id/4.0.regex 2014-12-28 00:44:46.000000000 +0000 +++ link-grammar-5.5.0/data/id/4.0.regex 2018-04-27 00:51:04.000000000 +0000 @@ -6,119 +6,6 @@ % % %***************************************************************************% -% This file contains regular expressions that are used to match -% tokens not found in the dictionary. Each regex is given a name which -% determines the disjuncts assigned when the regex matches; this name -% must be defined in the dictionary along with the appropriate disjuncts. -% Note that the order of the regular expressions matters: matches will -% be attempted in the order in which the regexs appear in this file, -% and only the first match will be used. - -% Numbers. -% XXX, we need to add utf8 U+00A0 "no-break space" -% -% Allows at most two colons in hour-muinute-second HH:MM:SS expressions -% Allows at most two digits between colons -HMS-TIME: /^[0-9][0-9]?(:[0-9][0-9]?(:[0-9][0-9]?)?)?(AM|PM|am|pm)?$/ - -% e.g. 1950's leading number can be higher, for science fiction. -% Must be four digits, or possible three. Must end in s, 's ’s -DECADE-DATE: /^([1-4][0-9][0-9]|[1-9][0-9])0(s|'s|’s)$/ - -% Similar to above, but does not end in s. Only allows four digits. -% We process this before NUMBERS below, so that this is matched first. -YEAR-DATE: /^([1-4][0-9]{3}|[1-9][0-9]{0,2})$/ - -% Day-of-month names; this regex will match before the one below. -DAY-ORDINALS: /^(1st|2nd|3rd|[4-9]th|1[0-9]th|2(0th|1st|2nd|3rd|[4-9]th)|30th|31st)$/ - -% Ordinal numbers; everything except 1st through 13th -% is handled by regex. 
-ORDINALS: /^[1-9][0-9]*(0th|1st|2nd|3rd|[4-9]th)$/ - -% Allows any number of commas or periods -% Be careful not match the period at the end of a sentence; -% for example: "It happened in 1942." -NUMBERS: /^[0-9,.]*[0-9]$/ -% This parses signed numbers and ranges, e.g. "-5" and "5-10" and "9+/-6.5" -NUMBERS: /^[0-9.,-]*[0-9](\+\/-[0-9.,-]*[0-9])?$/ -% Parses simple fractions e.g. "1/60" with no decimal points or anything fancy -FRACTION: /^[0-9]+\/[0-9]+$/ -% "10(3)" exponent (used in PubMed) -NUMBERS: /^[0-9.,-]*[0-9][0-9.,-]*\([0-9:.,-]*[0-9][0-9.,-]*\)$/ - -% Roman numerals -% The first expr has the problem that it matches an empty string. The -% cure for this is to use look-ahead, but neither the Gnu nor the BSD -% regex libs support look-ahead. I cant' think of a bettter solution. -ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/ -% ROMAN-NUMERAL-WORDS: /^(?=(M|C|D|L|X|V|I)+)M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/ -% ROMAN-NUMERAL-WORDS: /^(?=.+)M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/ - -% Strings of initials. e.g "Dr. J.G.D. Smith lives on Main St." -% Make it at least two letters long, as otherwise it clobbers -% single-letter handling in the dict, which is different. -INITIALS: /^[A-Z]\.([A-Z]\.)+$/ - -% Greek letters with numbers -GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\-?[0-9]+$/ -PL-GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)s\-?[0-9]+$/ - -% Some "safe" derived units. Simple units are in dictionary. 
-% The idea here is for the regex to match something that is almost -% certainly part of a derived unit, and allow the rest to be -% anything; this way we can capture difficult derived units such -% as "mg/kg/day" and even oddities such as "micrograms/mouse/day" -% without listing them explicitly. -% TODO: add more. -% Some (real) misses from these: -% micrograms.kg-1.h-1 microM-1 J/cm2 %/day mN/m cm/yr -% m/s days/week ml/s degrees/sec cm/sec cm/s mm/s N/mm (is that a unit?) -% cuts/minute clicks/s beats/minute x/week W/kg/W %/patient-year -% microIU/ml degrees/s counts/mm2 cells/mm3 tumors/mouse -% mm/sec ml/hr mJ/cm(2) m2/g amol/mm2 animals/group -% h-1 min-1 day-1 cm-1 mg-1 kg-1 mg.m-2.min-1 ms.cm-1 g-1 -% sec-1 ms-1 ml.min.-1kg-1 ml.hr-1 -% also, both kilometer and kilometers seem to be absent(!) -% remember "mm"! - -% grams/anything -UNITS: /^([npmk]|milli|micro|nano|pico|femto|atto|kilo|mega|tera)?(g|grams?)\// - -% mol/anything -UNITS: /^([fnmp]|milli|micro|nano|pico|femto|atto|mu)?mol(es)?\// - -% common endings -UNITS: /^[a-zA-Z\/.]+\/((m|micro)?[lLg]|mg|kg|mol|min|day|h|hr)$/ - -% common endings, except in the style "mg.kg-1" instead of "mg/kg". -UNITS: /^[a-zA-Z\/.1-]+\.((m|micro)?[lLg]|mg|kg|mol|min|day|h|hr)(-1|\(-1\))$/ - -% combinations of numbers and units, e.g. "50-kDa", "1-2h" -% TODO: Clean up and check that these are up-to-date wrt the -% dictionary-recognized units; this is quite a mess currently. -% TODO: Extend the "number" part of the regex to allow anything -% that the NUMBER regex matches. -% One problem here is a failure to split up the expression ... -% e.g. "2hr" becomes 2 - ND - hr with the ND link. 
But 2-hr is treated -% as a single word ('I is a 2-hr wait') -% NUMBER-AND-UNIT: /^[0-9.,-]+(msec|s|min|hour|h|hr|day|week|wk|month|year|yr|kDa|kilodalton|base|kilobase|base-pair|kD|kd|kDa|bp|nt|kb|mm|mg|cm|nm|g|Hz|ms|kg|ml|mL|km|microm|\%)$/ -% Comment out above, it screws up handling of unit suffixes, for -% example: "Zangbert stock fell 30% to $2.50 yesterday." - -% fold-words. Matches NUMBER-fold, where NUMBER can be either numeric -% or a spelled-out number, and the hyphen is optional. Note that for -% spelled-out numbers, anything is allowed between the "initial" number -% and "fold" to catch e.g. "two-to-three fold" ("fourteen" etc. are absent -% as the prefix "four" is sufficient to match). -FOLD-WORDS: /^[0-9.,:-]*[0-9]([0-9.,:-]|\([0-9.,:-]*[0-9][0-9.,:-]*\)|\+\/-)*-?fold$/ -FOLD-WORDS: /^(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fifteen|twenty|thirty|fifty|hundred|thousand|million).*fold$/ - -% Plural proper nouns. -% Make sure that apostrophe-s is split out correctly. -PL-CAPITALIZED-WORDS: /^[[:upper:]].*[^iuoys'’]s$/ - -% Other proper nouns. % We demand that these end with an alphanumeric, i.e. explicitly % reject punctuation. We don't want this regex to "swallow" any trailing % commas, colons, or periods/question-marks at the end of sentences. @@ -126,138 +13,5 @@ % (... any affix, for that matter ...) and so no embedded apostrophe CAPITALIZED-WORDS: /^[[:upper:]][^'’]*[^[:punct:]]$/ -% SUFFIX GUESSING -% For all suffix-guessing patterns, we insist that the pattern start -% with an alphanumeric. This is needed to guarentee that the -% prefix-stripping code works correctly, as otherwise, the regex will -% gobble the prefix. So for example: "We left (carrying the dog) and -% Fred followed." Since "(carrying" is not in the dict, we need to be -% sure to not match the leading paren so that it will get tripped. -% -ING-WORDS: /^\w.+ing$/ - -% Plurals or verb-s. Make sure that apostrophe-s is split out correctly. -% e.g. 
"The subject's name is John Doe." should be -% +--Ds--+---YS--+--Ds-+ -% | | | | -% the subject.n 's.p name.n -S-WORDS: /^\w.+[^iuoys'’]s$/ - -% Verbs ending -ed. -ED-WORDS: /^\w.+ed$/ - -% Advebs ending -ly. -LY-WORDS: /^\w.+ly$/ - -% Nouns ending in -ism, -asm (chiliasm .. ) Usualy mass nouns -% Stubbed out for now; I'm not convinced this improves accuracy. -% ISM-WORDS: /^\w.+asm$/ -% ISM-WORDS: /^\w.+ism$/ - -% Corresponding count noun version of above (chiliast...) -% AST-WORDS: /^\w.+ast$/ -% AST-WORDS: /^\w.+ist$/ - -% Corresponding adjectival form of above -ADJ-WORDS: /^\w.+astic$/ -ADJ-WORDS: /^\w.+istic$/ - -% Nouns ending -ation stubbed out in BioLG, stub out here ... -%ATION-WORDS: /^\w.+ation$/ - -% Extension by LIPN 11/10/2005 -% nouns -- typically seen in (bio-)chemistry texts -% synthetase, kinase -% 5-(hydroxymethyl)-2’-deoxyuridine -% hydroxyethyl, hydroxymethyl -% septation, reguion -% isomaltotetraose, isomaltotriose -% glycosylphosphatidylinositol -% iodide, oligodeoxynucleotide -% chronicity, hypochromicity -MC-NOUN-WORDS: /^\w.+ase$/ -MC-NOUN-WORDS: /^\w.+ene$/ -MC-NOUN-WORDS: /^\w.+ine?$/ -MC-NOUN-WORDS: /^\w.+yl$/ -MC-NOUN-WORDS: /^\w.+ion$/ -MC-NOUN-WORDS: /^\w.+ose$/ -MC-NOUN-WORDS: /^\w.+ol$/ -MC-NOUN-WORDS: /^\w.+ide$/ -MC-NOUN-WORDS: /^\w.+ity$/ - -% Can take TOn+. 
Must appear after above, to avoid clash with +ity -NOUN-TO-WORDS: /^\w.+ty$/ -NOUN-TO-WORDS: /^\w.+cy$/ -NOUN-TO-WORDS: /^\w.+nce$/ - -% replicon, intron -C-NOUN-WORDS: /^\w.+o[rn]$/ - -% adjectives -% exogenous, heterologous -% intermolecular, intramolecular -% glycolytic, ribonucleic, uronic -% ribosomal, ribsosomal -% nonpermissive, thermosensitive -% inducible, metastable -ADJ-WORDS: /^\w.+ous$/ -ADJ-WORDS: /^\w.+ar$/ -ADJ-WORDS: /^\w.+ic$/ -ADJ-WORDS: /^\w.+al$/ -ADJ-WORDS: /^\w.+ive$/ -ADJ-WORDS: /^\w.+ble$/ - -% latin (postposed) adjectives -% influenzae, tarentolae -% pentosaceus, luteus, carnosus -LATIN-ADJ-WORDS: /^\w.+ae$/ -LATIN-ADJ-WORDS: /^\w.+us$/ % must appear after -ous in this file - -% latin (postposed) adjectives or latin plural noun -% brevis, israelensis -% japonicum, tabacum, xylinum -LATIN-ADJ-P-NOUN-WORDS: /^\w.+is?$/ -LATIN-ADJ-S-NOUN-WORDS: /^\w.+um$/ - - -% Hyphenated words. In the original LG morpho-guessing system that -% predated the regex-based system, hyphenated words were detected -% before ING-WORDS, S-WORDS etc., causing e.g. "cross-linked" to be -% treated as a HYPHENATED-WORD (a generic adjective/noun), and -% never a verb. To return to this ordering, move this regex just -% after the CAPITALIZED-WORDS regex. -% We also match on commas, dots, brackets: n-amino-3-azabicyclo[3.3.0]octane -% []] means "match right-bracket" -% Explicitly call out (5'|3') so that we don't all a generic match to 'll -% But something is funky about this 5'-3' business since 2' also matches ??? -% /^[[:alnum:]][][:alnum:],:.\[-]*-[][:alnum:],:.\[-]*[[:alnum:]]$/ -HYPHENATED-WORDS: - /^[[:alnum:](5'|3')][][:alnum:](5'|3'),:.\(\)\[-]*-[][:alnum:],:.\(\)\[-]*[[:alnum:]]$/ - -% Emoticon checks must come *after* the above, so that the above take precedence. -% -% Emoticons must be entirely made of punctuation, length 2 or longer ;) -% XXX [:punct:] is strangely broken, I have to add ;-< explicitly -% XXX: Don't use [:punct:]. Do NOT include period!! 
-% XXX: The problem with below is that 5. 7. 8. get recognized as emoticons, -% which then prevents splitting for list numbers. (e.g "step 5. Do this.") -% -% Arghh. Other valid number expressions are clobbered by the emoticons. -% For example: $5 $7 8% The quick fix is to remove the numbers. -% Other breakages: The below clobbers "Bob, who ..." becuase it -% matches Bob, as an emoticon. -% -% EMOTICON: /^[[:punct:];BDOpTX0578C☆ಠ●@◎~][[:punct:]?@[\\\]^_`{|}~;BDOpTX0578C☆ಠ●@◎~][!"#$%&'()*+,\-/:;<=>?@[\\\]^_`{|}~?@[\\^_`{|}~: /^[.,-]{4}[.,-]*$/ diff -Nru link-grammar-5.3.16/data/id/Makefile.in link-grammar-5.5.0/data/id/Makefile.in --- link-grammar-5.3.16/data/id/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/data/id/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = 
@abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/kz/4.0.dict link-grammar-5.5.0/data/kz/4.0.dict --- link-grammar-5.3.16/data/kz/4.0.dict 2017-01-27 22:04:07.000000000 +0000 +++ link-grammar-5.5.0/data/kz/4.0.dict 2018-04-27 00:49:29.000000000 +0000 @@ -133,4 +133,4 @@ LATIN-ADJ-P-NOUN-WORDS: XXX-; LATIN-ADJ-S-NOUN-WORDS: XXX-; HYPHENATED-WORDS: XXX-; -UNKNOWN-WORD: XXX-; +: XXX-; diff -Nru link-grammar-5.3.16/data/kz/Makefile.in link-grammar-5.5.0/data/kz/Makefile.in --- link-grammar-5.3.16/data/kz/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/data/kz/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/lt/4.0.dict link-grammar-5.5.0/data/lt/4.0.dict --- link-grammar-5.3.16/data/lt/4.0.dict 2017-01-27 
22:01:05.000000000 +0000 +++ link-grammar-5.5.0/data/lt/4.0.dict 2018-04-27 00:49:34.000000000 +0000 @@ -1044,4 +1044,4 @@ % With the following line in the dictionary, the parser will simply % skip over (null-link) unknown words. If you remove it, the parser % will output an error for any unknown words. -UNKNOWN-WORD: XXX+; +: XXX+; diff -Nru link-grammar-5.3.16/data/lt/4.0.regex link-grammar-5.5.0/data/lt/4.0.regex --- link-grammar-5.3.16/data/lt/4.0.regex 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/lt/4.0.regex 2018-04-27 00:51:13.000000000 +0000 @@ -15,5 +15,5 @@ % tokenizer explicitly mangles based on these punctution marks. % % Look for at least four in a row. -UNKNOWN-WORD: /^[.,-]{4}[.,-]*$/ +: /^[.,-]{4}[.,-]*$/ diff -Nru link-grammar-5.3.16/data/lt/Makefile.in link-grammar-5.5.0/data/lt/Makefile.in --- link-grammar-5.3.16/data/lt/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/data/lt/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ 
WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/Makefile.am link-grammar-5.5.0/data/Makefile.am --- link-grammar-5.3.16/data/Makefile.am 2017-02-12 11:48:14.000000000 +0000 +++ link-grammar-5.5.0/data/Makefile.am 2018-04-29 21:23:41.000000000 +0000 @@ -1,8 +1,11 @@ SUBDIRS=ady amy any ar de en fa he id kz lt ru tr vn demo-sql -# Include the README in the tarball, but do not install it. -EXTRA_DIST= README +dictdir=$(pkgdatadir) +dict_DATA=command-help-en.txt # The make uninstall target should remove directories we created. uninstall-hook: -rmdir $(pkgdatadir) + +# Include the README in the tarball, but do not install it. +EXTRA_DIST= README $(dict_DATA) diff -Nru link-grammar-5.3.16/data/Makefile.in link-grammar-5.5.0/data/Makefile.in --- link-grammar-5.3.16/data/Makefile.in 2017-04-15 21:49:49.000000000 +0000 +++ link-grammar-5.5.0/data/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -13,6 +13,7 @@ # PARTICULAR PURPOSE. 
@SET_MAKE@ + VPATH = @srcdir@ am__is_gnu_make = { \ if test -z '$(MAKELEVEL)'; then \ @@ -126,6 +127,35 @@ n|no|NO) false;; \ *) (install-info --version) >/dev/null 2>&1;; \ esac +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; +am__install_max = 40 +am__nobase_strip_setup = \ + srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` +am__nobase_strip = \ + for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" +am__nobase_list = $(am__nobase_strip_setup); \ + for p in $$list; do echo "$$p $$p"; done | \ + sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ + $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ + if (++n[$$2] == $(am__install_max)) \ + { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ + END { for (dir in files) print dir, files[dir] }' +am__base_list = \ + sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ + sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' +am__uninstall_files_from_dir = { \ + test -z "$$files" \ + || { test ! -d "$$dir" && test ! -f "$$dir" && test ! 
-r "$$dir"; } \ + || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ + $(am__cd) "$$dir" && rm -f $$files; }; \ + } +am__installdirs = "$(DESTDIR)$(dictdir)" +DATA = $(dict_DATA) RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ distclean-recursive maintainer-clean-recursive am__recursive_targets = \ @@ -187,6 +217,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -215,6 +246,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -223,13 +255,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -296,12 +331,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -363,9 +398,11 @@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ SUBDIRS = ady amy any ar de en fa he id kz lt ru tr vn demo-sql +dictdir = $(pkgdatadir) +dict_DATA = command-help-en.txt # Include the README in the tarball, but do not install it. 
-EXTRA_DIST = README +EXTRA_DIST = README $(dict_DATA) all: all-recursive .SUFFIXES: @@ -404,6 +441,27 @@ clean-libtool: -rm -rf .libs _libs +install-dictDATA: $(dict_DATA) + @$(NORMAL_INSTALL) + @list='$(dict_DATA)'; test -n "$(dictdir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(dictdir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(dictdir)" || exit 1; \ + fi; \ + for p in $$list; do \ + if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ + echo "$$d$$p"; \ + done | $(am__base_list) | \ + while read files; do \ + echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(dictdir)'"; \ + $(INSTALL_DATA) $$files "$(DESTDIR)$(dictdir)" || exit $$?; \ + done + +uninstall-dictDATA: + @$(NORMAL_UNINSTALL) + @list='$(dict_DATA)'; test -n "$(dictdir)" || list=; \ + files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ + dir='$(DESTDIR)$(dictdir)'; $(am__uninstall_files_from_dir) # This directory's subdirectories are mostly independent; you can cd # into them and run 'make' without going through this Makefile. 
@@ -561,9 +619,12 @@ done check-am: all-am check: check-recursive -all-am: Makefile +all-am: Makefile $(DATA) installdirs: installdirs-recursive installdirs-am: + for dir in "$(DESTDIR)$(dictdir)"; do \ + test -z "$$dir" || $(MKDIR_P) "$$dir"; \ + done install: install-recursive install-exec: install-exec-recursive install-data: install-data-recursive @@ -614,7 +675,7 @@ info-am: -install-data-am: +install-data-am: install-dictDATA install-dvi: install-dvi-recursive @@ -658,7 +719,7 @@ ps-am: -uninstall-am: +uninstall-am: uninstall-dictDATA @$(NORMAL_INSTALL) $(MAKE) $(AM_MAKEFLAGS) uninstall-hook .MAKE: $(am__recursive_targets) install-am install-strip uninstall-am @@ -667,14 +728,15 @@ check-am clean clean-generic clean-libtool cscopelist-am ctags \ ctags-am distclean distclean-generic distclean-libtool \ distclean-tags distdir dvi dvi-am html html-am info info-am \ - install install-am install-data install-data-am install-dvi \ - install-dvi-am install-exec install-exec-am install-html \ - install-html-am install-info install-info-am install-man \ - install-pdf install-pdf-am install-ps install-ps-am \ - install-strip installcheck installcheck-am installdirs \ - installdirs-am maintainer-clean maintainer-clean-generic \ - mostlyclean mostlyclean-generic mostlyclean-libtool pdf pdf-am \ - ps ps-am tags tags-am uninstall uninstall-am uninstall-hook + install install-am install-data install-data-am \ + install-dictDATA install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs installdirs-am maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-generic \ + mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \ + uninstall-am uninstall-dictDATA uninstall-hook .PRECIOUS: Makefile diff -Nru link-grammar-5.3.16/data/ru/4.0.dict 
link-grammar-5.5.0/data/ru/4.0.dict --- link-grammar-5.3.16/data/ru/4.0.dict 2017-01-27 22:04:07.000000000 +0000 +++ link-grammar-5.5.0/data/ru/4.0.dict 2018-04-27 00:49:39.000000000 +0000 @@ -1077,6 +1077,8 @@ %% Слова с большой буквы Вам.m2sd: <мест-sub> & (Jd- or [[MVad-]] or [[MVIad+]] or Nd- or <макро-сущ-d>); % МС,2л,ед,дт +LENGTH-LIMIT-1: LL*+; + % morphology linkages #include "/ru/morph.dict"; @@ -1092,4 +1094,4 @@ % With the following line in the dictionary, the parser will simply % skip over (null-link) unknown words. If you remove it, the parser % will output an error for any unknown words. -UNKNOWN-WORD: XXX+; +: XXX+; diff -Nru link-grammar-5.3.16/data/ru/4.0.regex link-grammar-5.5.0/data/ru/4.0.regex --- link-grammar-5.3.16/data/ru/4.0.regex 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/data/ru/4.0.regex 2018-04-27 00:51:21.000000000 +0000 @@ -54,7 +54,7 @@ % tokenizer explicitly mangles based on these punctution marks. % % Look for at least four in a row. -UNKNOWN-WORD: /^[.,-]{4}[.,-]*$/ +: /^[.,-]{4}[.,-]*$/ MORPH-END-оряя: /оряя$/ MORPH-END-еряя: /еряя$/ diff -Nru link-grammar-5.3.16/data/ru/Makefile.in link-grammar-5.5.0/data/ru/Makefile.in --- link-grammar-5.3.16/data/ru/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/data/ru/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -217,6 +217,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -245,6 +246,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -253,13 +255,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = 
@LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -326,12 +331,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/ru/words/Makefile.in link-grammar-5.5.0/data/ru/words/Makefile.in --- link-grammar-5.3.16/data/ru/words/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/data/ru/words/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ 
abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/tr/4.0.dict link-grammar-5.5.0/data/tr/4.0.dict --- link-grammar-5.3.16/data/tr/4.0.dict 2017-01-27 22:04:07.000000000 +0000 +++ link-grammar-5.5.0/data/tr/4.0.dict 2018-04-27 00:50:01.000000000 +0000 @@ -119,4 +119,4 @@ PL-GREEK-LETTER-AND-NUMBER: XXX-; CAPITALIZED-WORDS: XXX-; HYPHENATED-WORDS: XXX-; -UNKNOWN-WORD: XXX-; +: XXX-; diff -Nru link-grammar-5.3.16/data/tr/Makefile.in link-grammar-5.5.0/data/tr/Makefile.in --- link-grammar-5.3.16/data/tr/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/data/tr/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/data/vn/4.0.dict link-grammar-5.5.0/data/vn/4.0.dict --- 
link-grammar-5.3.16/data/vn/4.0.dict 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/vn/4.0.dict 2018-04-27 00:50:06.000000000 +0000 @@ -1,30 +1,35 @@ -% Vietnamese Dictionary +% Vietnamese Dictionary -% Dictionary version number is 5.3.3 (formatted as V5v3v3+) -: V5v3v3+; +% Dictionary version number is 5.4.5 (formatted as V5v4v5+) +: V5v4v5+; : VI4vn+; % See https://bitbucket.org/ngocminh/lienkate for the master -% project; this includes a version of the link-grammar perser +% project; this includes a version of the link-grammar parser % written in Java. % Guide to links: -% ĐT = động từ = verb +% VT = động từ = verb % DT = danh từ = noun -% TĐT=trước động từ=preceding verb +% TVT=trước động từ=preceeding verb + +% Due clashes with the conventions of the library, the following +% link characters are replaced: +% _ is replaced by U. +% Đ is replaced by V % Danh từ bàn ghế giường áo quần sách quyển_sách báo nhà cửa người hổ voi chó mèo cá cá_vàng cá_chép rắn thằn_lằn đất đá gỗ sắt thép núi sông biển trăng -sao trời cỏ cây hoa lá nước bố mẹ con anh chị +sao trời cỏ cây hoa lá nước bố mẹ con.n anh chị em ông bà cô chú bác giờ lít thước tấc mét héc_ta tỉnh huyện xã phường phòng ban xe_đạp xe_máy tôi tao tớ hắn ta chúng_ta chúng_tôi mình chúng_mình mày mi chúng_mày cậu nó chúng_nó họ sinh_viên học_sinh giáo_viên lò_xo âm_mưu quân_sự vụ quan_hệ ngoại_giao độc_lập vùng lực_lượng mật_vụ đơn_vị tăng -chuyện thông_tin áp_lực cuộc_chiến chữ ti_vi thủ_quân +chuyện.n thông_tin áp_lực cuộc_chiến chữ ti_vi thủ_quân con_đường vô_địch trận_đấu cơ_hội danh_hiệu đội mùa bóng cầu_thủ thành_tích chuyên_môn tuổi trận câu_lạc_bộ đội_tuyển quốc_gia mùa_giải thử_thách phản_ứng tối_hậu_thư @@ -42,11 +47,11 @@ giới khoa_học lũ con_cháu vật thí_nghiệm : % [n] (SDT1- & {SDT1+}) - or ({TDT3-} & {TDT2- or ST_DT-} & {TDT1-} & {SDT1+} + or ({TDT3-} & {TDT2- or STUDT-} & {TDT1-} & {SDT1+} & {SDT2+} & {SDT3+} & {SDT4+} & {SDT6+} - & {{{MA+} & SDT5+} & (SS_DT- or DT_LA+)} + & {{{MA+} 
& SDT5+} & (SSUDT- or DTULA+)} & {{{MA+} & SDT5+} - & (GT_DT- or ĐT_DT- or DT_ĐT+ or DT_TT+ or DT_THI+ or DT_LA+ or LA_DT-)}); + & (GTUDT- or VTUDT- or DTUVT+ or DTUTT+ or DTUTHI+ or DTULA+ or LAUDT-)}); % Danh từ tổng hợp đất_đá núi_non sông_biển trăng_sao : @@ -54,71 +59,71 @@ (SDT1- & {SDT1+}) or ({TDT1-} & {SDT1+} & {@SDT2+} & {SDT3+} & {SDT4+} & {{MA+} & SDT5+} & {SDT6+} - & {SS_DT- or DT_LA+} - & {GT_DT- or ĐT_DT- or DT_ĐT+ or DT_TT+ or DT_THI+}); + & {SSUDT- or DTULA+} + & {GTUDT- or VTUDT- or DTUVT+ or DTUTT+ or DTUTHI+}); % Tên riêng -NAME Tbilisi Nga Saakashvili Gruzia Arsenal MU Nou_Camp -Fabregas Seoul Bình_Nhưỡng Bảo_an Liên_Hợp_Quốc Kaka -Carlo_Ancelotti Chelsea David_Beckham Paolo_Maldini -Ancelotti Roman_Abramovich San_Siro Man City Man_City -Milan Y SRY Trochilidae châu_Mỹ protein virus -Hideyuki_Okano Okano : +NAME Tbilisi Nga Saakashvili Gruzia Arsenal MU NouUCamp +Fabregas Seoul BìnhUNhưỡng Bảo_an LiênUHợpUQuốc Kaka +CarloUAncelotti Chelsea DavidUBeckham PaoloUMaldini +Ancelotti RomanUAbramovich SanUSiro Man City ManUCity +Milan Y SRY Trochilidae châuUMỹ protein virus +HideyukiUOkano Okano : (SDT1- & {SDT1+}) or ({SDT1+} & {@SDT2+} & {SDT4+} - & {{{MA+} & SDT5+} & (SS_DT- or DT_LA+)} + & {{{MA+} & SDT5+} & (SSUDT- or DTULA+)} & {{{MA+} & SDT5+} - & (GT_DT- or ĐT_DT- or DT_ĐT+ or @DT_TT+ or DT_THI+)}); + & (GTUDT- or VTUDT- or DTUVT+ or @DTUTT+ or DTUTHI+)}); % Trước danh từ 3 cái chiếc tấm bộ con : TDT3+; % Trước danh từ 2 những các mỗi mọi : TDT2+; -% từng : TDT2+ or TĐT2_1+ or TTT2_1+ or TST2_1+; +% từng : TDT2+ or TVT2U1+ or TTT2U1+ or TST2U1+; % Trước danh từ 1 toàn_thể tất_thảy tất_cả toàn_bộ bất_kỳ : TDT1+; % Số từ một hai ba 22 200 40 3 vài phần_lớn 9 5 : - (({TST2_1- or TST2_2-} & {{TST4-} & TST3-} - & {TST4-}) & {TST-}) & {DT_ĐT-} & ST_DT+; + (({TST2U1- or TST2U2-} & {{TST4-} & TST3-} + & {TST4-}) & {TST-}) & {DTUVT-} & STUDT+; khoảng chừng : TST+; % Sau danh từ 3 -bằng về : (SDT3- or ĐT_GT- or TT_GT-) & GT_DT+; +bằng về : (SDT3- or 
VTUGT- or TTUGT-) & GTUDT+; % Sau danh từ 4 -của : SDT4- & GT_DT+; +của : SDT4- & GTUDT+; dưới trước sau ngoài với tại trên như cho đối_với trong : - GT_DT+ & (TT_GT- or ĐT_GT- or SDT4- or ({PH+} & CL+)); + GTUDT+ & (TTUGT- or VTUGT- or SDT4- or ({PH+} & CL+)); -từ bởi cùng khỏi : GT_DT+ & (TT_GT- or ĐT_GT- or ({PH+} & CL+)); +từ bởi cùng khỏi : GTUDT+ & (TTUGT- or VTUGT- or ({PH+} & CL+)); ra vào theo sang : - (GT_DT+ & (TT_GT- or ĐT_GT- or ({PH+} & CL+))) - or (((({TĐT1-} & {TĐT2_1-} & {{TĐT4-} & TĐT3-} & {TĐT4-}) - or {TT_ĐT-} or ({TT_ĐT-} & TĐT5-)) - & {DT_ĐT- or ĐT_ĐT- or THI_ĐT- or LT_ĐT-} - & {ĐT_XONG+} & {ĐT_DT+} & {ĐT_TT+} - & {ĐT_GT+} & {ĐT_LT+} & {THT- or THS+} - & ({EV+} & {SDT5- or CL-} & {CO-})) or ({DT_ĐT-} & BI-)); + (GTUDT+ & (TTUGT- or VTUGT- or ({PH+} & CL+))) + or (((({TVT1-} & {TVT2U1-} & {{TVT4-} & TVT3-} & {TVT4-}) + or {TTUVT-} or ({TTUVT-} & TVT5-)) + & {DTUVT- or VTUVT- or THIUVT- or LTUVT-} + & {VTUXONG+} & {VTUDT+} & {VTUTT+} + & {VTUGT+} & {VTULT+} & {THT- or THS+} + & ({EV+} & {SDT5- or CL-} & {CO-})) or ({DTUVT-} & BI-)); -để : LT_ĐT+ & (TT_GT- or ĐT_GT- or SDT4- or ({PH+} & CL+)); +để : LTUVT+ & (TTUGT- or VTUGT- or SDT4- or ({PH+} & CL+)); % Sau danh từ 6 -đó đấy kia ấy đây : GT_DT- or SDT6- or DT_LA+; -thế : GT_DT- or SDT6- or DT_LA+ or ĐT_DT-; -này : GT_DT- or SDT6-; +đó đấy kia ấy đây : GTUDT- or SDT6- or DTULA+; +thế : GTUDT- or SDT6- or DTULA+ or VTUDT-; +này : GTUDT- or SDT6-; % Từ "là" là : - (({TĐT1-} & {TĐT2_1-} & {TĐT3-} & {TĐT4-}) - or TĐT5- or TT_ĐT-) - & {DT_LA- or THI_ĐT-} & {LA_DT+ or ĐT_TT+}; + (({TVT1-} & {TVT2U1-} & {TVT3-} & {TVT4-}) + or TVT5- or TTUVT-) + & {DTULA- or THIUVT-} & {LAUDT+ or VTUTT+}; % Động từ đi đứng chạy ngồi bơi nhảy khóc cười rơi rụng @@ -126,12 +131,12 @@ nấu_ăn gắng đứng_dậy hành_xử giao_kèo ở_lại tiến_hóa sinh_sản vật_lộn tắt_thở rơi_xuống thụ_phấn phát_quang : % [v] - ((({TĐT1-} & {TĐT2_1-} & {{TĐT4-} & TĐT3-} & {TĐT4-}) - or TT_ĐT- or ({TT_ĐT-} & TĐT5-)) - & {DT_ĐT- or ĐT_ĐT- or THI_ĐT- 
or LT_ĐT-} - & {ĐT_XONG+} & {ĐT_TT+} & {ĐT_GT+} & {ĐT_LT+} + ((({TVT1-} & {TVT2U1-} & {{TVT4-} & TVT3-} & {TVT4-}) + or TTUVT- or ({TTUVT-} & TVT5-)) + & {DTUVT- or VTUVT- or THIUVT- or LTUVT-} + & {VTUXONG+} & {VTUTT+} & {VTUGT+} & {VTULT+} & {THT- or THS+} & {EV+} & {SDT5- or CL-} & {CO-}) - or ({DT_ĐT-} & BI-); + or ({DTUVT-} & BI-); tới học làm ăn đọc viết nghe xem gặp thấy đánh đuổi đợi phát_hiện đảo_chính tấn_công cắt công_nhận @@ -142,85 +147,85 @@ giúp sản_xuất phát_ra sử_dụng đưa sinh_hạ tìm_ra điều_trị đối_phó truyền tạo hy_vọng : % [v] - ((({TĐT1-} & {TĐT2_1-} & {TĐT3-} & {TĐT4-}) - or TT_ĐT- - or ({TT_ĐT-} & TĐT5-)) - & {DT_ĐT- or ĐT_ĐT- or THI_ĐT- or LT_ĐT- or BI-} - & {ĐT_XONG+} & {ĐT_DT+} & {ĐT_TT+} & {ĐT_GT+} - & {ĐT_LT+} & {THT- or THS+} - & ({EV+} & {SDT5- or CL-} & {CO-})) or ({DT_ĐT-} & BI-); + ((({TVT1-} & {TVT2U1-} & {TVT3-} & {TVT4-}) + or TTUVT- + or ({TTUVT-} & TVT5-)) + & {DTUVT- or VTUVT- or THIUVT- or LTUVT- or BI-} + & {VTUXONG+} & {VTUDT+} & {VTUTT+} & {VTUGT+} + & {VTULT+} & {THT- or THS+} + & ({EV+} & {SDT5- or CL-} & {CO-})) or ({DTUVT-} & BI-); nói biết hỏi.v xác_nhận bác_bỏ bảo cho_rằng khẳng_định cho_biết trả lời.v : % [v] - ((({TĐT1-} & {TĐT2_1-} & {{TĐT4-} & TĐT3-} & {TĐT4-}) - or TT_ĐT- - or ({TT_ĐT-} & TĐT5-)) & {DT_ĐT- or ĐT_ĐT- or THI_ĐT- or LT_ĐT-} - & {ĐT_XONG+} & {ĐT_TT+} & {ĐT_GT+} - & {ĐT_DT+ or R+ or CL+} & {ĐT_LT+} + ((({TVT1-} & {TVT2U1-} & {{TVT4-} & TVT3-} & {TVT4-}) + or TTUVT- + or ({TTUVT-} & TVT5-)) & {DTUVT- or VTUVT- or THIUVT- or LTUVT-} + & {VTUXONG+} & {VTUTT+} & {VTUGT+} + & {VTUDT+ or R+ or CL+} & {VTULT+} & {THT- or THS+} - & ({EV+} & {SDT5- or CL-} & {CO-})) or ({DT_ĐT-} & BI-); + & ({EV+} & {SDT5- or CL-} & {CO-})) or ({DTUVT-} & BI-); % Động từ yêu cầu, sai khiến ra_lệnh bắt bắt_buộc ép nài_ép hỏi.w đòi_hỏi cấm cho_phép yêu_cầu đề_nghị kêu_gọi quyết_định cân_nhắc cảm_thấy : % [v] - ((({TĐT1-} & {TĐT2_1-} & {{TĐT4-} & TĐT3-} & {TĐT4-}) - or TT_ĐT- - or ({TT_ĐT-} & TĐT5-)) & {DT_ĐT- or THI_ĐT- or 
LT_ĐT-} - & {ĐT_XONG+} & {CL+} & {ĐT_LT+ or ĐT_SS+} - & {THT- or THS+} & {EV+} & {SDT5- or CL-} & {CO-}) or ({DT_ĐT-} & BI-); + ((({TVT1-} & {TVT2U1-} & {{TVT4-} & TVT3-} & {TVT4-}) + or TTUVT- + or ({TTUVT-} & TVT5-)) & {DTUVT- or THIUVT- or LTUVT-} + & {VTUXONG+} & {CL+} & {VTULT+ or VTUSS+} + & {THT- or THS+} & {EV+} & {SDT5- or CL-} & {CO-}) or ({DTUVT-} & BI-); % Động từ tình thái phải dám nỡ nên.v có_thể không_thể định : % [v] - (({TĐT1-} & {TĐT2_1- or TĐT2_2-} & {{TĐT4-} & TĐT3-} & {TĐT4-}) - or TT_ĐT- - or ({TT_ĐT-} & TĐT5-)) & {DT_ĐT- or THI_ĐT- or LT_ĐT-} - & ĐT_ĐT+ & {ĐT_LT+ or ĐT_SS+} & {THT- or THS+} + (({TVT1-} & {TVT2U1- or TVT2U2-} & {{TVT4-} & TVT3-} & {TVT4-}) + or TTUVT- + or ({TTUVT-} & TVT5-)) & {DTUVT- or THIUVT- or LTUVT-} + & VTUVT+ & {VTULT+ or VTUSS+} & {THT- or THS+} & ({EV+} & {SDT5- or CL-} & {CO-}); muốn cần thích sợ mong mong_chờ tiếc hối_tiếc lo_lắng nhớ mơ.v ước quên kiên_nhẫn giận : % [v] - ((({TĐT1-} & {TĐT2_1- or TĐT2_2-} & {{TĐT4-} & TĐT3-} & {TĐT4-}) - or TT_ĐT- - or ({TT_ĐT-} & TĐT5-)) & {DT_ĐT- or THI_ĐT- or LT_ĐT-} - & {ĐT_DT+ or ĐT_ĐT+ or ĐT_TT+} & {SĐT+} & {THT- or THS+} - & ({EV+} & {SDT5- or CL-} & {CO-})) or ({DT_ĐT-} & BI-); + ((({TVT1-} & {TVT2U1- or TVT2U2-} & {{TVT4-} & TVT3-} & {TVT4-}) + or TTUVT- + or ({TTUVT-} & TVT5-)) & {DTUVT- or THIUVT- or LTUVT-} + & {VTUDT+ or VTUVT+ or VTUTT+} & {SVT+} & {THT- or THS+} + & ({EV+} & {SDT5- or CL-} & {CO-})) or ({DTUVT-} & BI-); % Động từ chỉ sự bắt đầu, tiếp diễn bắt_đầu tiếp_tục thôi dừng kết_thúc : % [v] - ((({TĐT1-} & {TĐT2_1-} & {{TĐT4-} & TĐT3-} & {TĐT4-}) - or TT_ĐT- - or ({TT_ĐT-} & TĐT5-)) & {DT_ĐT- or ĐT_ĐT- or THI_ĐT- or LT_ĐT-} - & {ĐT_XONG+} & {ĐT_DT+ or @ĐT_ĐT+} & {ĐT_TT+} - & {ĐT_GT+} & {ĐT_LT+ or ĐT_SS+} & {THT- or THS+} - & ({EV+} & {SDT5- or CL-} & {CO-})) or ({DT_ĐT-} & BI-); + ((({TVT1-} & {TVT2U1-} & {{TVT4-} & TVT3-} & {TVT4-}) + or TTUVT- + or ({TTUVT-} & TVT5-)) & {DTUVT- or VTUVT- or THIUVT- or LTUVT-} + & {VTUXONG+} & {VTUDT+ or @VTUVT+} & {VTUTT+} 
+ & {VTUGT+} & {VTULT+ or VTUSS+} & {THT- or THS+} + & ({EV+} & {SDT5- or CL-} & {CO-})) or ({DTUVT-} & BI-); % Trước động từ 1 và tính từ 1 -thường hay luôn : TĐT1+ or TTT1+ or TST1+; +thường hay luôn : TVT1+ or TTT1+ or TST1+; % Trước động từ 2.1 và tính từ 2.1 -không chẳng chưa chưa_từng : TĐT2_1+ or TTT2_1+ or TST2_1+; +không chẳng chưa chưa_từng : TVT2U1+ or TTT2U1+ or TST2U1+; % Trước động từ 2.2 và tính từ 2.2 -rất hơi khá chỉ : TĐT2_2+ or TTT2_2+ or TST2_2+; +rất hơi khá chỉ : TVT2U2+ or TTT2U2+ or TST2U2+; % Trước động từ 3 và tính từ 3 -đã đang sẽ vừa mới sắp sắp_sửa : TĐT3+ or TTT3+ or TST3+; +đã đang sẽ vừa mới sắp sắp_sửa : TVT3+ or TTT3+ or TST3+; % Trước động từ 4 và tính từ 4 -cũng vẫn còn cứ : TĐT4+ or TTT4+ or TST4+; +cũng vẫn còn cứ : TVT4+ or TTT4+ or TST4+; % Trước động từ 5 -đừng chớ : TĐT5+ or TTT5+; +đừng chớ : TVT5+ or TTT5+; % Từ "bị", "được" -bị : (({TĐT1-} & {TĐT2_1- or TĐT2_2-} & {TĐT3-} & {TĐT4-}) or TĐT5-) & {DT_ĐT- or SDT5- or THI_ĐT- or LT_ĐT- or ĐT_ĐT-} & BI+ & {ĐT_LT+} & ({THT-} or {THS+} or ({EV+} & {CL-} & {CO-})); +bị : (({TVT1-} & {TVT2U1- or TVT2U2-} & {TVT3-} & {TVT4-}) or TVT5-) & {DTUVT- or SDT5- or THIUVT- or LTUVT- or VTUVT-} & BI+ & {VTULT+} & ({THT-} or {THS+} or ({EV+} & {CL-} & {CO-})); -được : ((({TĐT1-} & {TĐT2_1- or TĐT2_2-} & {TĐT3-} & {TĐT4-}) or TĐT5-) & {DT_ĐT- or SDT5- or THI_ĐT- or LT_ĐT- or ĐT_ĐT-} & BI+ & {ĐT_LT+} & ({THT-} or {THS+} or ({EV+} & {CL-} & {CO-}))) or ĐT_XONG-; +được : ((({TVT1-} & {TVT2U1- or TVT2U2-} & {TVT3-} & {TVT4-}) or TVT5-) & {DTUVT- or SDT5- or THIUVT- or LTUVT- or VTUVT-} & BI+ & {VTULT+} & ({THT-} or {THS+} or ({EV+} & {CL-} & {CO-}))) or VTUXONG-; % Tính từ tốt đẹp xấu sạch bẩn đỏ xanh vàng đen trắng @@ -231,32 +236,32 @@ riêng muộn đắt chăm thấp tuyệt_vời rõ_ràng chính_thức hoàn_toàn có_thật chung đơn_tính quan_trọng có_ích hiện_đại hiệu_quả lùn : -(({TTT1-} & {TTT2_1- or TTT2_2-} & {{TTT4-} & TTT3-} & {TTT4-}) or TTT5- or TT_TT-) & {TT_TT+} & {DT_TT- or SDT2- or TT- or ĐT_TT- 
or THI_TT- or BI-} & {TT_ĐT+ or TT_GT+ or TT_LT+} & {STT+ or TT_SS+} & {THT- or THS+} & {EV+} & {CL-} & {CO-}; +(({TTT1-} & {TTT2U1- or TTT2U2-} & {{TTT4-} & TTT3-} & {TTT4-}) or TTT5- or TTUTT-) & {TTUTT+} & {DTUTT- or SDT2- or TT- or VTUTT- or THIUTT- or BI-} & {TTUVT+ or TTUGT+ or TTULT+} & {STT+ or TTUSS+} & {THT- or THS+} & {EV+} & {CL-} & {CO-}; nhiều ít tân : -(({TTT1-} & {TTT2_1- or TTT2_2-} & {{TTT4-} & TTT3-} & {TTT4-}) or TTT5- or TT_TT-) & (({TT_TT+} & {DT_TT- or SDT2- or ĐT_TT- or THI_TT- or BI-} & {TT_ĐT+ or TT_GT+} & {STT+ or TT_SS+} & {THT- or THS+} & ({EV+} & {CL-} & {CO-})) or ST_DT-); +(({TTT1-} & {TTT2U1- or TTT2U2-} & {{TTT4-} & TTT3-} & {TTT4-}) or TTT5- or TTUTT-) & (({TTUTT+} & {DTUTT- or SDT2- or VTUTT- or THIUTT- or BI-} & {TTUVT+ or TTUGT+} & {STT+ or TTUSS+} & {THT- or THS+} & ({EV+} & {CL-} & {CO-})) or STUDT-); % Sau tính từ -lắm quá : STT- or SĐT-; +lắm quá : STT- or SVT-; % Từ "thì" -thì : DT_THI- & (THI_TT+ or THI_ĐT+); +thì : DTUTHI- & (THIUTT+ or THIUVT+); % Các từ liên kết vị ng -rồi liền bèn lại đành : ĐT_LT- & LT_ĐT+; +rồi liền bèn lại đành : VTULT- & LTUVT+; -mà : (ĐT_LT- & LT_ĐT+) or MA-; +mà : (VTULT- & LTUVT+) or MA-; % Các từ so sánh -hơn kém : {TT_SS-} & SS_DT+; -nhất : TT_SS-; +hơn kém : {TTUSS-} & SSUDT+; +nhất : TTUSS-; % Từ "xong" -xong: ĐT_XONG-; +xong: VTUXONG-; % Từ "rằng" rằng : R- & CL+; -chuyện : (R- or GT_DT-) & (CL+ or DT_DT+); +chuyện : (R- or GTUDT-) & (CL+ or DTUDT+); % Các từ hỏi tại_sao vì_sao : THT+ & {R-}; @@ -264,11 +269,11 @@ khi_nào bao_giờ: (THT+ or THS-) & {R-}; ai : - {ĐT_DT- or SS_DT- or GT_DT- or DT_LA+ or DT_ĐT+ or DT_TT+ or DT_THI+} & {R-}; + {VTUDT- or SSUDT- or GTUDT- or DTULA+ or DTUVT+ or DTUTT+ or DTUTHI+} & {R-}; % Các từ nối trong câu ghép tuy_nhiên nhưng nên.w cho_nên nhờ_đó : CL+ & (EV- or QHT-); -bởi_vì vì : (GT_DT+ or CL+) & {PH+} & (EV- or (CO+ or QHT+)); +bởi_vì vì : (GTUDT+ or CL+) & {PH+} & (EV- or (CO+ or QHT+)); dù : CL+ & {PH+} & (EV- or (CO+ or QHT+)); sau_khi khi : CL+ & 
{PH+} & (EV- or CO+); @@ -279,9 +284,9 @@ và : (); % các từ đặc biệt -lẽ_ra có_lẽ : CL+ or (DT_GT- & (GT_ĐT+ or GT_TT+)); -một_cách : ĐT_TT- & TT+; -sau_đó ngày_nay vì_thế : CL+ & {ĐT_GT- or EV-}; +lẽ_ra có_lẽ : CL+ or (DTUGT- & (GTUVT+ or GTUTT+)); +một_cách : VTUTT- & TT+; +sau_đó ngày_nay vì_thế : CL+ & {VTUGT- or EV-}; % Regex not supported. HMS-TIME: XXX-; @@ -300,4 +305,4 @@ PART-NUMBER: XXX-; CAPITALIZED-WORDS: XXX-; HYPHENATED-WORDS: XXX-; -UNKNOWN-WORD: XXX-; +: XXX-; diff -Nru link-grammar-5.3.16/data/vn/4.0.regex link-grammar-5.5.0/data/vn/4.0.regex --- link-grammar-5.3.16/data/vn/4.0.regex 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/data/vn/4.0.regex 2018-04-27 00:51:32.000000000 +0000 @@ -129,4 +129,4 @@ % tokenizer explicitly mangles based on these punctuation marks. % % Look for at least four in a row. -UNKNOWN-WORD: /^[.,-]{4}[.,-]*$/ +: /^[.,-]{4}[.,-]*$/ diff -Nru link-grammar-5.3.16/data/vn/Makefile.in link-grammar-5.5.0/data/vn/Makefile.in --- link-grammar-5.3.16/data/vn/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/data/vn/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -157,6 +157,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -185,6 +186,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -193,13 +195,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ 
@@ -266,12 +271,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/debian/changelog link-grammar-5.5.0/debian/changelog --- link-grammar-5.3.16/debian/changelog 2017-06-26 05:37:55.000000000 +0000 +++ link-grammar-5.5.0/debian/changelog 2018-05-12 23:17:57.000000000 +0000 @@ -1,3 +1,40 @@ +link-grammar (5.5.0-1ubuntu1) cosmic; urgency=medium + + * d/python3-link-grammar.install: Expect the installation in + usr/local/lib/python3. + + -- Matthias Klose Sat, 12 May 2018 19:17:57 -0400 + +link-grammar (5.5.0-1) unstable; urgency=medium + + * QA upload. + * New upstream release. + * Update symbols file. + * Remove trailing whitespace from debian/changelog in order to + silence the file-contains-trailing-whitespace Lintian tag. + * Upgrade to debhelper compat version 11. + * Remove the empty file debian/patches/series. + * Use HTTPS URI in debian/watch. + * Upgrade to Standards-Version 4.1.4 (no changes). + * Add debian/python3-link-grammar.lintian-overrides to override the + python-package-depends-on-package-from-other-python-variant + Lintian tag. + * Update Vcs-Browser and Vcs-Git fields in debian/control. + * Update Homepage field in debian/control and Source field in + debian/copyright to use HTTPS. + * Remove incorrect Multi-Arch fields in debian/control. + * Update debian/copyright. + * Update debian/link-grammar.install. + + -- Fabian Wolff Mon, 30 Apr 2018 20:53:21 +0200 + +link-grammar (5.3.16-3) unstable; urgency=medium + + * QA upload. + * Build-depend on python3-distutils. + + -- Matthias Klose Sun, 29 Apr 2018 22:37:59 +0200 + link-grammar (5.3.16-2) unstable; urgency=medium * QA upload. 
@@ -89,7 +126,7 @@ * QA upload * New upstream release - Includes "reproducible builds" patch from Chris Lamb (Closes: #829011) - * Drop all patches since they've been applied in new release + * Drop all patches since they've been applied in new release * debian/tests/control: Add Restrictions: build-needed * Have autopkgtest run upstream python tests too * Add python-link-grammar.examples @@ -301,7 +338,7 @@ introduced in the last two versions. Next time, I won't remove quilt, even when I have no patches. * Bump standards-version to 3.7.2 - * Fix homepage pseudo-field to have 2 spaces at the beginning so p.d.o + * Fix homepage pseudo-field to have 2 spaces at the beginning so p.d.o can parse it correctly. -- Ken Bloom Tue, 4 Jul 2006 22:49:41 -0500 @@ -320,10 +357,10 @@ * Updated manpage to be more clear. Thanks to Maru Dubshinki (Closes: #364532) - * Merged patch from upstream CVS to handle unset $LANG. + * Merged patch from upstream CVS to handle unset $LANG. (Closes: #364738) * Don't use sentence to generate error message after deleting it. - (Closes: #366136). I have not done an audit to find other cases of + (Closes: #366136). I have not done an audit to find other cases of this. -- Ken Bloom Sun, 23 Apr 2006 22:44:03 -0500 @@ -335,7 +372,7 @@ - Patches for dictionary selection were merged upstream - Rename the commandline parser using debian/rules rather than reautotooling. - * Install -T is broken despite being documented in the man page. Work + * Install -T is broken despite being documented in the man page. 
Work around that in debian/rules (Closes: #359280) -- Ken Bloom Tue, 18 Apr 2006 11:27:15 -0500 @@ -354,22 +391,21 @@ * Initial release Closes: #337277 * Patches: - dictionary-path-references - Since the dictionaries live in data/en, they don't need to refer to - en/words as this would really mean data/en/en/words which doesn't + Since the dictionaries live in data/en, they don't need to refer to + en/words as this would really mean data/en/en/words which doesn't exist. - binary-name - Named this link-grammar instead of grammar-parse. Partly personal - preference, and partly because if other kinds of grammar get added + Named this link-grammar instead of grammar-parse. Partly personal + preference, and partly because if other kinds of grammar get added to Debian, this one shouldn't have the generic name. - default-english-dictionary - Make link-grammar use the english dictionary in data/en as the - default. (Upstream stored the dictionary in data/en, but told the + Make link-grammar use the english dictionary in data/en as the + default. (Upstream stored the dictionary in data/en, but told the program to refer to the nonexistent one in data/) - acinclude_binreloc Needed to reautotoolize. 
- reautotoolize - Reautotoolize to deal with the stuff done in binary-name and + Reautotoolize to deal with the stuff done in binary-name and default-english-dictionary -- Ken Bloom Thu, 3 Nov 2005 16:53:09 -0600 - diff -Nru link-grammar-5.3.16/debian/compat link-grammar-5.5.0/debian/compat --- link-grammar-5.3.16/debian/compat 2017-06-19 16:42:30.000000000 +0000 +++ link-grammar-5.5.0/debian/compat 2018-04-30 18:53:21.000000000 +0000 @@ -1 +1 @@ -10 +11 diff -Nru link-grammar-5.3.16/debian/control link-grammar-5.5.0/debian/control --- link-grammar-5.3.16/debian/control 2017-06-19 16:42:30.000000000 +0000 +++ link-grammar-5.5.0/debian/control 2018-04-30 18:53:21.000000000 +0000 @@ -5,7 +5,7 @@ Build-Depends: ant [!hppa !hurd-i386 !m68k !sh4], autoconf-archive, - debhelper (>= 10), + debhelper (>= 11), default-jdk [!hppa !hurd-i386 !m68k !sh4], dh-python, libedit-dev, @@ -14,16 +14,16 @@ pkg-config, python-dev, python3-dev, + python3-distutils, swig, zlib1g-dev, -Standards-Version: 3.9.8 -Homepage: http://www.abisource.com/projects/link-grammar/ -Vcs-Browser: https://anonscm.debian.org/git/collab-maint/link-grammar.git -Vcs-Git: https://anonscm.debian.org/git/collab-maint/link-grammar.git +Standards-Version: 4.1.4 +Homepage: https://www.abisource.com/projects/link-grammar/ +Vcs-Browser: https://salsa.debian.org/debian/link-grammar +Vcs-Git: https://salsa.debian.org/debian/link-grammar.git/ Package: link-grammar Architecture: any -Multi-Arch: foreign Depends: link-grammar-dictionaries-en, ${misc:Depends}, @@ -102,7 +102,6 @@ Package: liblink-grammar-java Section: java Architecture: alpha amd64 arm64 armel armhf i386 ia64 kfreebsd-amd64 kfreebsd-i386 mips mips64 mips64el mipsel powerpc powerpcspe ppc64 ppc64el s390x sparc sparc64 x32 -Multi-Arch: same Depends: liblink-grammar5, ${misc:Depends}, diff -Nru link-grammar-5.3.16/debian/copyright link-grammar-5.5.0/debian/copyright --- link-grammar-5.3.16/debian/copyright 2017-06-19 16:42:30.000000000 +0000 +++ 
link-grammar-5.5.0/debian/copyright 2018-04-30 18:53:21.000000000 +0000 @@ -1,6 +1,6 @@ Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: link-grammar -Source: http://www.abisource.com/downloads/link-grammar/ +Source: https://www.abisource.com/downloads/link-grammar/ Files: * Copyright: 2003-2004 Daniel Sleator @@ -12,8 +12,8 @@ 2007 Mike Ross 2008 Filip Maric 2008-2010 Borislav Iordanov - 2008-2016 Linas Vepstas - 2014-2016 Amir Plivatsky + 2008-2018 Linas Vepstas + 2014-2018 Amir Plivatsky License: LGPL-2.1 Files: bindings/java/org/linkgrammar/JSONReader.java diff -Nru link-grammar-5.3.16/debian/liblink-grammar5.symbols link-grammar-5.5.0/debian/liblink-grammar5.symbols --- link-grammar-5.3.16/debian/liblink-grammar5.symbols 2017-06-19 16:42:30.000000000 +0000 +++ link-grammar-5.5.0/debian/liblink-grammar5.symbols 2018-04-30 18:53:21.000000000 +0000 @@ -6,7 +6,6 @@ dict_display_word_expr@Base 5.3.7 dict_display_word_info@Base 5.3.7 dictionary_create_default_lang@Base 4.2.2 - dictionary_create_from_utf8@Base 5.3.7 dictionary_create_lang@Base 4.2.2 dictionary_delete@Base 4.2.2 dictionary_get_data_dir@Base 4.6.5 @@ -16,13 +15,14 @@ eliminate_duplicate_disjuncts@Base 4.6.5 free_disjuncts@Base 4.6.5 free_lookup_list@Base 5.3.7 - left_print_string@Base 5.3.7 lg_compute_disjunct_strings@Base 4.6.5 lg_error_clearall@Base 5.3.14 + lg_error_flush@Base 5.4.4 lg_error_formatmsg@Base 5.3.14 lg_error_printall@Base 5.3.14 lg_error_set_handler@Base 5.3.14 lg_error_set_handler_data@Base 5.3.14 + lg_exp_get_string@Base 5.5.0 lg_expand_disjunct_list@Base 4.6.5 linkage_corpus_cost@Base 4.6.5 linkage_create@Base 4.2.2 @@ -50,9 +50,12 @@ linkage_get_num_words@Base 4.2.2 linkage_get_violation_name@Base 4.2.2 linkage_get_word@Base 4.2.2 + linkage_get_word_byte_end@Base 5.4.4 + linkage_get_word_byte_start@Base 5.4.4 + linkage_get_word_char_end@Base 5.4.4 + linkage_get_word_char_start@Base 5.4.4 linkage_get_words@Base 4.2.2 linkage_link_cost@Base 
4.2.2 - linkage_post_process@Base 4.2.2 linkage_print_constituent_tree@Base 4.2.2 linkage_print_diagram@Base 4.2.2 linkage_print_disjuncts@Base 4.6.5 @@ -61,9 +64,11 @@ linkage_print_pp_msgs@Base 5.3.7 linkage_print_senses@Base 4.6.5 linkage_unused_word_cost@Base 4.2.2 + linkgrammar_get_configuration@Base 5.5.0 linkgrammar_get_dict_locale@Base 5.3.8 linkgrammar_get_dict_version@Base 4.6.7 linkgrammar_get_version@Base 4.6.5 + linkgrammar_open_data_file@Base 5.5.0 object_open@Base 4.6.5 parse_options_create@Base 4.2.2 parse_options_delete@Base 4.2.2 @@ -112,8 +117,6 @@ parse_options_set_use_viterbi@Base 5.3.7 parse_options_set_verbosity@Base 4.2.2 parse_options_timer_expired@Base 4.2.2 - post_process_close@Base 4.2.2 - post_process_open@Base 4.2.2 print_dictionary_data@Base 5.3.7 print_one_disjunct@Base 4.6.5 prt_error@Base 4.3.9 @@ -121,6 +124,7 @@ sentence_create@Base 4.2.2 sentence_delete@Base 4.2.2 sentence_disjunct_cost@Base 4.2.2 + sentence_display_wordgraph@Base 5.5.0 sentence_length@Base 4.2.2 sentence_link_cost@Base 4.3.5 sentence_null_count@Base 4.2.2 @@ -130,3 +134,4 @@ sentence_num_violations@Base 4.2.2 sentence_parse@Base 4.2.2 sentence_split@Base 4.6.5 + utf8_strwidth@Base 5.4.4 diff -Nru link-grammar-5.3.16/debian/link-grammar.install link-grammar-5.5.0/debian/link-grammar.install --- link-grammar-5.3.16/debian/link-grammar.install 2017-06-19 16:42:30.000000000 +0000 +++ link-grammar-5.5.0/debian/link-grammar.install 2018-04-30 18:53:21.000000000 +0000 @@ -1 +1,3 @@ usr/bin +usr/share/man/man1 +usr/share/link-grammar/command-help-en.txt diff -Nru link-grammar-5.3.16/debian/manpages link-grammar-5.5.0/debian/manpages --- link-grammar-5.3.16/debian/manpages 2017-06-19 16:42:30.000000000 +0000 +++ link-grammar-5.5.0/debian/manpages 2018-04-30 18:53:21.000000000 +0000 @@ -1 +1 @@ -man/link-parser.man1 +usr/share/man/man1/link-parser.1 diff -Nru link-grammar-5.3.16/debian/python3-link-grammar.install link-grammar-5.5.0/debian/python3-link-grammar.install 
--- link-grammar-5.3.16/debian/python3-link-grammar.install 2017-06-19 16:42:30.000000000 +0000 +++ link-grammar-5.5.0/debian/python3-link-grammar.install 2018-05-12 23:15:29.000000000 +0000 @@ -1 +1 @@ -usr/lib/python3* +usr/local/lib/python3* diff -Nru link-grammar-5.3.16/debian/python3-link-grammar.lintian-overrides link-grammar-5.5.0/debian/python3-link-grammar.lintian-overrides --- link-grammar-5.3.16/debian/python3-link-grammar.lintian-overrides 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/debian/python3-link-grammar.lintian-overrides 2018-04-30 18:53:21.000000000 +0000 @@ -0,0 +1,5 @@ +# python3-link-grammar recommends python-link-grammar-examples. The +# examples are the same for Python 2 and 3, which is why creating a +# separate python3-link-grammar-examples package would be redundant. +# The following Lintian tag is therefore inappropriate in this case: +python3-link-grammar: python-package-depends-on-package-from-other-python-variant Recommends: python-link-grammar-examples diff -Nru link-grammar-5.3.16/debian/rules link-grammar-5.5.0/debian/rules --- link-grammar-5.3.16/debian/rules 2017-06-19 16:42:30.000000000 +0000 +++ link-grammar-5.5.0/debian/rules 2018-04-30 18:53:21.000000000 +0000 @@ -26,7 +26,10 @@ debian/liblink-grammar-java/usr/lib/$(DEB_HOST_MULTIARCH)/jni endif find debian/ -name "*.la" -delete - dh_install --fail-missing + dh_install + +override_dh_missing: + dh_missing --fail-missing override_dh_compress: dh_compress --exclude=.py diff -Nru link-grammar-5.3.16/debian/tests/control link-grammar-5.5.0/debian/tests/control --- link-grammar-5.3.16/debian/tests/control 2017-06-26 05:37:55.000000000 +0000 +++ link-grammar-5.5.0/debian/tests/control 2018-04-30 18:53:21.000000000 +0000 @@ -1,4 +1,4 @@ Tests: unit-tests -Depends: @, build-essential, hunspell-en-us, locales-all, +Depends: @, python3-distutils, build-essential, hunspell-en-us, locales-all, default-jdk [!hppa !hurd-i386 !m68k !sh4], Restrictions: build-needed diff -Nru 
link-grammar-5.3.16/debian/watch link-grammar-5.5.0/debian/watch --- link-grammar-5.3.16/debian/watch 2017-06-19 16:42:30.000000000 +0000 +++ link-grammar-5.5.0/debian/watch 2018-04-30 18:53:21.000000000 +0000 @@ -1,4 +1,4 @@ version=4 opts=pgpsigurlmangle=s/$/.asc/ \ -http://www.abisource.com/downloads/@PACKAGE@/([0-9.]+)/ \ - @PACKAGE@@ANY_VERSION@@ARCHIVE_EXT@ +https://www.abisource.com/downloads/@PACKAGE@/([0-9.]+)/ \ + @PACKAGE@@ANY_VERSION@@ARCHIVE_EXT@ diff -Nru link-grammar-5.3.16/link-grammar/analyze-linkage.c link-grammar-5.5.0/link-grammar/analyze-linkage.c --- link-grammar-5.3.16/link-grammar/analyze-linkage.c 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/analyze-linkage.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,81 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2012, 2014 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - - -#include -#include "analyze-linkage.h" -#include "string-set.h" -#include "structures.h" - -/** - * This returns a string that is the the GCD of the two given strings. - * If the GCD is equal to one of them, a pointer to it is returned. - * Otherwise a new string for the GCD is put in the string set. 
- */ -static const char * intersect_strings(String_set *sset, const char * s, const char * t) -{ - int i, j, d; - const char *w, *s0; - char u0[MAX_TOKEN_LENGTH]; /* Links are *always* less than 10 chars long */ - char *u; - - /* The head indicator is lower-case h, the dependent indicator is - * lower-case d. If they are present, skip them. The intersection - * cannot include them. */ - if (islower((int) *s)) s++; - if (islower((int) *t)) t++; - - if (strcmp(s,t) == 0) return s; /* would work without this */ - i = strlen(s); - j = strlen(t); - if (j > i) { - w = s; s = t; t = w; - } - /* s is now the longer (at least not the shorter) string */ - u = u0; - d = 0; - s0 = s; - while (*t != '\0') { - if ((*s == *t) || (*t == '*')) { - *u = *s; - } else { - assert(*s == '*', "Invalid intersection!"); - d++; - *u = *t; - } - s++; t++; u++; - } - if (d == 0) { - return s0; - } else { - strcpy(u, s); /* get the remainder of s */ - return string_set_add(u0, sset); - } -} - -/** - * The name of the link is set to be the GCD of the names of - * its two endpoints. Must be called after each extract_links(), - * etc. since that call issues a brand-new set of links into - * parse_info. 
- */ -void compute_link_names(Linkage lkg, String_set *sset) -{ - size_t i; - for (i = 0; i < lkg->num_links; i++) - { - lkg->link_array[i].link_name = intersect_strings(sset, - connector_get_string(lkg->link_array[i].lc), - connector_get_string(lkg->link_array[i].rc)); - } -} diff -Nru link-grammar-5.3.16/link-grammar/analyze-linkage.h link-grammar-5.5.0/link-grammar/analyze-linkage.h --- link-grammar-5.3.16/link-grammar/analyze-linkage.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/analyze-linkage.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,20 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _ANALYZE_LINKAGE_H -#define _ANALYZE_LINKAGE_H - -#include "api-types.h" -#include "link-includes.h" - -void compute_link_names(Linkage, String_set *); -#endif /* _ANALYZE_LINKAGE_H */ diff -Nru link-grammar-5.3.16/link-grammar/anysplit.c link-grammar-5.5.0/link-grammar/anysplit.c --- link-grammar-5.3.16/link-grammar/anysplit.c 2017-02-03 00:05:48.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/anysplit.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,632 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2014 Amir Plivatsky */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. 
*/ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -/** - * anysplit.c -- code that splits words into random morphemes. - * This is used for the language-learning/morpheme-learning project. - */ - -/* General assumptions: - * - false is binary 0 (for memset()) - * - int is >= 32 bit (for random number) - */ - -#include "utilities.h" /* included first, for MSVC rand_s() */ - -#include -#include -#include -#include -#include -#include -#include - -#include "dict-common.h" -#include "error.h" -#include "regex-morph.h" -#include "structures.h" -#include "tokenize.h" - -#include "anysplit.h" - - -#define MAX_WORD_TO_SPLIT 31 /* in codepoins */ - -extern const char * const afdict_classname[]; - -typedef int p_start; /* partition start in a word */ -typedef p_start *p_list; /* list of partitions in a word */ - -typedef struct split_cache /* split cached by word length */ -{ - size_t nsplits; /* number of splits */ - p_list sp; /* list of splits */ - bool *p_tried; /* list of tried splits */ - bool *p_selected; /* list of selected splits */ -} split_cache; - -typedef struct anysplit_params -{ - int nparts; /* maximum number of suffixes to split to */ - size_t altsmin; /* minimum number of alternatives to generate */ - size_t altsmax; /* maximum number of alternatives to generate */ - Regex_node *regpre, *regmid, *regsuf; /* issue matching combinations */ - split_cache scl[MAX_WORD_TO_SPLIT+1]; /* split cache according to word length */ -} anysplit_params; - -#define DEBUG_ANYSPLIT 0 - - -#if DEBUG_ANYSPLIT -static const char *gw; -/* print the current partitions */ -static void printsplit(int *ps, int n) -{ - static int sn = 0; /* split number */ - int pos = 0; - int p; - int l = strlen(gw); - - printf("split %d: ", sn++); - for (pos = 0, p = 0; pos < l && p <= n; pos++) - { - if (pos == 
ps[p]) - { - p++; - putchar(' '); - } - putchar(gw[pos]); - } - putchar('\n'); -} -static void printps(int *ps, int n) -{ - int i; - - printf("printps:"); - for (i = 0; i<=n; i++) printf(" ps[%d]=%d", i, ps[i]); - printf("\n"); -} -#endif - -static void cache_partitions(p_list pl, int *ps, int p) -{ - memcpy(pl, ps, sizeof(p_start) * p); -} - - /* p = 5 */ - /* 0 1 2 3 */ - /* | | | | */ - /* 123456789 */ - /* l = 9 */ - /* */ - /* n = 4 */ - /* ps[0] = 2 */ - /* ps[1] = 5 */ - /* ps[2] = 7 */ - /* ps[3] = 9 */ - -/** - * `scl`: If NULL, return the index of the last split, else cache the - * splits into scl. - */ -static int split_and_cache(int word_length, int nparts, split_cache *scl) -{ - - int n; - int maxindex; - p_list ps = alloca(sizeof(p_start)*nparts); /* partition start */ - - if (0 == word_length) return 0; - - /* The first partitioning is the whole word. - * (Using a first dummy partition would make this code unneeded.) - * But in any case the whole word here is unneeded, and I'm - * too lazy to change that. 
- */ - ps[0] = word_length; - maxindex = 0; - if (scl) cache_partitions(&scl->sp[0], ps, nparts); - - /* Generate all possible partitions up to nparts partitions */ - for (n = 1; n < nparts; n++) - { - /* increase the number of partitions */ - int m = 0; - int t; - - ps[0] = 1; - ps[n] = word_length; /* set last partition end (dummy partition start) */ - - //printf("New number of partitions: n=%d\n", n); - do - { - /* set next initial partitions lengths to 1 */ - //printf("Initialize: m=%d\n", m); - for (t = m; t < n; t++) - { - ps[t] = ps[m] + (t-m); - //printf("ps[%d]=%d ", t, ps[t]); - } - //printf("\n"); - - /* move last partition */ - //printf("Moving m=%d ps[m]=%d ps[m+1]=%d\n", n-1, ps[n-1], ps[n]); - for (m = n-1; ps[m] < ps[m+1]; ps[m]++) - { - maxindex++; - if (scl) cache_partitions(&scl->sp[maxindex*nparts], ps, nparts); - -#if DEBUG_ANYSPLIT - printsplit(ps, n); - printps(ps, n); -#endif - } - - /* last partition got to size 1, backtrack */ - do - { - //printf("Backtrack m %d->%d\n", m, m-1); - m--; - /* continue as long as there is a place to move for partition m */ - } while (m >= 0 && ps[m] + 1 == ps[m+1]); - if (m >= 0) ps[m]++; - } while (m >= 0); /* we have still positions to move */ - //printf("End (n=%d)\n", n); - } - - return maxindex+1; -} - -void free_anysplit(Dictionary afdict) -{ - size_t i; - anysplit_params *as = afdict->anysplit; - - if (NULL == as) return; - - for (i = 0; i < ARRAY_SIZE(as->scl); i++) - { - if (NULL == as->scl[i].sp) continue; - free(as->scl[i].sp); - free(as->scl[i].p_selected); - free(as->scl[i].p_tried); - } - free_regexs(as->regpre); - free_regexs(as->regmid); - free_regexs(as->regsuf); - free(as); - afdict->anysplit = NULL; -} - -/* - * Returns: Number of splits. 
- */ -static int split(int word_length, int nparts, split_cache *scl) -{ - size_t nsplits; - - if (NULL == scl->sp) - { - nsplits = split_and_cache(word_length, nparts, NULL); - //printf("nsplits %zu\n", nsplits); - if (0 == nsplits) - { - prt_error("Error: nsplits=0 (word_length=%d, nparts=%d)\n", - word_length, nparts); - return 0; - } - scl->sp = malloc(sizeof(p_start)*nparts * nsplits); - scl->p_selected = malloc(sizeof(*(scl->p_selected)) * nsplits); - scl->p_tried = malloc(sizeof(*(scl->p_tried)) * nsplits); - split_and_cache(word_length, nparts, scl); - scl->nsplits = nsplits; - } - - memset(scl->p_selected, false, sizeof(*(scl->p_selected)) * scl->nsplits); - memset(scl->p_tried, false, sizeof(*(scl->p_tried)) * scl->nsplits); - return scl->nsplits; -} - -/** - * Return a number between 0 and nsplits-1, including. - * No need for a good randomness; mediocre randomness is enough. - * We suppose int is 32 bit. - */ -static int rng_uniform(unsigned int *seedp, size_t nsplits) -{ - int res; - - res = rand_r(seedp); - - /* I don't mind the slight skew */ - return res % nsplits; - -} - -/* lutf is the length of the string, measured in code-points, - * blen is the length of the string, measured in bytes. - */ -#define D_MM 7 -static bool morpheme_match(Sentence sent, - const char *word, size_t lutf, p_list pl) -{ - Dictionary afdict = sent->dict->affix_table; - anysplit_params *as = afdict->anysplit; - size_t bos = 0, cpos = 0; /* byte offset, code-point offset */ - int p; - Regex_node *re; - size_t blen = strlen(word); - char *prefix_string = alloca(blen+1); - - lgdebug(+D_MM, "word=%s: ", word); - for (p = 0; p < as->nparts; p++) - { - size_t b = utf8_strncpy(prefix_string, &word[bos], pl[p]-cpos); - prefix_string[b] = '\0'; - bos += b; - - /* For flexibility, REGRPE is matched only to the prefix part, - * REGMID only to the middle suffixes, and REGSUF only to the - * suffix part - which cannot be the prefix. 
*/ - if (0 == p) re = as->regpre; - else if (pl[p] == (int) lutf) re = as->regsuf; - else re = as->regmid; - lgdebug(D_MM, "re=%s part%d=%s: ", re->name, p, prefix_string); - - /* A NULL regex always matches */ - if ((NULL != re) && (NULL == match_regex(re, prefix_string))) - { - lgdebug(D_MM, "No match\n"); - return false; - } - - cpos = pl[p]; - if (cpos == lutf) break; - } - - lgdebug(D_MM, "Match\n"); - return true; -} -#undef D_MM - -static Regex_node * regbuild(const char **regstring, int n, int classnum) -{ - Regex_node *regex_root = NULL; - Regex_node **tail = ®ex_root; /* Last Regex_node in list */ - Regex_node *new_re; - int i; - - char *s; - char *sm; - - for (i = 0; i < n; i++) - { - /* read_entry() (read-dict.c) invokes patch_subscript() also for the affix - * file. As a result, if a regex contains a dot it is patched by - * SUBSCRIPT_MARK. We undo it here. */ - s = strdup(regstring[i]); - sm = strrchr(s, SUBSCRIPT_MARK); - if (sm) *sm = SUBSCRIPT_DOT; - - /* Create a new Regex_node and add to the list. */ - new_re = malloc(sizeof(*new_re)); - new_re->name = strdup(afdict_classname[classnum]); - new_re->pattern = s; - new_re->re = NULL; - new_re->next = NULL; - new_re->neg = false; /* TODO (if needed): Negative regex'es. */ - *tail = new_re; - tail = &new_re->next; - } - return regex_root; -} - - -/** - * Affix classes: - * REGPARTS Max number of word partitions. Value 0 disables anysplit. - * REGPRE Regex for prefix - * REGMID Regex for middle suffixes - * REGSUF Regex for suffix - * REGALTS Number of alternatives to issue for a word. - * Two values: minimum and maximum. - * If the word has more possibilities to split than the minimum, - * but less then the maximum, then issue them unconditionally. - */ - -/** - * Initialize the anysplit parameter and cache structure. - * Return true if initialization succeeded, or if dictionary does not use - * anysplit (its not an error to not use anysplit!). Return false if - * init failed. 
- */ -#define D_AI 10 -bool anysplit_init(Dictionary afdict) -{ - anysplit_params *as; - size_t i; - - Afdict_class *regpre = AFCLASS(afdict, AFDICT_REGPRE); - Afdict_class *regmid = AFCLASS(afdict, AFDICT_REGMID); - Afdict_class *regsuf = AFCLASS(afdict, AFDICT_REGSUF); - - Afdict_class *regalts = AFCLASS(afdict, AFDICT_REGALTS); - Afdict_class *regparts = AFCLASS(afdict, AFDICT_REGPARTS); - - if (0 == regparts->length) - { - if (verbosity_level(+D_AI)) - prt_error("Warning: File %s: Anysplit disabled (%s not defined)\n", - afdict->name, afdict_classname[AFDICT_REGPARTS]); - return true; - } - if (1 != regparts->length) - { - prt_error("Error: File %s: Must have %s defined with one value\n", - afdict->name, afdict_classname[AFDICT_REGPARTS]); - return false; - } - - as = malloc(sizeof(anysplit_params)); - for (i = 0; i < ARRAY_SIZE(as->scl); i++) as->scl[i].sp = NULL; - afdict->anysplit = as; - - as->regpre = regbuild(regpre->string, regpre->length, AFDICT_REGPRE); - as->regmid = regbuild(regmid->string, regmid->length, AFDICT_REGMID); - as->regsuf = regbuild(regsuf->string, regsuf->length, AFDICT_REGSUF); - - if (compile_regexs(as->regpre, NULL) != 0) return false; - if (compile_regexs(as->regmid, NULL) != 0) return false; - if (compile_regexs(as->regsuf, NULL) != 0) return false; - - as->nparts = atoi(regparts->string[0]); - if (as->nparts < 0) - { - free_anysplit(afdict); - prt_error("Error: File %s: Value of %s must be a non-negative number\n", - afdict->name, afdict_classname[AFDICT_REGPARTS]); - return false; - } - if (0 == as->nparts) - { - free_anysplit(afdict); - prt_error("Warning: File %s: Anysplit disabled (0: %s)\n", - afdict->name, afdict_classname[AFDICT_REGPARTS]); - return true; - } - - if (2 != regalts->length) - { - free_anysplit(afdict); - prt_error("Error: File %s: Must have %s defined with 2 values\n", - afdict->name, afdict_classname[AFDICT_REGALTS]); - return false; - } - as->altsmin = atoi(regalts->string[0]); - as->altsmax = 
atoi(regalts->string[1]); - if ((atoi(regalts->string[0]) <= 0) || (atoi(regalts->string[1]) <= 0)) - { - free_anysplit(afdict); - prt_error("Error: File %s: Value of %s must be 2 positive numbers\n", - afdict->name, afdict_classname[AFDICT_REGALTS]); - return false; - } - - return true; -} -#undef D_AI - -/** - * Split randomly. - * Return true on success. - * Return false when: - * - disabled (i.e. when doing regular language processing). - * - an error occurs (the behavior then is undefined). - * Such an error has not been observed yet. - */ -#define D_AS 5 -bool anysplit(Sentence sent, Gword *unsplit_word) -{ - const char * word = unsplit_word->subword; - Dictionary afdict = sent->dict->affix_table; - anysplit_params *as; - Afdict_class * stemsubscr; - - size_t l = strlen(word); - size_t lutf = utf8_strlen(word); - p_list pl; - size_t bos, cpos; /* byte offset, codepoint offset */ - int p; - int sample_point; - size_t nsplits; - size_t rndtried = 0; - size_t rndissued = 0; - size_t i; - unsigned int seed = sent->rand_state; - char *affix = alloca(l+2+1); /* word + ".=" + NUL: Max. affix length */ - bool use_sampling = true; - - if (NULL == afdict) return false; - as = afdict->anysplit; - - if ((NULL == as) || (0 == as->nparts)) return false; /* Anysplit disabled */ - - if (TS_ANYSPLIT == unsplit_word->tokenizing_step) - return true; /* We already handled this token. 
*/ - - if (lutf > MAX_WORD_TO_SPLIT) - { - Gword *w = issue_word_alternative(sent, unsplit_word, "AS>", - 0,NULL, 1,&word, 0,NULL); - w->tokenizing_step = TS_ANYSPLIT; - return true; - } - - if (0 == l) - { - prt_error("Warning: anysplit(): word length 0\n"); - return false; - } - - stemsubscr = AFCLASS(afdict, AFDICT_STEMSUBSCR); - - // seed = time(NULL)+(unsigned int)(long)&seed; - -#if DEBUG_ANYSPLIT - gw = word; -#endif - - nsplits = split(lutf, as->nparts, &as->scl[lutf]); - if (0 == nsplits) - { - prt_error("Warning: anysplit(): split() failed (shouldn't happen)\n"); - return false; - } - - if (as->altsmax >= nsplits) - { - /* Issue everything */ - sample_point = -1; - use_sampling = false; - } - - lgdebug(+D_AS, "Start%s sampling: word=%s, nsplits=%zu, maxsplits=%d, " - "as->altsmin=%zu, as->altsmax=%zu\n", use_sampling ? "" : " no", - word, nsplits, as->nparts, as->altsmin, as->altsmax); - - while (rndtried < nsplits && (!use_sampling || (rndissued < as->altsmax))) - { - if (use_sampling) - { - sample_point = rng_uniform(&seed, nsplits); - - if (sample_point < 0) /* Cannot happen with rand_r() */ - { - prt_error("Error: rng: %s\n", strerror(errno)); - return false; - } - } - else - { - sample_point++; - } - - lgdebug(D_AS, "Sample: %d ", sample_point); - if (as->scl[lutf].p_tried[sample_point]) - { - lgdebug(D_AS+1, "(repeated)\n"); - continue; - } - lgdebug(D_AS+1, "(new)"); - rndtried++; - as->scl[lutf].p_tried[sample_point] = true; - if (morpheme_match(sent, word, lutf, &as->scl[lutf].sp[sample_point*as->nparts])) - { - as->scl[lutf].p_selected[sample_point] = true; - rndissued++; - } - else - { - lgdebug(D_AS, "\n"); - } - } - - lgdebug(D_AS, "Results: word '%s' (byte-length=%zu utf-chars=%zu): %zu/%zu:\n", - word, lutf, l, rndissued, nsplits); - - for (i = 0; i < nsplits; i++) - { - const char **affixes = NULL; - int num_sufixes; - int num_affixes = 0; - - if (!as->scl[lutf].p_selected[i]) continue; - - pl = &as->scl[lutf].sp[i*as->nparts]; - bos = 0; 
- cpos = 0; - for (p = 0; p < as->nparts; p++) - { - size_t b = 0; - if (pl[0] == (int)lutf) /* This is the whole word */ - { - b = utf8_strncpy(affix, &word[bos], pl[p]-cpos); - affix[b] = '\0'; - } - else - if (0 == cpos) /* The first, but not the only morpheme */ - { - b = utf8_strncpy(affix, &word[bos], pl[p]-cpos); - affix[b] = '\0'; - } - else /* 2nd and subsequent morphemes */ - { - b = utf8_strncpy(affix, &word[bos], pl[p]-cpos); - affix[b] = '\0'; - num_affixes++; - } - altappend(sent, &affixes, affix); - - bos += b; - cpos = pl[p]; - // if (cpos == lutf) break; /* Same thing as below...*/ - if (bos == l) break; - } - - const char **prefix_position, **stem_position , **suffix_position; - switch (num_affixes) - { - case 0: - prefix_position = NULL; - stem_position = &affixes[0]; /* May be just a word here */ - suffix_position = NULL; - num_sufixes = 0; - break; - case 1: - prefix_position = NULL; - stem_position = &affixes[0]; - suffix_position = &affixes[1]; - num_sufixes = 1; - break; - default: - prefix_position =&affixes[0]; - stem_position = &affixes[1]; - suffix_position = &affixes[2]; - num_sufixes = num_affixes - 1; - break; - } - if (num_affixes > 0) - { - if (0 != stemsubscr->length) { - strcpy(affix, stem_position[0]); - strcat(affix, stemsubscr->string[0]); - stem_position[0] = affix; - } - } - - // XXX FIXME -- this is wrong - it assumes a - // variable number of suffixes. - /* Here a leading INFIX_MARK is added to the suffixes if needed. */ - Gword *alt = issue_word_alternative(sent, unsplit_word, "AS", - (NULL == prefix_position) ? 0 : 1, prefix_position, - 1, stem_position, - num_sufixes, suffix_position); - for (Gword *w = alt; w->alternative_id == alt; w = w->next[0]) - w->tokenizing_step = TS_ANYSPLIT; - free(affixes); - } - - /* 0 == sent->rand_state denotes "repeatable rand". 
*/ - if (0 != sent->rand_state) sent->rand_state = seed; - return true; -} -#undef D_AS diff -Nru link-grammar-5.3.16/link-grammar/anysplit.h link-grammar-5.5.0/link-grammar/anysplit.h --- link-grammar-5.3.16/link-grammar/anysplit.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/anysplit.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,17 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2014 Amir Plivatsky */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include "api-types.h" -#include "link-includes.h" - -bool anysplit(Sentence, Gword *); -void free_anysplit(Dictionary); -bool anysplit_init(Dictionary); diff -Nru link-grammar-5.3.16/link-grammar/api.c link-grammar-5.5.0/link-grammar/api.c --- link-grammar-5.3.16/link-grammar/api.c 2017-02-10 20:16:17.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/api.c 2018-04-22 14:57:33.000000000 +0000 @@ -11,40 +11,35 @@ /* */ /*************************************************************************/ -#include -#include #include -#include -#include "analyze-linkage.h" +#include "api-structures.h" #include "corpus/corpus.h" -#include "count.h" -#include "dict-common.h" -#include "disjunct-utils.h" -#include "error.h" -#include "externs.h" -#include "extract-links.h" -#include "fast-match.h" -#include "linkage.h" -#include "post-process.h" -#include "preparation.h" -#include "print.h" -#include "prune.h" -#include "regex-morph.h" +#include "dict-common/dict-utils.h" // for free_X_nodes +#include "disjunct-utils.h" // for free_disjuncts +#include "linkage/linkage.h" +#include 
"memory-pool.h" +#include "parse/histogram.h" // for PARSE_NUM_OVERFLOW +#include "parse/parse.h" +#include "post-process/post-process.h" // for post_process_new() +#include "prepare/exprune.h" +#include "string-set.h" #include "resources.h" -#include "score.h" #include "sat-solver/sat-encoder.h" -#include "spellcheck.h" -#include "string-set.h" -#include "structures.h" -#include "tokenize.h" +#include "tokenize/spellcheck.h" +#include "tokenize/tokenize.h" +#include "tokenize/word-structures.h" // Needed for Word_struct/free_X_node #include "utilities.h" -#include "wordgraph.h" -#include "word-utils.h" /* Its OK if this is racey across threads. Any mild shuffling is enough. */ static unsigned int global_rand_state = 0; +int verbosity; +/* debug and test should not be NULL since they can be used before they + * are assigned a value by parse_options_get_...() */ +char * debug = (char *)""; +char * test = (char *)""; + /*************************************************************** * * Routines for setting Parse_Options @@ -102,7 +97,7 @@ Parse_Options po; init_memusage(); - po = (Parse_Options) xalloc(sizeof(struct Parse_Options_s)); + po = (Parse_Options) malloc(sizeof(struct Parse_Options_s)); /* Here's where the values are initialized */ @@ -155,10 +150,10 @@ return po; } -int parse_options_delete(Parse_Options opts) +int parse_options_delete(Parse_Options opts) { resources_delete(opts->resources); - xfree(opts, sizeof(struct Parse_Options_s)); + free(opts); return 0; } @@ -363,7 +358,7 @@ } void parse_options_set_short_length(Parse_Options opts, int short_length) { - opts->short_length = short_length; + opts->short_length = MIN(short_length, UNLIMITED_LEN); } int parse_options_get_short_length(Parse_Options opts) { @@ -442,230 +437,7 @@ /*************************************************************** * -* Routines for postprocessing -* -****************************************************************/ - -static Linkage linkage_array_new(int num_to_alloc) -{ - 
Linkage lkgs = (Linkage) exalloc(num_to_alloc * sizeof(struct Linkage_s)); - memset(lkgs, 0, num_to_alloc * sizeof(struct Linkage_s)); - return lkgs; -} - -void free_linkage(Linkage linkage) -{ - exfree((void *) linkage->word, sizeof(const char *) * linkage->num_words); - exfree(linkage->chosen_disjuncts, linkage->num_words * sizeof(Disjunct *)); - free(linkage->link_array); - - /* Q: Why isn't this in a string set ?? A: Because there is no - * string-set handy when we compute this. */ - if (linkage->disjunct_list_str) - { - size_t j; - for (j=0; jnum_words; j++) - { - if (linkage->disjunct_list_str[j]) - free(linkage->disjunct_list_str[j]); - } - free(linkage->disjunct_list_str); - } -#ifdef USE_CORPUS - lg_sense_delete(linkage); -#endif - - linkage_free_pp_info(linkage); - - /* XXX FIXME */ - free(linkage->wg_path); - free(linkage->wg_path_display); -} - -static void free_linkages(Sentence sent) -{ - size_t in; - Linkage lkgs = sent->lnkages; - if (!lkgs) return; - - for (in=0; innum_linkages_alloced; in++) - { - free_linkage(&lkgs[in]); - } - - exfree(lkgs, sent->num_linkages_alloced * sizeof(struct Linkage_s)); - sent->num_linkages_alloced = 0; - sent->num_linkages_found = 0; - sent->num_linkages_post_processed = 0; - sent->num_valid_linkages = 0; - sent->lnkages = NULL; -} - -/* Partial, but not full initialization of the linakge struct ... 
*/ -void partial_init_linkage(Sentence sent, Linkage lkg, unsigned int N_words) -{ - lkg->num_links = 0; - lkg->lasz = 2 * N_words; - lkg->link_array = (Link *) malloc(lkg->lasz * sizeof(Link)); - memset(lkg->link_array, 0, lkg->lasz * sizeof(Link)); - - lkg->num_words = N_words; - lkg->cdsz = N_words; - lkg->chosen_disjuncts = (Disjunct **) exalloc(lkg->cdsz * sizeof(Disjunct *)); - memset(lkg->chosen_disjuncts, 0, N_words * sizeof(Disjunct *)); - - lkg->disjunct_list_str = NULL; -#ifdef USE_CORPUS - lkg->sense_list = NULL; -#endif - - lkg->pp_info = NULL; - lkg->sent = sent; -} - -/** - * This does basic post-processing for all linkages. - */ -static void post_process_lkgs(Sentence sent, Parse_Options opts) -{ - size_t in; - size_t N_linkages_post_processed = 0; - size_t N_valid_linkages = sent->num_valid_linkages; - size_t N_linkages_alloced = sent->num_linkages_alloced; - bool twopass = sent->length >= opts->twopass_length; - - /* Special-case the "amy/ady" morphology handling. */ - if (sent->dict->affix_table->anysplit) - { - sent->num_linkages_post_processed = sent->num_valid_linkages; - return; - } - - /* (optional) First pass: just visit the linkages */ - /* The purpose of the first pass is to make the post-processing - * more efficient. Because (hopefully) by the time the real work - * is done in the 2nd pass, the relevant rule set has been pruned - * in the first pass. 
- */ - if (twopass) - { - for (in=0; in < N_linkages_alloced; in++) - { - Linkage lkg = &sent->lnkages[in]; - Linkage_info *lifo = &lkg->lifo; - - if (lifo->discarded || lifo->N_violations) continue; - - post_process_scan_linkage(sent->postprocessor, lkg); - - if ((49 == in%50) && resources_exhausted(opts->resources)) break; - } - } - - /* Second pass: actually perform post-processing */ - for (in=0; in < N_linkages_alloced; in++) - { - PP_node *ppn; - Linkage lkg = &sent->lnkages[in]; - Linkage_info *lifo = &lkg->lifo; - - if (lifo->discarded || lifo->N_violations) continue; - - ppn = do_post_process(sent->postprocessor, lkg, twopass); - - /* XXX There is no need to set the domain names if we are not - * printing them. However, deferring this until later requires - * a huge code re-org, because pp_data is needed to get the - * domain type array, and pp_data is deleted immediately below. - * Basically, pp_data and pp_node should be a part of the linkage, - * and not part of the Postprocessor struct. - * This costs about 1% performance penalty. */ - build_type_array(sent->postprocessor); - linkage_set_domain_names(sent->postprocessor, lkg); - - post_process_free_data(&sent->postprocessor->pp_data); - - if (NULL != ppn->violation) - { - N_valid_linkages--; - lifo->N_violations++; - - /* Set the message, only if not set (e.g. by sane_morphism) */ - if (NULL == lifo->pp_violation_msg) - lifo->pp_violation_msg = ppn->violation; - } - N_linkages_post_processed++; - - linkage_score(lkg, opts); - if ((9 == in%10) && resources_exhausted(opts->resources)) break; - } - - /* If the timer expired, then we never finished post-processing. - * Mark the remaining sentences as bad, as otherwise strange - * results get reported. 
*/ - for (; in < N_linkages_alloced; in++) - { - Linkage lkg = &sent->lnkages[in]; - Linkage_info *lifo = &lkg->lifo; - - if (lifo->discarded || lifo->N_violations) continue; - - N_valid_linkages--; - lifo->N_violations++; - - /* Set the message, only if not set (e.g. by sane_morphism) */ - if (NULL == lifo->pp_violation_msg) - lifo->pp_violation_msg = "Timeout during postprocessing"; - } - - print_time(opts, "Postprocessed all linkages"); - - if (verbosity_level(6)) - { - err_msg(lg_Info, "%zu of %zu linkages with no P.P. violations\n", - N_valid_linkages, N_linkages_post_processed); - } - - sent->num_linkages_post_processed = N_linkages_post_processed; - sent->num_valid_linkages = N_valid_linkages; -} - -static void sort_linkages(Sentence sent, Parse_Options opts) -{ - if (0 == sent->num_linkages_found) return; - - /* It they're randomized, don't bother sorting */ - if (0 != sent->rand_state && sent->dict->shuffle_linkages) return; - - qsort((void *)sent->lnkages, sent->num_linkages_alloced, - sizeof(struct Linkage_s), - (int (*)(const void *, const void *))opts->cost_model.compare_fn); - -#ifdef DEBUG - /* Skip in case of a timeout - sent->lnkages may be inconsistent then. */ - if (!resources_exhausted(opts->resources)) - { - /* num_linkages_post_processed sanity check (ONLY). 
*/ - size_t in; - size_t N_linkages_post_processed = 0; - for (in=0; in < sent->num_linkages_alloced; in++) - { - Linkage_info *lifo = &sent->lnkages[in].lifo; - if (lifo->discarded) break; - N_linkages_post_processed++; - } - assert(sent->num_linkages_post_processed==N_linkages_post_processed, - "Bad num_linkages_post_processed (%zu!=%zu)", - sent->num_linkages_post_processed, N_linkages_post_processed); - } -#endif - - print_time(opts, "Sorted all linkages"); -} - -/*************************************************************** -* -* Routines for creating and destroying processing Sentences +* Routines for creating destroying and processing Sentences * ****************************************************************/ @@ -673,12 +445,13 @@ { Sentence sent; - sent = (Sentence) xalloc(sizeof(struct Sentence_s)); + sent = (Sentence) malloc(sizeof(struct Sentence_s)); memset(sent, 0, sizeof(struct Sentence_s)); sent->dict = dict; sent->string_set = string_set_create(); sent->rand_state = global_rand_state; + sent->disjuncts_connectors_memblock = NULL; sent->postprocessor = post_process_new(dict->base_knowledge); @@ -736,81 +509,31 @@ static void free_sentence_words(Sentence sent) { - size_t i; - - for (i = 0; i < sent->length; i++) + for (WordIdx i = 0; i < sent->length; i++) { free_X_nodes(sent->word[i].x); - free_disjuncts(sent->word[i].d); free(sent->word[i].alternatives); } + free_sentence_disjuncts(sent); free((void *) sent->word); sent->word = NULL; } -static void wordgraph_delete(Sentence sent) -{ - Gword *w = sent->wordgraph; - - while(NULL != w) - { - Gword *w_tofree = w; - - free(w->prev); - free(w->next); - free(w->hier_position); - free(w->null_subwords); - w = w->chain_next; - free(w_tofree); - } - sent->wordgraph = sent->last_word = NULL; -} - -static void word_queue_delete(Sentence sent) -{ - struct word_queue *wq = sent->word_queue; - while (NULL != wq) - { - struct word_queue *wq_tofree = wq; - wq = wq->next; - free(wq_tofree); - }; - 
sent->word_queue = NULL; -} - -/** - * Delete the gword_set associated with the Wordgraph. - * @w First Wordgraph word. - */ -static void gword_set_delete(Gword *w) -{ - for (w = w->chain_next; NULL != w; w = w->chain_next) - { - gword_set *n; - for (gword_set *f = w->gword_set_head.chain_next; NULL != f; f = n) - { - n = f->chain_next; - free(f); - } - } -} - void sentence_delete(Sentence sent) { if (!sent) return; sat_sentence_delete(sent); free_sentence_words(sent); - gword_set_delete(sent->wordgraph); wordgraph_delete(sent); - word_queue_delete(sent); string_set_delete(sent->string_set); - free_parse_info(sent->parse_info); free_linkages(sent); post_process_free(sent->postprocessor); post_process_free(sent->constituent_pp); global_rand_state = sent->rand_state; - xfree((char *) sent, sizeof(struct Sentence_s)); + pool_delete(sent->fm_Match_node); + pool_delete(sent->Table_connector_pool); + free(sent); } int sentence_length(Sentence sent) @@ -872,639 +595,6 @@ return sent->lnkages[i].lifo.link_cost; } -/** - * Construct word paths (one or more) through the Wordgraph. - * - * Add 'current_word" to the potential path. - * Add "p" to the path queue, which defines the start of the next potential - * paths to be checked. - * - * Each path is up to the current word (not including). It doesn't actually - * construct a full path if there are null words - they break it. The final path - * is constructed when the Wordgraph termination word is encountered. - * - * Note: The final path doesn't match the linkage word indexing if the linkage - * contains empty words, at least until empty words are eliminated from the - * linkage (in compute_chosen_words()). Further processing of the path is done - * there in case morphology splits are to be hidden or there are morphemes with - * null linkage. 
- */ -static void wordgraph_path_append(Wordgraph_pathpos **nwp, const Gword **path, - Gword *current_word, /* add to the path */ - Gword *p) /* add to the path queue */ -{ - size_t n = wordgraph_pathpos_len(*nwp); - - assert(NULL != p, "Tried to add a NULL word to the word queue"); - - /* Check if the path queue already contains the word to be added to it. */ - if (NULL != *nwp) - { - const Wordgraph_pathpos *wpt; - - for (wpt = *nwp; NULL != wpt->word; wpt++) - { - if (p == wpt->word) - { - /* If we are here, there are 2 or more paths leading to this word - * (p) that end with the same number of consecutive null words that - * consist an entire alternative. These null words represent - * different ways to split the subword upward in the hierarchy, but - * since they don't have linkage we don't care which of these - * paths is used. */ - return; /* The word is already in the queue */ - } - } - } - - /* Not already in the path queue - add it. */ - *nwp = wordgraph_pathpos_resize(*nwp, n+1); - (*nwp)[n].word = p; - - if (MT_INFRASTRUCTURE == p->prev[0]->morpheme_type) - { - /* Previous word is the Wordgraph dummy word. Initialize the path. */ - (*nwp)[n].path = NULL; - } - else - { - /* We branch to another path. Duplicate it from the current path and add - * the current word to it. */ - size_t path_arr_size = (gwordlist_len(path)+1)*sizeof(*path); - - (*nwp)[n].path = malloc(path_arr_size); - memcpy((*nwp)[n].path, path, path_arr_size); - } - /* FIXME (cast) but anyway gwordlist_append() doesn't modify Gword. */ - gwordlist_append((Gword ***)&(*nwp)[n].path, current_word); -} - -/** - * Free the Wordgraph paths and the Wordgraph_pathpos array. - * In case of a match, the final path is still needed so this function is - * then invoked with free_final_path=false. 
- */ -static void wordgraph_path_free(Wordgraph_pathpos *wp, bool free_final_path) -{ - Wordgraph_pathpos *twp; - - if (NULL == wp) return; - for (twp = wp; NULL != twp->word; twp++) - { - if (free_final_path || (MT_INFRASTRUCTURE != twp->word->morpheme_type)) - free(twp->path); - } - free(wp); -} - -/* ============================================================== */ -/* A kind of morphism post-processing */ - -/* These letters create a string that should be matched by a - * SANEMORPHISM regex, given in the affix file. The empty word - * doesn't have a letter. E.g. for the Russian dictionary: "w|ts". - * It is converted here to: "^((w|ts)b)+$". - * It matches "wbtsbwbtsbwb" but not "wbtsbwsbtsb". - * FIXME? In this version of the function, 'b' is not yet supported, - * so "w|ts" is converted to "^(w|ts)+$" for now. - */ -#define AFFIXTYPE_PREFIX 'p' /* prefix */ -#define AFFIXTYPE_STEM 't' /* stem */ -#define AFFIXTYPE_SUFFIX 's' /* suffix */ -#define AFFIXTYPE_MIDDLE 'm' /* middle morpheme */ -#define AFFIXTYPE_WORD 'w' /* regular word */ -#ifdef WORD_BOUNDARIES -#define AFFIXTYPE_END 'b' /* end of input word */ -#endif - -/** - * This routine solves the problem of mis-linked alternatives, - * i.e a morpheme in one alternative that is linked to a morpheme in - * another alternative. This can happen due to the way in which word - * alternatives are implemented. - * - * It does so by checking that all the chosen disjuncts in a linkage - * (including null words) match, in the same order, a path in the - * Wordgraph. - * - * An important side effect of this check is that if the linkage is - * good, its Wordgraph path is found. - * - * Optionally (if SANEMORPHISM regex is defined in the affix file), it - * also validates that the morpheme-type sequence is permitted for the - * language. This is a sanity check of the program and the dictionary. - * - * Return true if the linkage is good, else return false. 
- */ -#define D_SLM 7 -bool sane_linkage_morphism(Sentence sent, Linkage lkg, Parse_Options opts) -{ - Wordgraph_pathpos *wp_new = NULL; - Wordgraph_pathpos *wp_old = NULL; - Wordgraph_pathpos *wpp; - Gword **next; /* next Wordgraph words of the current word */ - size_t i; - - bool match_found = true; /* if all the words are null - it's still a match */ - Gword **lwg_path; - - Dictionary afdict = sent->dict->affix_table; /* for SANEMORPHISM */ - char *const affix_types = alloca(sent->length*2 + 1); /* affix types */ - affix_types[0] = '\0'; - - lkg->wg_path = NULL; - - /* Populate the path word queue, initializing the path to NULL. */ - for (next = sent->wordgraph->next; *next; next++) - { - wordgraph_path_append(&wp_new, /*path*/NULL, /*add_word*/NULL, *next); - } - assert(NULL != wp_new, "Path word queue is empty"); - - for (i = 0; i < lkg->num_words; i++) - { - Disjunct *cdj; /* chosen disjunct */ - - lgdebug(D_SLM, "lkg=%p Word %zu: ", lkg, i); - - if (NULL == wp_new) - { - lgdebug(D_SLM, "- No more words in the wordgraph\n"); - match_found = false; - break; - } - - if (wp_old != wp_new) - { - wordgraph_path_free(wp_old, true); - wp_old = wp_new; - } - wp_new = NULL; - //wordgraph_pathpos_print(wp_old); - - cdj = lkg->chosen_disjuncts[i]; - /* Handle null words */ - if (NULL == cdj) - { - lgdebug(D_SLM, "- Null word\n"); - /* A null word matches any word in the Wordgraph - - * so, unconditionally proceed in all paths in parallel. */ - match_found = false; - for (wpp = wp_old; NULL != wpp->word; wpp++) - { - if (NULL == wpp->word->next) - continue; /* This path encountered the Wordgraph end */ - - /* The null words cannot be marked here because wpp->path consists - * of pointers to the Wordgraph words, and these words are common to - * all the linkages, with potentially different null words in each - * of them. However, the position of the null words can be inferred - * from the null words in the word array of the Linkage structure. 
- */ - for (next = wpp->word->next; NULL != *next; next++) - { - match_found = true; - wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next); - } - } - continue; - } - - if (!match_found) - { - const char *e = "Internal error: Too many words in the linkage"; - lgdebug(D_SLM, "- %s\n", e); - prt_error("Error: %s.\n", e); - break; - } - - if (verbosity_level(D_SLM)) print_with_subscript_dot(cdj->string); - - match_found = false; - /* Proceed in all the paths in which the word is found. */ - for (wpp = wp_old; NULL != wpp->word; wpp++) - { - for (gword_set *gl = cdj->originating_gword; NULL != gl; gl = gl->next) - { - if (gl->o_gword == wpp->word) - { - match_found = true; - for (next = wpp->word->next; NULL != *next; next++) - { - wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next); - } - break; - } - } - } - - if (!match_found) - { - /* FIXME? A message can be added here if there are too many words - * in the linkage (can happen only if there is an internal error). */ - lgdebug(D_SLM, "- No Wordgraph match\n"); - break; - } - lgdebug(D_SLM, "\n"); - } - - if (match_found) - { - match_found = false; - /* Validate that there are no missing words in the linkage. - * It is so, if the dummy termination word is found in the - * new pathpos queue. - */ - if (NULL != wp_new) - { - for (wpp = wp_new; NULL != wpp->word; wpp++) - { - if (MT_INFRASTRUCTURE == wpp->word->morpheme_type) { - match_found = true; - /* Exit the loop with with wpp of the termination word. */ - break; - } - } - } - if (!match_found) - lgdebug(D_SLM, "%p Missing word(s) at the end of the linkage.\n", lkg); - } - -#define DEBUG_morpheme_type 0 - /* Check the morpheme type combination. - * If null_count > 0, the morpheme type combination may be invalid - * due to null subwords, so skip this check. 
*/ - if (match_found && (0 == sent->null_count) && - (NULL != afdict) && (NULL != afdict->regex_root)) - { - const Gword **w; - char *affix_types_p = affix_types; - - /* Construct the affix_types string. */ -#if DEBUG_morpheme_type - print_lwg_path(wpp->path); -#endif - i = 0; - for (w = wpp->path; *w; w++) - { - i++; - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-enum" - switch ((*w)->morpheme_type) - { -#pragma GCC diagnostic pop - default: - /* What to do with the rest? */ - case MT_WORD: - *affix_types_p = AFFIXTYPE_WORD; - break; - case MT_PREFIX: - *affix_types_p = AFFIXTYPE_PREFIX; - break; - case MT_STEM: - *affix_types_p = AFFIXTYPE_STEM; - break; - case MT_MIDDLE: - *affix_types_p = AFFIXTYPE_MIDDLE; - break; - case MT_SUFFIX: - *affix_types_p = AFFIXTYPE_SUFFIX; - break; - } - -#if DEBUG_morpheme_type - lgdebug(D_SLM, "Word %zu: %s affixtype=%c\n", - i, (*w)->subword, *affix_types_p); -#endif - - affix_types_p++; - } - *affix_types_p = '\0'; - -#ifdef WORD_BOUNDARIES /* not yet implemented */ - { - const Gword *uw; - - /* If w is an "end subword", return its unsplit word, else NULL. */ - uw = word_boundary(w); /* word_boundary() unimplemented */ - - if (NULL != uw) - { - *affix_types_p++ = AFFIXTYPE_END; - lgdebug(D_SLM, "%p End of Gword %s\n", lkg, uw->subword); - } - } -#endif - - /* Check if affix_types is valid according to SANEMORPHISM. */ - if (('\0' != affix_types[0]) && - (NULL == match_regex(afdict->regex_root, affix_types))) - { - /* Morpheme type combination is invalid */ - match_found = false; - /* Notify to stdout, so it will be shown along with the result. - * XXX We should have a better way to notify. 
*/ - if (0 < opts->verbosity) - prt_error("Warning: Invalid morpheme type combination '%s'.\n" - "Run with !bad and !verbosity>"STRINGIFY(D_USER_MAX) - " to debug\n", affix_types); - } - } - - if (match_found) lwg_path = (Gword **)wpp->path; /* OK to modify */ - wordgraph_path_free(wp_old, true); - wordgraph_path_free(wp_new, !match_found); - - if (match_found) - { - if ('\0' != affix_types[0]) - { - lgdebug(D_SLM, "%p Morpheme type combination '%s'\n", lkg, affix_types); - } - lgdebug(+D_SLM, "%p SUCCEEDED\n", lkg); - lkg->wg_path = lwg_path; - return true; - } - - /* Oh no ... invalid morpheme combination! */ - lgdebug(D_SLM, "%p FAILED\n", lkg); - return false; -} -#undef D_SLM - -static void free_sentence_disjuncts(Sentence sent) -{ - size_t i; - - for (i = 0; i < sent->length; ++i) - { - free_disjuncts(sent->word[i].d); - sent->word[i].d = NULL; - } -} - -static bool setup_linkages(Sentence sent, fast_matcher_t* mchxt, - count_context_t* ctxt, - Parse_Options opts) -{ - bool overflowed = build_parse_set(sent, mchxt, ctxt, sent->null_count, opts); - print_time(opts, "Built parse set"); - - if (overflowed && (1 < opts->verbosity)) - { - err_ctxt ec = { sent }; - err_msgc(&ec, lg_Warn, "Warning: Count overflow.\n" - "Considering a random subset of %zu of an unknown and large number of linkages", - opts->linkage_limit); - } - - if (sent->num_linkages_found == 0) - { - sent->num_linkages_alloced = 0; - sent->num_linkages_post_processed = 0; - sent->num_valid_linkages = 0; - sent->lnkages = NULL; - return overflowed; - } - - size_t N_linkages_alloced = sent->num_linkages_found; - if (N_linkages_alloced > opts->linkage_limit) - N_linkages_alloced = opts->linkage_limit; - - sent->num_linkages_alloced = N_linkages_alloced; - - /* Now actually malloc the array in which we will process linkages. */ - /* We may have been called before, e.g. this might be a panic parse, - * and the linkages array may still be there from last time. 
- * XXX free_linkages() zeros sent->num_linkages_found. */ - if (sent->lnkages) free_linkages(sent); - sent->lnkages = linkage_array_new(N_linkages_alloced); - - return overflowed; -} - -/** - * This fills the linkage array with morphologically-acceptable - * linakges. - */ -static void process_linkages(Sentence sent, bool overflowed, Parse_Options opts) -{ - if (0 == sent->num_linkages_found) return; - - /* Pick random linkages if we get more than what was asked for. */ - bool pick_randomly = overflowed || - (sent->num_linkages_found != (int) sent->num_linkages_alloced); - - Parse_info pi = sent->parse_info; - pi->rand_state = sent->rand_state; - sent->num_valid_linkages = 0; - size_t N_invalid_morphism = 0; - - size_t itry = 0; - size_t in = 0; - size_t maxtries = sent->num_linkages_alloced; - - /* If we're picking randomly, then try as many as we are allowed. */ - if (pick_randomly) maxtries = sent->num_linkages_found; - - /* In the case of overflow, which will happen for some long - * sentences, but is particularly common for the amy/ady random - * splitters, we want to find as many morpho-acceptable linkages - * as possible, but keep the CPU usage down, as these might be - * very rare. This is due to a bug/feature in the interaction - * between the word-graph and the parser: valid morph linkages - * can be one-in-a-thousand.. or worse. Search for them, but - * don't over-do it. - */ -#define MAX_TRIES 250000 - if (MAX_TRIES < maxtries) maxtries = MAX_TRIES; - - bool need_init = true; - for (itry=0; itrylnkages[in]; - Linkage_info * lifo = &lkg->lifo; - - /* Negative values tell extract-links to pick randomly; for - * reproducible-rand, the actual value is the rand seed. */ - lifo->index = pick_randomly ? 
-(itry+1) : itry; - - if (need_init) - { - partial_init_linkage(sent, lkg, pi->N_words); - need_init = false; - } - extract_links(lkg, pi); - compute_link_names(lkg, sent->string_set); - remove_empty_words(lkg); - - if (sane_linkage_morphism(sent, lkg, opts)) - { - need_init = true; - in++; - sent->num_valid_linkages ++; - if (in >= sent->num_linkages_alloced) break; - } - else - { - N_invalid_morphism ++; - lkg->num_links = 0; - lkg->num_words = pi->N_words; - // memset(lkg->link_array, 0, lkg->lasz * sizeof(Link)); - memset(lkg->chosen_disjuncts, 0, pi->N_words * sizeof(Disjunct *)); - } - } - - /* The last one was alloced, but never actually used. Free it. */ - if (!need_init) free_linkage(&sent->lnkages[in]); - - /* The remainder of the array is garbage; we never filled it in. - * So just pretend that it's shorter than it is */ - sent->num_linkages_alloced = sent->num_valid_linkages; - - if (verbosity_level(5)) - { - prt_error("Info: sane_morphism(): %zu of %zu linkages had " - "invalid morphology construction\n", - N_invalid_morphism, sent->num_linkages_alloced); - } -} - -/** - * classic_parse() -- parse the given sentence. - * Perform parsing, using the original link-grammar parsing algorithm - * given in the original link-grammar papers. - * - * Do the parse with the minimum number of null-links within the range - * specified by opts->min_null_count and opts->max_null_count. - * - * To that end, call do_parse() with an increasing null_count, from - * opts->min_null_count up to (including) opts->max_null_count, until a - * parse is found. - * - * A note about the disjuncts save/restore that is done here: - * To increase the parsing speed, before invoking do_parse(), - * pp_and_power_prune() is invoked to removes connectors which have no - * possibility to connect. It includes a significant optimization when - * null_count==0 that makes a more aggressive removal, but this - * optimization is not appropriate when null_count>0. 
- * - * So in case this optimization has been done and a complete parse (i.e. - * a parse when null_count==0) is not found, we are left with sentence - * disjuncts which are not appropriate to continue do_parse() tries with - * null_count>0. To solve that, we need to restore the original - * disjuncts of the sentence and call pp_and_power_prune() once again. - */ -static void classic_parse(Sentence sent, Parse_Options opts) -{ - fast_matcher_t * mchxt = NULL; - count_context_t * ctxt; - bool pp_and_power_prune_done = false; - Disjunct **disjuncts_copy = NULL; - bool is_null_count_0 = (0 == opts->min_null_count); - int max_null_count = MIN((int)sent->length, opts->max_null_count); - - /* Build lists of disjuncts */ - prepare_to_parse(sent, opts); - if (resources_exhausted(opts->resources)) return; - ctxt = alloc_count_context(sent->length); - - if (is_null_count_0 && (0 < max_null_count)) - { - /* Save the disjuncts in case we need to parse with null_count>0. */ - disjuncts_copy = alloca(sent->length * sizeof(Disjunct *)); - for (size_t i = 0; i < sent->length; i++) - disjuncts_copy[i] = disjuncts_dup(sent->word[i].d); - } - - /* A parse set may have been already been built for this sentence, - * if it was previously parsed. If so we free it up before - * building another. Huh ?? How could that happen? */ -#ifdef DEBUG - if (sent->parse_info) err_msg(lg_Debug, "XXX Freeing parse_info\n"); -#endif - free_parse_info(sent->parse_info); - sent->parse_info = parse_info_new(sent->length); - - for (int nl = opts->min_null_count; nl <= max_null_count; nl++) - { - Count_bin hist; - s64 total; - - if (!pp_and_power_prune_done) - { - if (0 != nl) - { - pp_and_power_prune_done = true; - if (is_null_count_0) - opts->min_null_count = 1; /* Don't optimize for null_count==0. */ - - /* We are parsing now with null_count>0, when previously we - * parsed with null_count==0. Restore the save disjuncts. 
*/ - if (NULL != disjuncts_copy) - { - for (size_t i = 0; i < sent->length; i++) - { - free_disjuncts(sent->word[i].d); - sent->word[i].d = disjuncts_copy[i]; - } - disjuncts_copy = NULL; - } - } - pp_and_power_prune(sent, opts); - if (is_null_count_0) opts->min_null_count = 0; - if (resources_exhausted(opts->resources)) break; - - free_fast_matcher(mchxt); - mchxt = alloc_fast_matcher(sent); - print_time(opts, "Initialized fast matcher"); - } - - if (resources_exhausted(opts->resources)) break; - free_linkages(sent); - - sent->null_count = nl; - hist = do_parse(sent, mchxt, ctxt, sent->null_count, opts); - total = hist_total(&hist); - - if (verbosity_level(5)) - { - prt_error("Info: Total count with %zu null links: %lld\n", - sent->null_count, total); - } - - /* total is 64-bit, num_linkages_found is 32-bit. Clamp */ - total = (total > INT_MAX) ? INT_MAX : total; - total = (total < 0) ? INT_MAX : total; - - sent->num_linkages_found = (int) total; - print_time(opts, "Counted parses"); - - bool ovfl = setup_linkages(sent, mchxt, ctxt, opts); - process_linkages(sent, ovfl, opts); - post_process_lkgs(sent, opts); - - if (sent->num_valid_linkages > 0) break; - if ((0 == nl) && (0 < max_null_count) && verbosity > 0) - prt_error("No complete linkages found.\n"); - - /* If we are here, then no valid linkages were found. - * If there was a parse overflow, give up now. */ - if (PARSE_NUM_OVERFLOW < total) break; - //if (sent->num_linkages_found > 0 && nl>0) printf("NUM_LINKAGES_FOUND %d\n", sent->num_linkages_found); - } - sort_linkages(sent, opts); - - if (NULL != disjuncts_copy) - { - for (size_t i = 0; i < sent->length; i++) - free_disjuncts(disjuncts_copy[i]); - } - free_count_context(ctxt); - free_fast_matcher(mchxt); -} - int sentence_parse(Sentence sent, Parse_Options opts) { int rc; @@ -1520,6 +610,14 @@ rc = sentence_split(sent, opts); if (rc) return -1; } + else + { + /* During a panic parse, we enter here a second time, with leftover + * garbage. Free it. 
We really should make the code that is panicking + * do this free, but right now, they have no API for it, so we do it + * as a favor. XXX FIXME someday. */ + free_sentence_disjuncts(sent); + } /* Check for bad sentence length */ if (MAX_SENTENCE <= sent->length) @@ -1529,12 +627,12 @@ return -2; } - /* Initialize/free any leftover garbage */ - free_sentence_disjuncts(sent); /* Is this really needed ??? */ resources_reset(opts->resources); - /* Expressions were previously set up during the tokenize stage. */ - expression_prune(sent); + /* Expressions were set up during the tokenize stage. + * Prune them, and then parse. + */ + expression_prune(sent, opts); print_time(opts, "Finished expression pruning"); if (opts->use_sat_solver) { @@ -1556,3 +654,113 @@ } return sent->num_valid_linkages; } + +/* + * Definitions for linkgrammar_get_configuration(). + */ + +/* __VA_ARGS__ must be used because arguments may contain commas. */ +#define lg_xstr(...) lg_str(__VA_ARGS__) +#define lg_str(...) #__VA_ARGS__ + +#ifdef __STDC_VERSION__ +#define LG_S1 "__STDC_VERSION__=" lg_xstr(__STDC_VERSION__) +#else +#define LG_S1 +#endif + +/* -DCC=$(CC) is added in the Makefile. 
*/ +#ifdef CC +#define LG_CC CC +#elif _MSC_VER +#define LG_CC "lc" +#else +#define LG_CC "(unknown)" +#endif + +#ifdef __VERSION__ +#define LG_V1 "__VERSION__=" lg_xstr(__VERSION__) +#else +#define LG_V1 +#endif + +#ifdef _MSC_FULL_VER +#define LG_V2 "_MSC_FULL_VER=" lg_xstr(_MSC_FULL_VER) +#else +#define LG_V2 +#endif + +#define LG_COMP LG_CC " " LG_V1 " " LG_V2 +#define LG_STD LG_S1 + +#ifdef __unix__ +#define LG_unix "__unix__ " +#else +#define LG_unix +#endif + +#ifdef _WIN32 +#define LG_WIN32 "_WIN32 " +#else +#define LG_WIN32 +#endif + +#ifdef _WIN64 +#define LG_WIN64 "_WIN64 " +#else +#define LG_WIN64 +#endif + +#ifdef __CYGWIN__ +#define LG_CYGWIN "__CYGWIN__ " +#else +#define LG_CYGWIN +#endif + +#ifdef __MINGW32__ +#define LG_MINGW32 "__MINGW32__ " +#else +#define LG_MINGW32 +#endif + +#ifdef __MINGW64__ +#define LG_MINGW64 "__MINGW64__ " +#else +#define LG_MINGW64 +#endif + +#ifdef __APPLE__ +#define LG_APPLE "__APPLE__ " +#else +#define LG_APPLE +#endif + +#ifdef __MACH__ +#define LG_MACH "__MACH__ " +#else +#define LG_MACH +#endif + +#ifndef DICTIONARY_DIR +#define DICTIONARY_DIR "None" +#endif + +#define LG_windows LG_WIN32 LG_WIN64 LG_CYGWIN LG_MINGW32 LG_MINGW64 +#define LG_mac LG_APPLE LG_MACH + +/** + * Return information about the configuration as a static string. 
+ */ +const char *linkgrammar_get_configuration(void) +{ + return "Compiled with: " LG_COMP "\n" + "OS: " LG_HOST_OS " " LG_unix LG_windows LG_mac "\n" + "Standards: " LG_STD "\n" + "Configuration (source code):\n\t" + LG_CPPFLAGS "\n\t" + LG_CFLAGS "\n" + "Configuration (features):\n\t" + "DICTIONARY_DIR=" DICTIONARY_DIR "\n\t" + LG_DEFS + ; +} diff -Nru link-grammar-5.3.16/link-grammar/api-structures.h link-grammar-5.5.0/link-grammar/api-structures.h --- link-grammar-5.3.16/link-grammar/api-structures.h 2017-01-27 22:01:05.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/api-structures.h 2018-04-22 14:57:33.000000000 +0000 @@ -42,11 +42,11 @@ #endif /* HAVE_LOCALE_T_IN_XLOCALE_H */ #include "api-types.h" -#include "dict-structures.h" #include "corpus/corpus.h" -#include "error.h" +#include "memory-pool.h" #include "utilities.h" +typedef struct Cost_Model_s Cost_Model; struct Cost_Model_s { Cost_Model_type type; @@ -97,117 +97,20 @@ /* Options governing post-processing */ bool perform_pp_prune; /* Perform post-processing-based pruning */ size_t twopass_length; /* min sent length for two-pass post processing */ - Cost_Model cost_model; /* For sorting linkages in post_processing */ + Cost_Model cost_model; /* For sorting linkages after parsing. */ /* Options governing the generation of linkages. 
*/ size_t linkage_limit; /* The maximum number of linkages processed 100 */ bool display_morphology;/* if true, print morpho analysis of words */ }; -struct Connector_set_s +typedef struct word_queue_s word_queue_t; +struct word_queue_s { - Connector ** hash_table; - unsigned int table_size; + Gword *word; + word_queue_t *next; }; -struct Afdict_class_struct -{ - size_t mem_elems; /* number of memory elements allocated */ - size_t length; /* number of strings */ - char const ** string; -}; - -/* Used for memory management */ -struct Exp_list_s -{ - Exp * exp_list; -}; - -struct Dictionary_s -{ - Dict_node * root; - Regex_node * regex_root; - const char * name; - const char * lang; - const char * version; - const char * locale; /* Locale name */ - locale_t lctype; /* Locale argument for the *_l() functions */ - int num_entries; - - bool use_unknown_word; - bool unknown_word_defined; - bool left_wall_defined; - bool right_wall_defined; - bool shuffle_linkages; - - /* Affixes are used during the tokenization stage. 
*/ - Dictionary affix_table; - Afdict_class * afdict_class; - - /* Random morphology generator */ - struct anysplit_params * anysplit; - - /* If not null, then use spelling guesser for unknown words */ - void * spell_checker; /* spell checker handle */ -#if USE_CORPUS - Corpus * corpus; /* Statistics database */ -#endif -#ifdef HAVE_SQLITE - void * db_handle; /* database handle */ -#endif - - void (*insert_entry)(Dictionary, Dict_node *, int); - Dict_node* (*lookup_list)(Dictionary, const char*); - void (*free_lookup)(Dictionary, Dict_node*); - bool (*lookup)(Dictionary, const char*); - void (*close)(Dictionary); - - pp_knowledge * base_knowledge; /* Core post-processing rules */ - pp_knowledge * hpsg_knowledge; /* Head-Phrase Structure rules */ - Connector_set * unlimited_connector_set; /* NULL=everything is unlimited */ - String_set * string_set; /* Set of link names in the dictionary */ - Word_file * word_file_header; - - /* exp_list links together all the Exp structs that are allocated - * in reading this dictionary. Needed for freeing the dictionary - */ - Exp_list exp_list; - - /* Private data elements that come in play only while the - * dictionary is being read, and are not otherwise used. 
- */ - const char * input; - const char * pin; - bool recursive_error; - bool is_special; - char already_got_it; - int line_number; - char token[MAX_TOKEN_LENGTH]; -}; - -struct Link_s -{ - size_t lw; /* Offset into Linkage->word NOT Sentence->word */ - size_t rw; /* Offset into Linkage->word NOT Sentence->word */ - Connector * lc; - Connector * rc; - const char * link_name; /* Spelling of full link name */ -}; - -struct Parse_info_struct -{ - unsigned int x_table_size; - unsigned int log2_x_table_size; - X_table_connector ** x_table; /* Hash table */ - Parse_set * parse_set; - int N_words; /* Number of words in current sentence; - Computed by separate_sentence() */ - - /* thread-safe random number state */ - unsigned int rand_state; -}; - - struct Sentence_s { Dictionary dict; /* Words are defined from this dictionary */ @@ -215,22 +118,20 @@ size_t length; /* Number of words */ Word *word; /* Array of words after tokenization */ String_set * string_set; /* Used for assorted strings */ + Pool_desc * fm_Match_node; /* Fast-matcher Match_node memory pool */ + Pool_desc * Table_connector_pool; /* Count memoizing memory pool */ - /* Wordgraph stuff. FIXME: typedef for structs. */ + /* Wordgraph stuff. FIXME: create stand-alone struct for these. */ Gword *wordgraph; /* Tokenization wordgraph */ Gword *last_word; /* FIXME Last issued word */ - struct word_queue /* Element in queue of words to tokenize */ - { - Gword *word; - struct word_queue *next; - } *word_queue; - struct word_queue *word_queue_last; + word_queue_t *word_queue; /* Element in queue of words to tokenize */ + word_queue_t *word_queue_last; size_t gword_node_num; /* Debug - for differentiating between wordgraph nodes with identical subwords. */ /* Parse results */ int num_linkages_found; /* Total number before postprocessing. This - is returned by the count() function */ + is returned by the do_count() function */ size_t num_linkages_alloced;/* Total number of linkages allocated. 
the number post-processed might be fewer because some are non-canonical */ @@ -245,148 +146,13 @@ Postprocessor * postprocessor; Postprocessor * constituent_pp; - /* parse_info not used by SAT solver */ - Parse_info parse_info; /* Set of parses for the sentence */ - /* thread-safe random number state */ unsigned int rand_state; #ifdef USE_SAT_SOLVER void *hook; /* Hook for the SAT solver */ #endif /* USE_SAT_SOLVER */ -}; - -/********************************************************* - * - * Post processing - * XXX FIXME: most of these structures should not be in the - * public API; they're here because they're tangled into the - * hpsg post-processing. - **********************************************************/ - -struct Domain_s -{ - const char * string; - List_o_links * lol; - DTreeLeaf * child; - Domain * parent; - size_t size; - size_t start_link; /* the link that started this domain */ - char type; /* one letter name */ -}; - - -struct DTreeLeaf_s -{ - Domain * parent; - DTreeLeaf * next; - int link; -}; - -struct PP_data_s -{ - List_o_links ** word_links; - size_t wowlen; - size_t N_domains; - Domain * domain_array; /* The domains, sorted by size */ - size_t domlen; /* Allocated size of domain_array */ - size_t num_words; /* Number of words in linkage */ - List_o_links * links_to_ignore; - - bool *visited; /* For the depth-first search */ - size_t vlength; /* Length of visited array */ -}; - -/* A new Postprocessor struct is alloc'ed for each sentence. It contains - * sentence-specific post-processing information. 
- */ -struct Postprocessor_s -{ - pp_knowledge * knowledge; /* Internal rep'n of the actual rules */ - int n_global_rules_firing; /* this & the next are diagnostic */ - int n_local_rules_firing; - pp_linkset *set_of_links_of_sentence; /* seen in *any* linkage of sent */ - pp_linkset *set_of_links_in_an_active_rule;/*used in *some* linkage of sent*/ - int *relevant_contains_one_rules; /* -1-terminated list of indices */ - int *relevant_contains_none_rules; - bool q_pruned_rules; /* don't prune rules more than once in p.p. */ - String_set *string_set; /* Link names seen for sentence */ - - /* Per-linkage state; this data must be reset prior to processing - * each new linkage. */ - PP_node *pp_node; - PP_data pp_data; -}; - -/********************************************************* - * - * Linkages - * - **********************************************************/ - -/** - * This summarizes the linkage status. - */ -struct Linkage_info_struct -{ - int index; /* Index into the parse_set */ - bool discarded; - short N_violations; - short unused_word_cost; - short link_cost; - - double disjunct_cost; - double corpus_cost; - const char *pp_violation_msg; -}; - -struct PP_info_s -{ - size_t num_domains; - const char ** domain_name; -}; - -/** - * num_links: - * The number of links in the current linkage. Computed by - * extract_linkage(). - * - * chosen_disjuncts[] - * This is an array pointers to disjuncts, one for each word, that is - * computed by extract_links(). It represents the chosen disjuncts - * for the current linkage. It is used to compute the cost of the - * linkage, and also by compute_chosen_words() to compute the - * chosen_words[]. - * - * link_array[] - * This is an array of links. These links define the current linkage. - * It is computed by extract_links(). It is used by analyze_linkage(). 
- */ -struct Linkage_s -{ - WordIdx num_words; /* Number of (tokenized) words */ - bool is_sent_long; /* num_words >= twopass_length */ - const char * * word; /* Array of word spellings */ - - size_t num_links; /* Number of links in array */ - Link * link_array; /* Array of links */ - size_t lasz; /* Alloc'ed length of link_array */ - - Disjunct ** chosen_disjuncts; /* Disjuncts used, one per word */ - size_t cdsz; /* Alloc'ed length of chosen_disjuncts */ - char ** disjunct_list_str; /* Stringified version of above */ -#ifdef USE_CORPUS - Sense ** sense_list; /* Word senses, inferred from disjuncts */ -#endif - - Gword **wg_path; /* Linkage Wordgraph path */ - Gword **wg_path_display; /* ... for !morphology=0. Experimental. */ - //size_t *wg_path_index; /* Displayed-word indices in wg_path (FIXME?)*/ - - Linkage_info lifo; /* Parse_set index and cost information */ - PP_info * pp_info; /* PP domain info, one for each link */ - - Sentence sent; /* Used for common linkage data */ + void *disjuncts_connectors_memblock; }; #endif diff -Nru link-grammar-5.3.16/link-grammar/api-types.h link-grammar-5.5.0/link-grammar/api-types.h --- link-grammar-5.3.16/link-grammar/api-types.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/api-types.h 2018-01-22 18:48:21.000000000 +0000 @@ -10,59 +10,35 @@ /* */ /*************************************************************************/ -/* This file is somewhat misnamed, as everything here defines the - * link-private, internal-use-only "api", which is subject to change - * from revision to revision. No external code should link to this - * stuff. +/* This file is somewhat misnamed, as everything here is private -- + * these are the internal-use-only API objects interfacing between + * different subsystems. They are all subject to change without notice. + * No external code should attempt to access this stuff. 
*/ #ifndef _API_TYPES_H_ #define _API_TYPES_H_ -#define MAX_TOKEN_LENGTH 250 /* Maximum number of chars in a token */ - -/* MAX_SENTENCE cannot be more than 65534, because word MAX_SENTENCE+1 is - * used to indicate that nothing can connect to this connector, and this - * should fit in two bytes (because the word field of a connector is an - * unsigned short). - */ -#define MAX_SENTENCE 254 /* Maximum number of words in a sentence */ - -/* Widely used private typedefs */ +/* Widely used typedefs */ +typedef struct Exp_struct Exp; typedef struct Connector_struct Connector; -typedef struct Cost_Model_s Cost_Model; -typedef struct Domain_s Domain; -typedef struct DTreeLeaf_s DTreeLeaf; -typedef struct Exp_list_s Exp_list; -typedef struct Image_node_struct Image_node; typedef struct Linkage_info_struct Linkage_info; -typedef struct Parse_info_struct *Parse_info; typedef struct Postprocessor_s Postprocessor; -typedef struct PP_data_s PP_data; -typedef struct PP_info_s PP_info; -typedef struct Regex_node_s Regex_node; typedef struct Resources_s * Resources; /* Some of the more obscure typedefs */ -typedef struct count_context_s count_context_t; -typedef struct fast_matcher_s fast_matcher_t; - typedef struct Connector_set_s Connector_set; typedef struct Disjunct_struct Disjunct; typedef struct Link_s Link; -typedef struct List_o_links_struct List_o_links; -typedef struct Parse_set_struct Parse_set; typedef struct String_set_s String_set; -typedef struct Afdict_class_struct Afdict_class; typedef struct Word_struct Word; typedef struct Gword_struct Gword; -typedef struct X_table_connector_struct X_table_connector; - +typedef struct gword_set gword_set; /* Post-processing structures */ typedef struct pp_knowledge_s pp_knowledge; typedef struct pp_linkset_s pp_linkset; -typedef struct PP_node_struct PP_node; +typedef struct PP_domains_s PP_domains; typedef struct corpus_s Corpus; typedef struct sense_s Sense; @@ -71,4 +47,3 @@ typedef struct Wordgraph_pathpos_s 
Wordgraph_pathpos; #endif - diff -Nru link-grammar-5.3.16/link-grammar/build-disjuncts.c link-grammar-5.5.0/link-grammar/build-disjuncts.c --- link-grammar-5.3.16/link-grammar/build-disjuncts.c 2017-02-10 20:16:17.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/build-disjuncts.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,511 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -/* stuff for transforming a dictionary entry into a disjunct list */ - -#include -#include "build-disjuncts.h" -#include "dict-api.h" -#include "dict-common.h" -#include "disjunct-utils.h" -#include "externs.h" -#include "string-set.h" -#include "word-utils.h" -#include "utilities.h" /* For Win32 compatibility features */ - -/* Temporary connectors used while converting expressions into disjunct lists */ -typedef struct Tconnector_struct Tconnector; -struct Tconnector_struct -{ - char multi; /* TRUE if this is a multi-connector */ - char dir; /* '-' for left and '+' for right */ - Tconnector * next; - const char * string; -}; - -typedef struct clause_struct Clause; -struct clause_struct -{ - Clause * next; - double cost; - double maxcost; - Tconnector * c; -}; - -static void free_Tconnectors(Tconnector *e) -{ - Tconnector * n; - for(;e != NULL; e=n) - { - n = e->next; - xfree((char *)e, sizeof(Tconnector)); - } -} - -static void free_clause_list(Clause *c) -{ - Clause *c1; - while (c != NULL) - { - c1 = c->next; - 
free_Tconnectors(c->c); - xfree((char *)c, sizeof(Clause)); - c = c1; - } -} - -/** - * reverse the order of the list e. destructive - */ -static Tconnector * Treverse(Tconnector *e) -{ - Tconnector * head, *x; - head = NULL; - while (e != NULL) { - x = e->next; - e->next = head; - head = e; - e = x; - } - return head; -} - -/** - * reverse the order of the list e. destructive - */ -static Connector * reverse(Connector *e) -{ - Connector * head, *x; - head = NULL; - while (e != NULL) { - x = e->next; - e->next = head; - head = e; - e = x; - } - return head; -} - -/** - * Builds a new list of connectors that is the catenation of e1 with e2. - * does not effect lists e1 or e2. Order is maintained. - */ -static Tconnector * catenate(Tconnector * e1, Tconnector * e2) -{ - Tconnector * e, * head; - head = NULL; - for (;e1 != NULL; e1 = e1->next) { - e = (Tconnector *) xalloc(sizeof(Tconnector)); - *e = *e1; - e->next = head; - head = e; - } - for (;e2 != NULL; e2 = e2->next) { - e = (Tconnector *) xalloc(sizeof(Tconnector)); - *e = *e2; - e->next = head; - head = e; - } - return Treverse(head); -} - -/** - * build the connector for the terminal node n - */ -static Tconnector * build_terminal(Exp * e) -{ - Tconnector * c; - c = (Tconnector *) xalloc(sizeof(Tconnector)); - c->string = e->u.string; - c->multi = e->multi; - c->dir = e->dir; - c->next = NULL; - return c; -} - -/** - * Build the clause for the expression e. 
Does not change e - */ -static Clause * build_clause(Exp *e) -{ - Clause *c = NULL, *c1, *c2, *c3, *c4, *c_head; - E_list * e_list; - - assert(e != NULL, "build_clause called with null parameter"); - if (e->type == AND_type) - { - c1 = (Clause *) xalloc(sizeof (Clause)); - c1->c = NULL; - c1->next = NULL; - c1->cost = 0.0; - c1->maxcost = 0.0; - for (e_list = e->u.l; e_list != NULL; e_list = e_list->next) - { - c2 = build_clause(e_list->e); - c_head = NULL; - for (c3 = c1; c3 != NULL; c3 = c3->next) - { - for (c4 = c2; c4 != NULL; c4 = c4->next) - { - c = (Clause *) xalloc(sizeof (Clause)); - c->cost = c3->cost + c4->cost; - c->maxcost = fmaxf(c3->maxcost,c4->maxcost); - c->c = catenate(c3->c, c4->c); - c->next = c_head; - c_head = c; - } - } - free_clause_list(c1); - free_clause_list(c2); - c1 = c_head; - } - c = c1; - } - else if (e->type == OR_type) - { - /* we'll catenate the lists of clauses */ - c = NULL; - for (e_list = e->u.l; e_list != NULL; e_list = e_list->next) - { - c1 = build_clause(e_list->e); - while(c1 != NULL) { - c3 = c1->next; - c1->next = c; - c = c1; - c1 = c3; - } - } - } - else if (e->type == CONNECTOR_type) - { - c = (Clause *) xalloc(sizeof(Clause)); - c->c = build_terminal(e); - c->cost = 0.0; - c->maxcost = 0.0; - c->next = NULL; - } - else - { - assert(false, "an expression node with no type"); - } - - /* c now points to the list of clauses */ - for (c1 = c; c1 != NULL; c1 = c1->next) - { - c1->cost += e->cost; - /* c1->maxcost = MAX(c1->maxcost,e->cost); */ - /* Above is how Dennis had it. Someone changed it to below. - * However, this can sometimes lead to a maxcost that is less - * than the cost ! -- which seems wrong to me ... seems Dennis - * had it right!? 
- */ - c1->maxcost += e->cost; - } - return c; -} - -#ifdef DEBUG -/* Misc printing functions, useful for debugging */ - -static void print_Tconnector_list(Tconnector * e) -{ - for (;e != NULL; e=e->next) { - if (e->multi) printf("@"); - printf("%s",e->string); - printf("%c", e->dir); - if (e->next != NULL) printf(" "); - } -} - -GNUC_UNUSED static void print_clause_list(Clause * c) -{ - for (;c != NULL; c=c->next) { - printf(" Clause: "); - printf("(%4.2f, %4.2f) ", c->cost, c->maxcost); - print_Tconnector_list(c->c); - printf("\n"); - } -} - -static void print_connector_list(Connector * e) -{ - for (;e != NULL; e=e->next) - { - printf("%s",e->string); - if (e->next != NULL) printf(" "); - } -} - -GNUC_UNUSED static void print_disjunct_list(Disjunct * c) -{ - for (;c != NULL; c=c->next) { - printf("%10s: ", c->string); - printf("(%f) ", c->cost); - print_connector_list(c->left); - printf(" <--> "); - print_connector_list(c->right); - printf("\n"); - } -} -#endif /* DEBUG */ - -/** - * Build a new list of connectors starting from the Tconnectors - * in the list pointed to by e. Keep only those whose strings whose - * direction has the value c. - */ -static Connector * extract_connectors(Tconnector *e, int c) -{ - Connector *e1; - if (e == NULL) return NULL; - if (e->dir == c) - { - e1 = connector_new(); - e1->next = extract_connectors(e->next,c); - e1->multi = e->multi; - e1->string = e->string; - e1->nearest_word = 0; - return e1; - } - else - { - return extract_connectors(e->next,c); - } -} - -/** - * Build a disjunct list out of the clause list c. - * string is the print name of word that generated this disjunct. 
- */ -static Disjunct * -build_disjunct(Clause * cl, const char * string, double cost_cutoff) -{ - Disjunct *dis, *ndis; - dis = NULL; - for (; cl != NULL; cl = cl->next) - { - if (cl->maxcost <= cost_cutoff) - { - ndis = (Disjunct *) xalloc(sizeof(Disjunct)); - ndis->left = reverse(extract_connectors(cl->c, '-')); - ndis->right = reverse(extract_connectors(cl->c, '+')); - ndis->string = string; - ndis->cost = cl->cost; - ndis->next = dis; - dis = ndis; - } - } - return dis; -} - -Disjunct * build_disjuncts_for_exp(Exp* exp, const char *word, double cost_cutoff) -{ - Clause *c ; - Disjunct * dis; - /* print_expression(exp); printf("\n"); */ - c = build_clause(exp); - /* print_clause_list(c); */ - dis = build_disjunct(c, word, cost_cutoff); - /* print_disjunct_list(dis); */ - free_clause_list(c); - return dis; -} - -#if DEBUG -/* There is a much better print_expression elsewhere - * This one is for low-level debug. */ -void prt_exp(Exp *e, int i) -{ - if (e == NULL) return; - - for(int j =0; jtype, e->dir, e->multi, e->cost); - if (e->type != CONNECTOR_type) - { - E_list *l = e->u.l; - while(l) - { - prt_exp(l->e, i+2); - l = l->next; - } - } - else - { - for(int j =0; ju.string); - } -} - -void prt_exp_mem(Exp *e, int i) -{ - char unknown_type[32] = ""; - const char *type = unknown_type; - - if (e == NULL) return; - - if (e->type > 0 && e->type <= 3) - { - type = ((const char *[]) {"OR_type", "AND_type", "CONNECTOR_type"}) [e->type-1]; - } - else - { - snprintf(unknown_type, sizeof(type)-1, "unknown-%d", e->type); - type = unknown_type; - } - - for(int j =0; jcost); - if (e->type != CONNECTOR_type) - { - E_list *l; - for(int j =0; ju.l); - for (l = e->u.l; NULL != l; l = l->next) - { - printf("%p", l->e); - if (NULL != l->next) printf(" "); - } - printf(")\n"); - - for (l = e->u.l; NULL != l; l = l->next) - { - prt_exp_mem(l->e, i+2); - } - } - else - { - for(int j =0; ju.string, e->dir, e->multi); - } -} -#endif - -/** - * Count the number of clauses (disjuncts) 
for the expression e. - * Should return the number of disjuncts that would be returned - * by build_disjunct(). This in turn should be equal to the number - * of clauses built by build_clause(). - * - * Only one minor cheat here: we are ignoring the cost_cutoff, so - * this potentially over-counts if the cost_cutoff is set low. - */ -static unsigned int count_clause(Exp *e) -{ - unsigned int cnt = 0; - E_list * e_list; - - assert(e != NULL, "count_clause called with null parameter"); - if (e->type == AND_type) - { - /* multiplicative combinatorial explosion */ - cnt = 1; - for (e_list = e->u.l; e_list != NULL; e_list = e_list->next) - cnt *= count_clause(e_list->e); - } - else if (e->type == OR_type) - { - /* Just additive */ - for (e_list = e->u.l; e_list != NULL; e_list = e_list->next) - cnt += count_clause(e_list->e); - } - else if (e->type == CONNECTOR_type) - { - return 1; - } - else - { - assert(false, "an expression node with no type"); - } - - return cnt; -} - -/** - * Count number of disjuncts given the dict node dn. - */ -unsigned int count_disjunct_for_dict_node(Dict_node *dn) -{ - return (NULL == dn) ? 0 : count_clause(dn->exp); -} - -/** - * build_word_expressions() -- build list of expressions for a word. - * - * Looks up a word in the dictionary, fetching from it matching words and their - * expressions. Returns NULL if it's not there. If there, it builds the list - * of expressions for the word, and returns a pointer to it. - * The subword of Gword w is used for this lookup, unless the subword is - * explicitly given as parameter s. The subword of Gword w is always used as - * the base word for each expression, and its subscript is the one from the - * dictionary word of the expression. - */ -X_node * build_word_expressions(Sentence sent, const Gword *w, const char *s) -{ - Dict_node * dn, *dn_head; - X_node * x, * y; - Exp_list eli; - const Dictionary dict = sent->dict; - - eli.exp_list = NULL; - dn_head = dictionary_lookup_list(dict, NULL == s ? 
w->subword : s); - x = NULL; - dn = dn_head; - while (dn != NULL) - { - y = (X_node *) xalloc(sizeof(X_node)); - y->next = x; - x = y; - x->exp = copy_Exp(dn->exp); - if (NULL == s) - { - x->string = dn->string; - } - else - { - dyn_str *xs = dyn_str_new(); - const char *sm = strrchr(dn->string, SUBSCRIPT_MARK); - - dyn_strcat(xs, w->subword); - if (NULL != sm) dyn_strcat(xs, sm); - x->string = string_set_add(xs->str, sent->string_set); - dyn_str_delete(xs); - } - x->word = w; - dn = dn->right; - } - free_lookup_list (dict, dn_head); - free_Exp_list(&eli); - return x; -} - -/** - * Turn sentence expressions into disjuncts. - * Sentence expressions must have been built, before calling this routine. - */ -void build_sentence_disjuncts(Sentence sent, double cost_cutoff) -{ - Disjunct * d; - X_node * x; - size_t w; - for (w = 0; w < sent->length; w++) - { - d = NULL; - for (x = sent->word[w].x; x != NULL; x = x->next) - { - Disjunct *dx = build_disjuncts_for_exp(x->exp, x->string, cost_cutoff); - word_record_in_disjunct(x->word, dx); - d = catenate_disjuncts(dx, d); - } - sent->word[w].d = d; - } -} diff -Nru link-grammar-5.3.16/link-grammar/build-disjuncts.h link-grammar-5.5.0/link-grammar/build-disjuncts.h --- link-grammar-5.3.16/link-grammar/build-disjuncts.h 2016-10-14 23:03:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/build-disjuncts.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,31 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2012 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _LINKGRAMMAR_BUILD_DISJUNCTS_H -#define _LINKGRAMMAR_BUILD_DISJUNCTS_H - -#include "api-types.h" -#include "structures.h" - -void build_sentence_disjuncts(Sentence sent, double cost_cutoff); -X_node * build_word_expressions(Sentence, const Gword *, const char *); -Disjunct * build_disjuncts_for_exp(Exp*, const char*, double cost_cutoff); - -unsigned int count_disjunct_for_dict_node(Dict_node *dn); - -#ifdef DEBUG -void prt_exp(Exp *, int); -void prt_exp_mem(Exp *, int); -#endif /* DEBUG */ - -#endif /* _LINKGRAMMAR_BUILD_DISJUNCTS_H */ diff -Nru link-grammar-5.3.16/link-grammar/connectors.c link-grammar-5.5.0/link-grammar/connectors.c --- link-grammar-5.3.16/link-grammar/connectors.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/connectors.c 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,523 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ +/* + * Miscellaneous utilities for dealing with word types. 
+ */ + +#include // for CHAR_BIT + +#include "dict-common/dict-utils.h" // for size_of_expression() +#include "api-structures.h" // for Parse_Options_s +#include "connectors.h" +#include "link-includes.h" // for Parse_Options + +#define WILD_TYPE '*' + +/** + * free_connectors() -- free the list of connectors pointed to by e + * (does not free any strings) + */ +void free_connectors(Connector *e) +{ + Connector * n; + for (; e != NULL; e = n) + { + n = e->next; + xfree((char *)e, sizeof(Connector)); + } +} + +void +set_connector_length_limit(Connector *c, Parse_Options opts) +{ + if (NULL == opts) + { + c->length_limit = UNLIMITED_LEN; + return; + } + + int short_len = opts->short_length; + bool all_short = opts->all_short; + int length_limit = c->desc->length_limit; + + if ((all_short && (length_limit > short_len)) || (0 == length_limit)) + c->length_limit = short_len; + else + c->length_limit = length_limit; +} + +Connector * connector_new(const condesc_t *desc, Parse_Options opts) +{ + Connector *c = (Connector *) xalloc(sizeof(Connector)); + + c->desc = desc; + c->nearest_word = 0; + c->multi = false; + set_connector_length_limit(c, opts); + //assert(0 != c->length_limit, "Connector_new(): Zero length_limit"); + + return c; +} + +/* ======================================================== */ +/* UNLIMITED-CONNECTORS handling. 
*/ + +static size_t get_connectors_from_expression(condesc_t **conlist, const Exp *e) +{ + if (e->type == CONNECTOR_type) + { + if (NULL != conlist) *conlist = e->u.condesc; + return 1; + } + + size_t cl_size = 0; + for (E_list *l = e->u.l; l != NULL; l = l->next) + { + cl_size += get_connectors_from_expression(conlist, l->e); + if (NULL != conlist) conlist++; + } + + return cl_size; +} + +static int condesc_by_uc_num(const void *a, const void *b) +{ + const condesc_t * const * cda = a; + const condesc_t * const * cdb = b; + + if ((*cda)->uc_num < (*cdb)->uc_num) return -1; + if ((*cda)->uc_num > (*cdb)->uc_num) return 1; + + return 0; +} + +#define LENGTH_LINIT_WILD_TYPE WILD_TYPE + +/** + * Set the length limit of all the connectors that match those in e. + * XXX A connector in e that doesn't match any other connector cannot + * be detected, because it has been inserted into the connector table and + * hence matches at least itself. + */ +static void set_condesc_length_limit(Dictionary dict, const Exp *e, int length_limit) +{ + size_t exp_num_con; + ConTable *ct = &dict->contable; + condesc_t **sdesc = ct->sdesc; + condesc_t **econlist; + + /* Create a connector list from the given expression. */ + exp_num_con = get_connectors_from_expression(NULL, e); + if (0 == exp_num_con) return; /* Empty connector list. */ + econlist = alloca(exp_num_con * sizeof(*econlist)); + get_connectors_from_expression(econlist, e); + + qsort(econlist, exp_num_con, sizeof(*econlist), condesc_by_uc_num); + + /* Scan the expression connector list and set length_limit. + * restart_cn is needed because several connectors in this list + * may match a given uppercase part. 
*/ + size_t restart_cn = 0, cn = 0, en; + for (en = 0; en < exp_num_con; en++) + { + for (cn = restart_cn; cn < ct->num_con; cn++) + if (sdesc[cn]->uc_num >= econlist[en]->uc_num) break; + + for (; en < exp_num_con; en++) + if (econlist[en]->uc_num >= sdesc[cn]->uc_num) break; + if (en == exp_num_con) break; + + if (econlist[en]->uc_num != sdesc[cn]->uc_num) continue; + restart_cn = cn+1; + + const char *wc_str = econlist[en]->string; + char *uc_wildcard = strchr(wc_str, LENGTH_LINIT_WILD_TYPE); + + for (; cn < ct->num_con; cn++) + { + if (NULL == uc_wildcard) + { + if (econlist[en]->uc_num != sdesc[cn]->uc_num) + break; + /* The uppercase parts are equal - match only the lowercase ones. */ + if (!lc_easy_match(econlist[en], sdesc[cn])) + continue; + } + else + { + /* The uppercase part is a prefix. */ + if (0 != strncmp(wc_str, sdesc[cn]->string, uc_wildcard - wc_str)) + break; + } + + sdesc[cn]->length_limit = length_limit; + } + } +} + +static void condesc_length_limit_def_delete(ConTable *ct) +{ + length_limit_def_t *l_next; + + for (length_limit_def_t *l = ct->length_limit_def; NULL != l; l = l_next) + { + l_next = l->next; + free(l); + } + ct->length_limit_def = NULL; +} + +void set_all_condesc_length_limit(Dictionary dict) +{ + ConTable *ct = &dict->contable; + bool unlimited_len_found = false; + + for (length_limit_def_t *l = ct->length_limit_def; NULL != l; l = l->next) + { + set_condesc_length_limit(dict, l->defexp, l->length_limit); + if (UNLIMITED_LEN == l->length_limit) unlimited_len_found = true; + } + + if (!unlimited_len_found) + { + /* If no connectors are defined as UNLIMITED_LEN, set all the + * connectors with no defined length-limit to UNLIMITED_LEN. 
*/ + condesc_t **sdesc = ct->sdesc; + + for (size_t en = 0; en < ct->num_con; en++) + { + if (0 == sdesc[en]->length_limit) + sdesc[en]->length_limit = UNLIMITED_LEN; + } + } + + condesc_length_limit_def_delete(&dict->contable); + + if (verbosity_level(D_SPEC+1)) + { + prt_error("Debug:\n%5s %-6s %3s\n\\", "num", "uc_num", "ll"); + for (size_t n = 0; n < ct->num_con; n++) + { + prt_error("%5zu %6d %3d %s\n\\", n, ct->sdesc[n]->uc_num, + ct->sdesc[n]->length_limit, ct->sdesc[n]->string); + } + prt_error("\n"); + } +} + +/* ======================================================== */ + +/** + * Pack the LC part of a connector into 64 bits, and compute a wild-card mask. + * Up to 9 characters can be so packed. + * + * Because we pack by shifts, we can do it using 7-bit per original + * character at the same overhead needed for 8-bit packing. + * + * Note: The LC part may consist of chars in the range [a-z0-9] + * (total 36) so a 6-bit packing is possible (by abs(value-60) on each + * character value). + */ +static bool connector_encode_lc(const char *lc_string, condesc_t *desc) +{ + lc_enc_t lc_mask = 0; + lc_enc_t lc_value = 0; + lc_enc_t wildcard = LC_MASK; + const char *s; + + for (s = lc_string; '\0' != *s; s++) + { + lc_value |= (lc_enc_t)(*s & LC_MASK) << ((s-lc_string)*LC_BITS); + if (*s != WILD_TYPE) lc_mask |= wildcard; + wildcard <<= LC_BITS; + }; + + if ((unsigned long)(s-lc_string) > (CHAR_BIT*sizeof(lc_value)/LC_BITS)) + { + prt_error("Error: Lower-case part '%s' is too long (%ld)\n", + lc_string, s-lc_string); + return false; + } + + desc->lc_mask = lc_mask; + desc->lc_letters = lc_value; + + return true; +} + +/** + * Calculate fixed connector information that only depend on its string. + * This information is used to speed up the parsing stage. It is + * calculated during the directory creation and doesn't change afterward. 
+ */ +bool calculate_connector_info(condesc_t * c) +{ + const char *s; + unsigned int i; + + s = c->string; + if (islower((int) *s)) s++; /* ignore head-dependent indicator */ + c->head_dependent = (c->string == s)? '\0' : c->string[0]; + + /* For most situations, all three hashes are very nearly equal; + * as to which is faster depends on the parsed text. + * For both English and Russian, there are about 100 pre-defined + * connectors, and another 2K-4K autogen'ed ones (the IDxxx idiom + * connectors, and the LLxxx suffix connectors for Russian). + * Turns out the cost of setting up the hash table dominates the + * cost of collisions. */ +#ifdef USE_DJB2 + /* djb2 hash */ + i = 5381; + while (isupper((int) *s)) /* connector tables cannot contain UTF8, yet */ + { + i = ((i << 5) + i) + *s; + s++; + } + i += i>>14; +#endif /* USE_DJB2 */ + +#define USE_JENKINS +#ifdef USE_JENKINS + /* Jenkins one-at-a-time hash */ + i = 0; + c->uc_start = s - c->string; + while (isupper((int) *s)) /* connector tables cannot contain UTF8, yet */ + { + i += *s; + i += (i<<10); + i ^= (i>>6); + s++; + } + i += (i << 3); + i ^= (i >> 11); + i += (i << 15); +#endif /* USE_JENKINS */ + +#ifdef USE_SDBM + /* sdbm hash */ + i = 0; + c->uc_start = s - c->string; + while (isupper((int) *s)) + { + i = *s + (i << 6) + (i << 16) - i; + s++; + } +#endif /* USE_SDBM */ + + c->uc_length = s - c->string - c->uc_start; + c->uc_hash = i; + + return connector_encode_lc(s, c); +} + +/* ================= Connector descriptor table. ====================== */ + +/** + * Compare connector UC parts, for qsort. + */ +static int condesc_by_uc_constring(const void * a, const void * b) +{ + const condesc_t * const * cda = a; + const condesc_t * const * cdb = b; + + /* Move the empty slots to the end. 
*/ + if (NULL == *cda) return (NULL != *cdb); + if (NULL == *cdb) return -1; + + const char *sa = &(*cda)->string[(*cda)->uc_start]; + const char *sb = &(*cdb)->string[(*cdb)->uc_start]; + + int la = (*cda)->uc_length; + int lb = (*cdb)->uc_length; + + if (la == lb) + { + //printf("la==lb A=%s b=%s, la=%d lb=%d len=%d\n",sa,sb,la,lb,la); + return strncmp(sa, sb, la); + } + + if (la < lb) + { + char *uca = strdupa(sa); + uca[la] = '\0'; + //printf("lalb A=%s b=%s, la=%d lb=%d len=%d\n",sa,ucb,la,lb,la); + return strncmp(sa, ucb, la); + } +} + +/** + * Enumerate the connectors by their UC parts - equal parts get the same number. + * It replaces the existing connector UC-part hash, and can later serve + * as table index as if it was a perfect hash. + */ +void sort_condesc_by_uc_constring(Dictionary dict) +{ + if (0 == dict->contable.num_con) + { + prt_error("Error: Dictionary %s: No connectors found.\n", dict->name); + /* FIXME: Generate a dictionary open error. */ + return; + } + + condesc_t **sdesc = malloc(dict->contable.size * sizeof(*dict->contable.hdesc)); + memcpy(sdesc, dict->contable.hdesc, dict->contable.size * sizeof(*dict->contable.hdesc)); + qsort(sdesc, dict->contable.size, sizeof(*dict->contable.hdesc), + condesc_by_uc_constring); + + /* Enumerate the connectors according to their UC part. */ + int uc_num = 0; + uint32_t uc_hash = sdesc[0]->uc_hash; /* Will be recomputed */ + + sdesc[0]->uc_num = uc_num; + for (size_t n = 1; n < dict->contable.num_con; n++) + { + condesc_t **condesc = &sdesc[n]; + +//#define DEBUG_UC_HASH_CHANGE +#ifndef DEBUG_UC_HASH_CHANGE /* Use a shortcut - not needed for correctness. */ + if ((condesc[0]->uc_hash != uc_hash) || + (condesc[0]->uc_length != condesc[-1]->uc_length)) + + { + /* We know that the UC part has been changed. 
*/ + uc_num++; + } + else +#endif + { + const char *uc1 = &condesc[0]->string[condesc[0]->uc_start]; + const char *uc2 = &condesc[-1]->string[condesc[-1]->uc_start]; + if (0 != strncmp(uc1, uc2, condesc[0]->uc_length)) + { + uc_num++; + } + } + + uc_hash = condesc[0]->uc_hash; + //printf("%5d constring=%s\n", uc_num, condesc[0]->string); + condesc[0]->uc_hash = uc_num; + } + + lgdebug(+11, "Dictionary %s: %zu different connectors " + "(%d with a different UC part)\n", + dict->name, dict->contable.num_con, uc_num+1); + + dict->contable.sdesc = sdesc; + dict->contable.num_uc = uc_num + 1; +} + +void condesc_delete(Dictionary dict) +{ + pool_delete(dict->contable.mempool); + free(dict->contable.hdesc); + free(dict->contable.sdesc); + condesc_length_limit_def_delete(&dict->contable); +} + +static condesc_t **condesc_find(ConTable *ct, const char *constring, int hash) +{ + size_t i = hash & (ct->size-1); + + while ((NULL != ct->hdesc[i]) && + !string_set_cmp(constring, ct->hdesc[i]->string)) + { + i = (i + 1) & (ct->size-1); + } + + return &ct->hdesc[i]; +} + +static void condesc_table_alloc(ConTable *ct, size_t size) +{ + ct->hdesc = (condesc_t **)malloc(size * sizeof(condesc_t *)); + memset(ct->hdesc, 0, size * sizeof(condesc_t *)); + ct->size = size; +} + +static bool condesc_insert(ConTable *ct, condesc_t **h, + const char *constring, int hash) +{ + *h = pool_alloc(ct->mempool); + (*h)->str_hash = hash; + (*h)->string = constring; + ct->num_con++; + + return calculate_connector_info(*h); +} + +#define CONDESC_TABLE_GROW_FACTOR 2 + +static bool condesc_grow(ConTable *ct) +{ + size_t old_size = ct->size; + condesc_t **old_hdesc = ct->hdesc; + + lgdebug(+11, "Growing ConTable from %zu\n", old_size); + condesc_table_alloc(ct, ct->size * CONDESC_TABLE_GROW_FACTOR); + + for (size_t i = 0; i < old_size; i++) + { + condesc_t *old_h = old_hdesc[i]; + if (NULL == old_h) continue; + condesc_t **new_h = condesc_find(ct, old_h->string, old_h->str_hash); + + if (NULL != *new_h) + 
{ + prt_error("Fatal Error: condesc_grow(): Internal error\n"); + free(old_hdesc); + return false; + } + *new_h = old_h; + } + + free(old_hdesc); + return true; +} + +condesc_t *condesc_add(ConTable *ct, const char *constring) +{ + if (0 == ct->size) + { + condesc_table_alloc(ct, ct->num_con); + ct->num_con = 0; + ct->mempool = pool_new(__func__, "ConTable", + /*num_elements*/1024, sizeof(condesc_t), + /*zero_out*/true, /*align*/true, /*exact*/false); + } + + int hash = connector_str_hash(constring); + condesc_t **h = condesc_find(ct, constring, hash); + + if (NULL == *h) + { + lgdebug(+11, "Creating connector '%s'\n", constring); + if (!condesc_insert(ct, h, constring, hash)) return NULL; + + if ((8 * ct->num_con) > (3 * ct->size)) + { + if (!condesc_grow(ct)) return NULL; + h = condesc_find(ct, constring, hash); + } + } + + return *h; +} +/* ========================= END OF FILE ============================== */ diff -Nru link-grammar-5.3.16/link-grammar/connectors.h link-grammar-5.5.0/link-grammar/connectors.h --- link-grammar-5.3.16/link-grammar/connectors.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/connectors.h 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,317 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2009, 2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#ifndef _LINK_GRAMMAR_CONNECTORS_H_ +#define _LINK_GRAMMAR_CONNECTORS_H_ + +#include // for islower() +#include +#include +#include // for uint8_t + +#include "api-types.h" +#include "lg_assert.h" +#include "memory-pool.h" +#include "string-set.h" + +/* MAX_SENTENCE cannot be more than 254, because word MAX_SENTENCE+1 is + * BAD_WORD -- it is used to indicate that nothing can connect to this + * connector, and this should fit in one byte (because the word field + * of a connector is an uint8_t, see below). + */ +#define MAX_SENTENCE 254 /* Maximum number of words in a sentence */ + +/* For faster comparisons, the connector lc part is encoded into a number + * and a mask. Each letter is encoded using LC_BITS bits. With 7 bits, it + * is possible to encode up to 9 letters in an uint64_t. */ +#define LC_BITS 7 +#define LC_MASK ((1<desc->string; +} + +static inline int connector_uc_start(const Connector *c) +{ + return c->desc->uc_start; +} + +static inline const condesc_t *connector_desc(const Connector *c) +{ + return c->desc; +} + +static inline int connector_uc_hash(const Connector * c) +{ + return c->desc->uc_hash; +} + +static inline int connector_uc_num(const Connector * c) +{ + return c->desc->uc_num; +} + + +/* Connector utilities ... */ +Connector * connector_new(const condesc_t *, Parse_Options); +void set_connector_length_limit(Connector *, Parse_Options); +void free_connectors(Connector *); + +/* Length-limits for how far connectors can reach out. */ +#define UNLIMITED_LEN 255 + +void set_all_condesc_length_limit(Dictionary); + +/** + * Returns TRUE if s and t match according to the connector matching + * rules. The connector strings must be properly formed, starting with + * zero or one lower case letters, followed by one or more upper case + * letters, followed by some other letters. + * + * The algorithm is symmetric with respect to a and b. 
+ * + * Connectors starting with lower-case letters match ONLY if the initial + * letters are DIFFERENT. Otherwise, connectors only match if the + * upper-case letters are the same, and the trailing lower case letters + * are the same (or have wildcards). + * + * The initial lower-case letters allow an initial 'h' (denoting 'head + * word') to match an initial 'd' (denoting 'dependent word'), while + * rejecting a match 'h' to 'h' or 'd' to 'd'. This allows the parser + * to work with catena, instead of just links. + */ +static inline bool easy_match(const char * s, const char * t) +{ + char is = 0, it = 0; + if (islower((int) *s)) { is = *s; s++; } + if (islower((int) *t)) { it = *t; t++; } + + if (is != 0 && it != 0 && is == it) return false; + + while (isupper((int)*s) || isupper((int)*t)) + { + if (*s != *t) return false; + s++; + t++; + } + + while ((*s!='\0') && (*t!='\0')) + { + if ((*s == '*') || (*t == '*') || (*s == *t)) + { + s++; + t++; + } + else + return false; + } + return true; +} + +/** + * Compare the lower-case and head/dependent parts of two connector descriptors. + * When this function is called, it is assumed that the upper-case + * parts are equal, and thus do not need to be checked again. + */ +static bool lc_easy_match(const condesc_t *c1, const condesc_t *c2) +{ + if ((c1->lc_letters ^ c2->lc_letters) & c1->lc_mask & c2->lc_mask) + return false; + if (('\0' != c1->head_dependent) && (c1->head_dependent == c2->head_dependent)) + return false; + + return true; +} + +/** + * This function is like easy_match(), but with connector descriptors. + * It uses a shortcut comparison of the upper-case parts. 
+ */ +static inline bool easy_match_desc(const condesc_t *c1, const condesc_t *c2) +{ + if (c1->uc_num != c2->uc_num) return false; + return lc_easy_match(c1, c2); +} + +static inline int string_hash(const char *s) +{ + unsigned int i; + + /* djb2 hash */ + i = 5381; + while (*s) + { + i = ((i << 5) + i) + *s; + s++; + } + return i; +} + +bool calculate_connector_info(condesc_t *); + +static inline int connector_str_hash(const char *s) +{ + uint32_t i; + + /* For most situations, all three hashes are very nearly equal; + * as to which is faster depends on the parsed text. + * For both English and Russian, there are about 100 pre-defined + * connectors, and another 2K-4K autogen'ed ones (the IDxxx idiom + * connectors, and the LLxxx suffix connectors for Russian). + * Turns out the cost of setting up the hash table dominates the + * cost of collisions. */ +#ifdef USE_DJB2 + /* djb2 hash */ + i = 5381; + while (*s) + { + i = ((i << 5) + i) + *s; + s++; + } + i += i>>14; +#endif /* USE_DJB2 */ + +#define USE_JENKINS +#ifdef USE_JENKINS + /* Jenkins one-at-a-time hash */ + i = 0; + while (*s) + { + i += *s; + i += (i<<10); + i ^= (i>>6); + s++; + } + i += (i << 3); + i ^= (i >> 11); + i += (i << 15); +#endif /* USE_JENKINS */ + + return i; +} + +/** + * hash function. Based on some tests, this seems to be an almost + * "perfect" hash, in that almost all hash buckets have the same size! + */ +static inline unsigned int pair_hash(unsigned int table_size, + int lw, int rw, + const Connector *le, const Connector *re, + unsigned int cost) +{ + unsigned int i; + +#if 0 + /* hash function. Based on some tests, this seems to be + * an almost "perfect" hash, in that almost all hash buckets + * have the same size! 
*/ + i = 1 << cost; + i += 1 << (lw % (log2_table_size-1)); + i += 1 << (rw % (log2_table_size-1)); + i += ((unsigned int) le) >> 2; + i += ((unsigned int) le) >> log2_table_size; + i += ((unsigned int) re) >> 2; + i += ((unsigned int) re) >> log2_table_size; + i += i >> log2_table_size; +#else + /* sdbm-based hash */ + i = cost; + i = lw + (i << 6) + (i << 16) - i; + i = rw + (i << 6) + (i << 16) - i; + i = ((int)(intptr_t)le) + (i << 6) + (i << 16) - i; + i = ((int)(intptr_t)re) + (i << 6) + (i << 16) - i; +#endif + + return i & (table_size-1); +} +#endif /* _LINK_GRAMMAR_CONNECTORS_H_ */ diff -Nru link-grammar-5.3.16/link-grammar/constituents.c link-grammar-5.5.0/link-grammar/constituents.c --- link-grammar-5.3.16/link-grammar/constituents.c 2017-01-27 22:07:11.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/constituents.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1330 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2014 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#include -#include - -#include "api-structures.h" -#include "error.h" -#include "externs.h" -#include "post-process.h" -#include "print-util.h" -#include "string-set.h" - -#define D_CONST 8 /* debug level for this file */ - -#define OPEN_BRACKET '[' -#define CLOSE_BRACKET ']' - -typedef enum {OPEN_TOK, CLOSE_TOK, WORD_TOK} CType; -typedef enum {NONE, STYPE, PTYPE, QTYPE, QDTYPE} WType; - -typedef struct -{ - const char * type; - const char * start_link; - size_t left; /* leftmost word */ - size_t right; /* rightmost word */ - int canon; - bool valid; - char domain_type; -} constituent_t; - -/* - * Context used to store assorted intermediate data - * when the constituent string is being generated. - */ -typedef struct -{ - String_set * phrase_ss; - WType * wordtype; - constituent_t * constituent; - int conlen; -} con_context_t; - - -typedef struct CNode_s CNode; - -/* Invariant: Leaf if child==NULL */ -struct CNode_s -{ - char * label; - CNode * child; - CNode * next; - int start, end; -}; - -/* ================================================================ */ - -static inline bool uppercompare(const char * s, const char * t) -{ -#if 0 /* Non-ASCII definition are not supported. */ - return (false == utf8_upper_match(s,t)); -#endif - while (isupper(*s) || isupper(*t)) - { - if (*s++ != *t++) return true; - } - return false; -} - -/** - * If a constituent c has a comma at either end, we exclude the - * comma. 
- */ -static void adjust_for_left_comma(con_context_t * ctxt, Linkage linkage, int c) -{ - int w; - w = ctxt->constituent[c].left; - if (strcmp(linkage->word[w], ",") == 0) - w++; - ctxt->constituent[c].left = w; -} - -static void adjust_for_right_comma(con_context_t *ctxt, Linkage linkage, int c) -{ - int w; - w = ctxt->constituent[c].right; - if ((strcmp(linkage->word[w], ",") == 0) || - (strcmp(linkage->word[w], "RIGHT-WALL") == 0)) - { - w--; - } - ctxt->constituent[c].right = w; -} - -static void print_constituent(con_context_t *ctxt, Linkage linkage, int c) -{ - size_t w; - - err_msg(lg_Debug, " c %2d %4s [%c] (%2zu-%2zu): ", - c, ctxt->constituent[c].type, ctxt->constituent[c].domain_type, - ctxt->constituent[c].left, ctxt->constituent[c].right); - for (w = ctxt->constituent[c].left; w <= ctxt->constituent[c].right; w++) { - err_msg(lg_Debug, "%s ", linkage->word[w]); /**PV**/ - } - err_msg(lg_Debug, "\n"); -} - -/****************************************************** - * These functions do the bulk of the actual - * constituent-generating; they're called once. - *********************************************************/ - -typedef enum -{ - CASE_S=1, - CASE_UNUSED=2, /* XXX not used anywhere... */ - CASE_REL_CLAUSE=3, - CASE_APPOS=4, - CASE_OPENER=5, - CASE_PPOPEN=6, - CASE_SVINV=7, - CASE_PART_MOD=8, - CASE_PART_OPEN=9, - -} case_type; - -/** - * This function looks for constituents of type ctype1. Say it finds - * one, call it c1. It searches for the next larger constituent of - * type ctype2, call it c2. It then generates a new constituent of - * ctype3, containing all the words in c2 but not c1. 
- */ -static int gen_comp(con_context_t *ctxt, Linkage linkage, - int numcon_total, int numcon_subl, - const char * ctype1, const char * ctype2, - const char * ctype3, case_type x) -{ - size_t w, w2, w3; - int c, c1, c2; - bool done; - c = numcon_total + numcon_subl; - - for (c1=numcon_total; c1constituent[c1].start_link)==0)) - continue; - -#ifdef REVIVE_DEAD_CODE - /* If ctype1 is X, and domain_type is t, it's an infinitive - skip it */ - if ((x==CASE_UNUSED) && (ctxt->constituent[c1].domain_type=='t')) - continue; -#endif /* REVIVE_DEAD_CODE */ - - /* If it's domain-type z, it's a subject-relative clause; - the VP doesn't need an NP */ - if (ctxt->constituent[c1].domain_type=='z') - continue; - - /* If ctype1 is X or VP, and it's not started by an S, don't generate an NP - (Neither of the two previous checks are necessary now, right?) */ -#ifdef REVIVE_DEAD_CODE - /* use this ... if ((x==CASE_S || x==CASE_UNUSED) && */ -#endif /* REVIVE_DEAD_CODE */ - if ((x==CASE_S) && - (((post_process_match("S", ctxt->constituent[c1].start_link) == 0) && - (post_process_match("SX", ctxt->constituent[c1].start_link) == 0) && - (post_process_match("SF", ctxt->constituent[c1].start_link) == 0)) || - (post_process_match("S##w", ctxt->constituent[c1].start_link) != 0))) - continue; - - /* If it's an SBAR (relative clause case), it has to be a relative clause */ - if ((x==CASE_REL_CLAUSE) && - ((post_process_match("Rn", ctxt->constituent[c1].start_link) == 0) && - (post_process_match("R*", ctxt->constituent[c1].start_link) == 0) && - (post_process_match("MX#r", ctxt->constituent[c1].start_link) == 0) && - (post_process_match("Mr", ctxt->constituent[c1].start_link) == 0) && - (post_process_match("MX#d", ctxt->constituent[c1].start_link) == 0))) - continue; - - /* If ctype1 is SBAR (clause opener case), it has to be an f domain */ - if ((x==CASE_OPENER) && (ctxt->constituent[c1].domain_type!='f')) - continue; - - /* If ctype1 is SBAR (pp opener case), it has to be a g domain */ - if 
((x==CASE_PPOPEN) && (ctxt->constituent[c1].domain_type!='g')) - continue; - - /* If ctype1 is NP (paraphrase case), it has to be started by an SI */ - if ((x==CASE_SVINV) && (post_process_match("SI", ctxt->constituent[c1].start_link)==0)) - continue; - - /* If ctype1 is VP (participle modifier case), it has to be - started by an Mv or Mg */ - if ((x==CASE_PART_MOD) && (post_process_match("M", ctxt->constituent[c1].start_link)==0)) - continue; - - /* If ctype1 is VP (participle opener case), it has - to be started by a COp */ - if ((x==CASE_PART_OPEN) && (post_process_match("COp", ctxt->constituent[c1].start_link)==0)) - continue; - - /* Now start at the bounds of c1, and work outwards until you - find a larger constituent of type ctype2 */ - if (!(strcmp(ctxt->constituent[c1].type, ctype1)==0)) - continue; - - if (verbosity_level(D_CONST)) - err_msg(lg_Debug, "Generating complement constituent for c %d of type %s\n", - c1, ctype1); - done = false; - for (w2 = ctxt->constituent[c1].left; (done == false) && (w2 != (size_t)-1); w2--) - { - for (w3 = ctxt->constituent[c1].right; w3num_words; w3++) - { - for (c2 = numcon_total; (done == false) && - (c2 < numcon_total + numcon_subl); c2++) { - if (!((ctxt->constituent[c2].left == w2) && - (ctxt->constituent[c2].right == w3)) || (c2==c1)) - continue; - if (!(strcmp(ctxt->constituent[c2].type, ctype2)==0)) - continue; - - /* if the new constituent (c) is to the left - of c1, its right edge should be adjacent to the - left edge of c1 - or as close as possible. 
*/ - if ((x==CASE_OPENER) || (x==CASE_PPOPEN) || (x==CASE_PART_OPEN)) - { - /* This is the case where c is to the - RIGHT of c1 */ - w = ctxt->constituent[c1].right + 1; - if (w > ctxt->constituent[c2].right) - { - done = true; - continue; - } - ctxt->constituent[c].left = w; - ctxt->constituent[c].right = ctxt->constituent[c2].right; - } - else - { - w = ctxt->constituent[c1].left - 1; - if (w < ctxt->constituent[c2].left) { - done = true; - continue; - } - ctxt->constituent[c].right = w; - ctxt->constituent[c].left = ctxt->constituent[c2].left; - } - - adjust_for_left_comma(ctxt, linkage, c1); - adjust_for_right_comma(ctxt, linkage, c1); - - ctxt->constituent[c].type = - string_set_add(ctype3, ctxt->phrase_ss); - ctxt->constituent[c].domain_type = 'x'; - ctxt->constituent[c].start_link = - string_set_add("XX", ctxt->phrase_ss); - if (verbosity_level(D_CONST)) - { - err_msg(lg_Debug, "Larger c found: c %d (%s); ", c2, ctype2); - err_msg(lg_Debug, "Adding constituent:\n\\"); - print_constituent(ctxt, linkage, c); - } - c++; - assert (c < ctxt->conlen, "Too many constituents"); - done = true; - } - } - } - if (verbosity_level(D_CONST)) - { - if (done == false) - err_msg(lg_Debug, "No constituent added, because no larger %s" \ - " was found\n", ctype2); - } - } - numcon_subl = c - numcon_total; - return numcon_subl; -} - -/** - * Look for a constituent started by an MVs or MVg. - * Find any VP's or ADJP's that contain it (without going - * beyond a larger S or NP). Adjust them so that - * they end right before the m domain starts. 
- */ -static void adjust_subordinate_clauses(con_context_t *ctxt, Linkage linkage, - int numcon_total, - int numcon_subl) -{ - int c, c2; - size_t w, w2; - bool done; - - for (c=numcon_total; cconstituent[c].start_link) == 1) || - (post_process_match("MVg", ctxt->constituent[c].start_link) == 1)) - { - done = false; - for (w2 = ctxt->constituent[c].left-1; (false == done) && w2 != (size_t) -1; w2--) - { - for (c2 = numcon_total; c2 < numcon_total + numcon_subl; c2++) - { - if (!((ctxt->constituent[c2].left == w2) && - (ctxt->constituent[c2].right >= ctxt->constituent[c].right))) - continue; - if ((strcmp(ctxt->constituent[c2].type, "S") == 0) || - (strcmp(ctxt->constituent[c2].type, "NP") == 0)) { - done = true; - break; - } - if ((ctxt->constituent[c2].domain_type == 'v') || - (ctxt->constituent[c2].domain_type == 'a')) - { - w = ctxt->constituent[c].left - 1; - ctxt->constituent[c2].right = w; - - if (verbosity_level(D_CONST)) - { - err_msg(lg_Debug, "Adjusting constituent %d:\n\\", c2); - print_constituent(ctxt, linkage, c2); - } - } - } - } - if (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0) - ctxt->constituent[c].left++; - } - } -} - -/****************************************************** - * These functions are called once, after constituents - * have been generated, to merge them together and fix up - * some other things. 
- * - ********************************************************/ - -static int merge_constituents(con_context_t *ctxt, Linkage linkage, int numcon_total) -{ - int c1, c2=0; - - /* First go through and give each constituent a canonical number - (the index number of the lowest-numbered constituent - identical to it) */ - for (c1 = 0; c1 < numcon_total; c1++) - { - ctxt->constituent[c1].valid = true; - ctxt->constituent[c1].canon = c1; - for (c2 = c1 + 1; c2 < numcon_total; c2++) - { - if ((ctxt->constituent[c1].left == ctxt->constituent[c2].left) && - (ctxt->constituent[c1].right == ctxt->constituent[c2].right) && - (strcmp(ctxt->constituent[c1].type, ctxt->constituent[c2].type) == 0)) - { - ctxt->constituent[c2].canon = c1; - } - } - } - - /* Now go through and find duplicates; if a pair is found, - * mark one as invalid. - */ - for (c1 = 0; c1 < numcon_total; c1++) - { - for (c2 = c1 + 1; c2 < numcon_total; c2++) - { - if (ctxt->constituent[c2].canon == ctxt->constituent[c1].canon) - ctxt->constituent[c2].valid = false; - } - } - - return numcon_total; -} - -/** - * Go through all the words. If a word is on the right end of - * an S (or SF or SX), wordtype[w]=STYPE. If it's also on the left end of a - * Pg*b, I, PP, or Pv, wordtype[w]=PTYPE. If it's a question-word - * used in an indirect question, wordtype[w]=QTYPE. If it's a - * question-word determiner, wordtype[w]=QDTYPE. Else wordtype[w]=NONE. - * (This function is called once.) 
- */ -static void generate_misc_word_info(con_context_t * ctxt, Linkage linkage) -{ - size_t w1, w2, l1, l2; - const char * label1, * label2; - - for (w1 = 0; w1 < linkage->num_words; w1++) - ctxt->wordtype[w1] = NONE; - - for (l1 = 0; l1 < linkage_get_num_links(linkage); l1++) { - w1=linkage_get_link_rword(linkage, l1); - label1 = linkage_get_link_label(linkage, l1); - if ((uppercompare(label1, "S")==0) || - (uppercompare(label1, "SX")==0) || - (uppercompare(label1, "SF")==0)) { - ctxt->wordtype[w1] = STYPE; - for (l2 = 0; l2 < linkage_get_num_links(linkage); l2++) { - w2=linkage_get_link_lword(linkage, l2); - label2 = linkage_get_link_label(linkage, l2); - if ((w1 == w2) && - ((post_process_match("Pg#b", label2)==1) || - (uppercompare(label2, "I")==0) || - (uppercompare(label2, "PP")==0) || - (post_process_match("Pv", label2)==1))) { - /* Pvf, Pgf? */ - ctxt->wordtype[w1] = PTYPE; - } - } - } - if (post_process_match("QI#d", label1)==1) { - ctxt->wordtype[w1] = QTYPE; - for (l2 = 0; l2 < linkage_get_num_links(linkage); l2++) { - w2 = linkage_get_link_lword(linkage, l2); - label2 = linkage_get_link_label(linkage, l2); - if ((w1 == w2) && (post_process_match("D##w", label2)==1)) { - ctxt->wordtype[w1] = QDTYPE; - } - } - } - if (post_process_match("Mr", label1)==1) ctxt->wordtype[w1] = QDTYPE; - if (post_process_match("MX#d", label1)==1) ctxt->wordtype[w1] = QDTYPE; - } -} - -static int new_style_conjunctions(con_context_t *ctxt, Linkage linkage, int numcon_total) -{ -#ifdef DEBUG - int c; - for (c = 0; c < numcon_total; c++) - { - constituent_t *ct = &ctxt->constituent[c]; - lgdebug(6, "ola %d valid=%d %s start=%s lr=%zu %zu\n", c, - ct->valid, ct->type, ct->start_link, ct->left, ct->right); - } -#endif - return numcon_total; -} - -static int last_minute_fixes(con_context_t *ctxt, Linkage linkage, int numcon_total) -{ - int c; - bool global_leftend_found, global_rightend_found; - size_t lastword; - - for (c = 0; c < numcon_total; c++) - { - /* In a paraphrase 
construction ("John ran, he said"), - the paraphrasing clause doesn't get - an S. (This is true in Treebank II, not Treebank I) */ - - if (uppercompare(ctxt->constituent[c].start_link, "CP") == 0) - { - ctxt->constituent[c].valid = false; - } - - /* If it's a possessive with an "'s", the NP on the left - should be extended to include the "'s". */ - if ((uppercompare(ctxt->constituent[c].start_link, "YS") == 0) || - (uppercompare(ctxt->constituent[c].start_link, "YP") == 0)) - { - ctxt->constituent[c].right++; - } - - /* If a constituent has starting link MVpn, it's a time - expression like "last week"; label it as a noun phrase - (incorrectly) */ - - if (strcmp(ctxt->constituent[c].start_link, "MVpn") == 0) - { - ctxt->constituent[c].type = string_set_add("NP", ctxt->phrase_ss); - } - if (strcmp(ctxt->constituent[c].start_link, "COn") == 0) - { - ctxt->constituent[c].type = string_set_add("NP", ctxt->phrase_ss); - } - if (strcmp(ctxt->constituent[c].start_link, "Mpn") == 0) - { - ctxt->constituent[c].type = string_set_add("NP", ctxt->phrase_ss); - } - - /* If the constituent is an S started by "but" or "and" at - the beginning of the sentence, it should be ignored. */ - - if ((strcmp(ctxt->constituent[c].start_link, "Wdc") == 0) && - (ctxt->constituent[c].left == 2)) - { - ctxt->constituent[c].valid = false; - } - - /* For prenominal adjectives, an ADJP constituent is assigned - if it's a hyphenated (Ah) or comparative (Am) adjective; - otherwise no ADJP is assigned, unless the phrase is more - than one word long (e.g. "very big"). The same with certain - types of adverbs. */ - /* That was for Treebank I. For Treebank II, the rule only - seems to apply to prenominal adjectives (of all kinds). - However, it also applies to number expressions ("QP"). 
*/ - - if ((post_process_match("A", ctxt->constituent[c].start_link) == 1) || - (ctxt->constituent[c].domain_type == 'd') || - (ctxt->constituent[c].domain_type == 'h')) { - if (ctxt->constituent[c].right-ctxt->constituent[c].left == 0) - { - ctxt->constituent[c].valid = false; - } - } - - if ((ctxt->constituent[c].domain_type == 'h') && - (strcmp(linkage->word[ctxt->constituent[c].left - 1], "$") == 0)) - { - ctxt->constituent[c].left--; - } - } - - /* If there's a global S constituent that includes everything - except a final terminating punctuation (period or question mark), - extend it by one word. We know its the terminating punctuation, - because it links to the right wall with an RW link. If its - not, then that final link is not there... - */ - for (c = 0; c < numcon_total; c++) - { - if ((ctxt->constituent[c].right == linkage->num_words - 3) && - (ctxt->constituent[c].left == 1) && - (strcmp(ctxt->constituent[c].type, "S") == 0)) - { - size_t ln; - for (ln = 0; ln < linkage->num_links; ln++) - { - if ((linkage->link_array[ln].lw == linkage->num_words - 2) && - (linkage->link_array[ln].rw == linkage->num_words - 1)) - { - ctxt->constituent[c].right++; - break; - } - } - } - } - - /* If there's no S boundary at the very left end of the sentence, - or the very right end, create a new S spanning the entire sentence */ - - lastword = linkage->num_words - 2; - global_leftend_found = false; - global_rightend_found = false; - for (c = 0; c < numcon_total; c++) - { - if ((ctxt->constituent[c].left == 1) && - (strcmp(ctxt->constituent[c].type, "S") == 0) && - ctxt->constituent[c].valid) - { - global_leftend_found = true; - } - } - - for (c = 0; c < numcon_total; c++) - { - if ((ctxt->constituent[c].right >= lastword) && - (strcmp(ctxt->constituent[c].type, "S") == 0) && - ctxt->constituent[c].valid) - { - global_rightend_found = true; - } - } - - if ((global_leftend_found == false) || (global_rightend_found == false)) - { - c = numcon_total; - 
ctxt->constituent[c].left = 1; - ctxt->constituent[c].right = linkage->num_words-1; - ctxt->constituent[c].type = string_set_add("S", ctxt->phrase_ss); - ctxt->constituent[c].valid = true; - ctxt->constituent[c].domain_type = 'x'; - numcon_total++; - if (verbosity_level(D_CONST)) - { - err_msg(lg_Debug, "Adding global sentence constituent:\n\\"); - print_constituent(ctxt, linkage, c); - } - } - - return numcon_total; -} - -static int add_constituent(con_context_t *ctxt, int c, const Linkage linkage, - const Domain *domain, - int l, int r, const char * name) -{ - int nwords = linkage->num_words-2; - c++; - - /* Avoid running off end, to walls. */ - if (l < 1) l=1; - if (r > nwords) r = nwords; - if (l > nwords) l = nwords; - assert(l <= r, "negative constituent length!" ); - - ctxt->constituent[c].type = string_set_add(name, ctxt->phrase_ss); - ctxt->constituent[c].left = l; - ctxt->constituent[c].right = r; - ctxt->constituent[c].domain_type = domain->type; - ctxt->constituent[c].start_link = - linkage_get_link_label(linkage, domain->start_link); - return c; -} - -static const char * cons_of_domain(const Linkage linkage, char domain_type) -{ - switch (domain_type) { - case 'a': - return "ADJP"; - case 'b': - return "SBAR"; - case 'c': - return "VP"; - case 'd': - return "QP"; - case 'e': - return "ADVP"; - case 'f': - return "SBAR"; - case 'g': - return "PP"; - case 'h': - return "QP"; - case 'i': - return "ADVP"; - case 'k': - return "PRT"; - case 'n': - return "NP"; - case 'p': - return "PP"; - case 'q': - return "SINV"; - case 's': - return "S"; - case 't': - return "VP"; - case 'u': - return "ADJP"; - case 'v': - return "VP"; - case 'y': - return "NP"; - case 'z': - return "VP"; - default: - { - err_ctxt ec = { linkage->sent }; - err_msgc(&ec, lg_Error, "Error: Illegal domain: %c", domain_type); - return ""; - } - } -} - -static int read_constituents_from_domains(con_context_t *ctxt, Linkage linkage, - int numcon_total) -{ - size_t d, l, w2; - int c, w, c2, 
numcon_subl = 0; - PP_data *pp_data = &linkage->sent->constituent_pp->pp_data; - - for (d = 0, c = numcon_total; d < pp_data->N_domains; d++, c++) - { - size_t leftmost, rightmost, leftlimit; - int rootleft; - List_o_links * dlink; - - Domain domain = pp_data->domain_array[d]; - - // rootright = linkage_get_link_rword(linkage, domain.start_link); - rootleft = linkage_get_link_lword(linkage, domain.start_link); - - if ((domain.type=='c') || - (domain.type=='d') || - (domain.type=='e') || - (domain.type=='f') || - (domain.type=='g') || - (domain.type=='u') || - (domain.type=='y')) - { - leftlimit = 0; - leftmost = linkage_get_link_lword(linkage, domain.start_link); - rightmost = linkage_get_link_lword(linkage, domain.start_link); - } - else - { - leftlimit = linkage_get_link_lword(linkage, domain.start_link) + 1; - leftmost = linkage_get_link_rword(linkage, domain.start_link); - rightmost = linkage_get_link_rword(linkage, domain.start_link); - } - - /* Start by assigning both left and right limits to the - * right word of the start link. This will always be contained - * in the constituent. This will also handle the case - * where the domain contains no links. 
- */ - for (dlink = domain.lol; dlink != NULL; dlink = dlink->next) - { - l = dlink->link; - - if ((linkage_get_link_lword(linkage, l) < leftmost) && - (linkage_get_link_lword(linkage, l) >= leftlimit)) - { - leftmost = linkage_get_link_lword(linkage, l); - } - - if (linkage_get_link_rword(linkage, l) > rightmost) - { - rightmost = linkage_get_link_rword(linkage, l); - } - } - - c--; - c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, - cons_of_domain(linkage, domain.type)); - - if (domain.type == 'z') - { - c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S"); - } - if (domain.type=='c') - { - c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S"); - } - if ((post_process_match("Ce*", ctxt->constituent[c].start_link)==1) || - (post_process_match("Rn", ctxt->constituent[c].start_link)==1)) - { - c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "SBAR"); - } - if ((post_process_match("R*", ctxt->constituent[c].start_link)==1) || - (post_process_match("MX#r", ctxt->constituent[c].start_link)==1)) - { - w = leftmost; - if (strcmp(linkage->word[w], ",") == 0) w++; - c = add_constituent(ctxt, c, linkage, &domain, w, w, "WHNP"); - } - if (post_process_match("Mj", ctxt->constituent[c].start_link) == 1) - { - w = leftmost; - if (strcmp(linkage->word[w], ",") == 0) w++; - c = add_constituent(ctxt, c, linkage, &domain, w, w+1, "WHPP"); - c = add_constituent(ctxt, c, linkage, &domain, w+1, w+1, "WHNP"); - } - if ((post_process_match("Ss#d", ctxt->constituent[c].start_link)==1) || - (post_process_match("B#d", ctxt->constituent[c].start_link)==1)) - { - c = add_constituent(ctxt, c, linkage, &domain, rootleft, rootleft, "WHNP"); - c = add_constituent(ctxt, c, linkage, &domain, - rootleft, ctxt->constituent[c-1].right, "SBAR"); - } - if (post_process_match("CP", ctxt->constituent[c].start_link)==1) - { - if (strcmp(linkage->word[leftmost], ",") == 0) - ctxt->constituent[c].left++; - c = 
add_constituent(ctxt, c, linkage, &domain, 1, linkage->num_words-1, "S"); - } - if ((post_process_match("MVs", ctxt->constituent[c].start_link)==1) || - (domain.type=='f')) - { - w = ctxt->constituent[c].left; - if (strcmp(linkage->word[w], ",") == 0) - w++; - if (strcmp(linkage->word[w], "when") == 0) - { - c = add_constituent(ctxt, c, linkage, &domain, w, w, "WHADVP"); - } - } - if (domain.type=='t') - { - c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S"); - } - if ((post_process_match("QI", ctxt->constituent[c].start_link) == 1) || - (post_process_match("Mr", ctxt->constituent[c].start_link) == 1) || - (post_process_match("MX#d", ctxt->constituent[c].start_link) == 1)) - { - const char * name = ""; - w = leftmost; - if (strcmp(linkage->word[w], ",") == 0) w++; - if (ctxt->wordtype[w] == NONE) - name = "WHADVP"; - else if (ctxt->wordtype[w] == QTYPE) - name = "WHNP"; - else if (ctxt->wordtype[w] == QDTYPE) - name = "WHNP"; - else - assert(0, "Unexpected word type"); - c = add_constituent(ctxt, c, linkage, &domain, w, w, name); - - if (ctxt->wordtype[w] == QDTYPE) - { - /* Now find the finite verb to the right, start an S */ - /* Limit w2 to sentence length. 
*/ - // for( w2=w+1; w2 < ctxt->r_limit-1; w2++ ) - for (w2 = w+1; w2 < rightmost; w2++) - if ((ctxt->wordtype[w2] == STYPE) || (ctxt->wordtype[w2] == PTYPE)) break; - - /* Adjust the right boundary of previous constituent */ - ctxt->constituent[c].right = w2 - 1; - c = add_constituent(ctxt, c, linkage, &domain, w2, rightmost, "S"); - } - } - - if (ctxt->constituent[c].domain_type == '\0') - { - err_ctxt ec = { linkage->sent }; - err_msgc(&ec, lg_Error, "Error: no domain type assigned to constituent"); - } - if (ctxt->constituent[c].start_link == NULL) - { - err_ctxt ec = { linkage->sent }; - err_msgc(&ec, lg_Error, "Error: no type assigned to constituent"); - } - } - - numcon_subl = c - numcon_total; - /* numcon_subl = handle_islands(linkage, numcon_total, numcon_subl); */ - - if (verbosity_level(D_CONST)) - { - err_msg(lg_Debug, "Constituents added at first stage:\n\\"); - for (c = numcon_total; c < numcon_total + numcon_subl; c++) - { - /* FIXME: Here it cannot be printed as one debug message because - * a newline is printed at the end. */ - print_constituent(ctxt, linkage, c); - } - } - - /* Opener case - generates S around main clause. - (This must be done first; the S generated will be needed for - later cases.) 
*/ - numcon_subl = - gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "S", "S", CASE_OPENER); - - /* pp opener case */ - numcon_subl = - gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "S", "S", CASE_PPOPEN); - - /* participle opener case */ - numcon_subl = - gen_comp(ctxt, linkage, numcon_total, numcon_subl, "S", "S", "S", CASE_PART_OPEN); - - /* Subject-phrase case; every main VP generates an S */ - numcon_subl = - gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "S", "NP", CASE_S); - - /* Relative clause case; an SBAR generates a complement NP */ - numcon_subl = - gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "NP", "NP", CASE_REL_CLAUSE); - - /* Participle modifier case */ - numcon_subl = - gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "NP", "NP", CASE_PART_MOD); - - /* PP modifying NP */ - numcon_subl = - gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "NP", "NP", CASE_PART_MOD); - - /* Appositive case */ - numcon_subl = - gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "NP", "NP", CASE_APPOS); - - /* S-V inversion case; an NP generates a complement VP */ - numcon_subl = - gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "SINV", "VP", CASE_SVINV); - - adjust_subordinate_clauses(ctxt, linkage, numcon_total, numcon_subl); - for (c = numcon_total; c < numcon_total + numcon_subl; c++) - { - if ((ctxt->constituent[c].domain_type=='p') && - (strcmp(linkage->word[ctxt->constituent[c].left], ",")==0)) - { - ctxt->constituent[c].left++; - } - } - - /* Make sure the constituents are nested. If two constituents - * are not nested: whichever constituent has the furthest left - * boundary, shift that boundary rightwards to the left boundary - * of the other one. 
- */ - while (true) - { - bool adjustment_made = false; - for (c = numcon_total; c < numcon_total + numcon_subl; c++) - { - for (c2 = numcon_total; c2 < numcon_total + numcon_subl; c2++) - { - if ((ctxt->constituent[c].left < ctxt->constituent[c2].left) && - (ctxt->constituent[c].right < ctxt->constituent[c2].right) && - (ctxt->constituent[c].right >= ctxt->constituent[c2].left)) - { - /* We've found two overlapping constituents. - If one is larger, except the smaller one - includes an extra comma, adjust the smaller one - to exclude the comma */ - - if ((strcmp(linkage->word[ctxt->constituent[c2].right], ",") == 0) || - (strcmp(linkage->word[ctxt->constituent[c2].right], - "RIGHT-WALL") == 0)) - { - if (verbosity_level(D_CONST)) - err_msg(lg_Debug, "Adjusting %d to fix comma overlap\n", c2); - adjust_for_right_comma(ctxt, linkage, c2); - adjustment_made = true; - } - else if (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0) - { - if (verbosity_level(D_CONST)) - err_msg(lg_Debug, "Adjusting c %d to fix comma overlap\n", c); - adjust_for_left_comma(ctxt, linkage, c); - adjustment_made = true; - } - else - { - if (verbosity_level(D_CONST)) - { - err_ctxt ec = { linkage->sent }; - err_msgc(&ec, lg_Warn, - "Warning: the constituents aren't nested! " - "Adjusting them. 
(%d, %d)", c, c2); - } - ctxt->constituent[c].left = ctxt->constituent[c2].left; - } - } - } - } - if (adjustment_made == false) break; - } - - assert (numcon_total + numcon_subl < ctxt->conlen, "Too many constituents"); - return numcon_subl; -} - -static char * -exprint_constituent_structure(con_context_t *ctxt, - Linkage linkage, int numcon_total) -{ - size_t w; - int c; - bool *leftdone = alloca(numcon_total * sizeof(bool)); - bool *rightdone = alloca(numcon_total * sizeof(bool)); - int best, bestright, bestleft; - char *p; - char s[MAX_WORD]; - String * cs = string_new(); - - assert (numcon_total < ctxt->conlen, "Too many constituents (b)"); - - for (c = 0; c < numcon_total; c++) - { - leftdone[c] = false; - rightdone[c] = false; - } - - /* Skip left wall; don't skip right wall, since it may - * have constituent boundaries. */ - for (w = 1; w < linkage->num_words; w++) - { - while (1) - { - best = -1; - bestright = -1; - for (c = 0; c < numcon_total; c++) - { - if ((ctxt->constituent[c].left == w) && - (leftdone[c] == false) && ctxt->constituent[c].valid && - ((int) ctxt->constituent[c].right >= bestright)) - { - best = c; - bestright = ctxt->constituent[c].right; - } - } - if (best == -1) - break; - - leftdone[best] = true; - append_string(cs, "%c%s ", OPEN_BRACKET, ctxt->constituent[best].type); - } - - /* Don't print out right wall */ - if (w < linkage->num_words - 1) - { - char *p; - strncpy(s, linkage->word[w], MAX_WORD); - s[MAX_WORD-1] = 0; - - /* Constituent processing will crash if the sentence contains - * square brackets, so we have to do something ... replace - * them with curly braces ... this is a terrible hack, but - * will have to do; for now. A better solution would be to - * allow the user to specify some reserved char as the - * bracket symbol, e.g. SOH and EOT or something like that. 
- */ - p = strchr(s, OPEN_BRACKET); - while (p) - { - *p = '{'; - p = strchr(p, OPEN_BRACKET); - } - - p = strchr(s, CLOSE_BRACKET); - while (p) - { - *p = '}'; - p = strchr(p, CLOSE_BRACKET); - } - -#if 0 /* firstupper check removed in 0c8107a */ - /* Now, if the first character of the word was - originally uppercase, we put it back that way */ - if (linkage->chosen_disjuncts[w]->word[0]->status & WS_FIRSTUPPER) - upcase_utf8_str(s, s, MAX_WORD); -#endif - append_string(cs, "%s ", s); - } - - while (1) - { - best = -1; - bestleft = -1; - for(c = 0; c < numcon_total; c++) - { - if ((ctxt->constituent[c].right == w) && - (rightdone[c] == false) && ctxt->constituent[c].valid && - ((int) ctxt->constituent[c].left > bestleft)) { - best = c; - bestleft = ctxt->constituent[c].left; - } - } - if (best == -1) - break; - rightdone[best] = true; - append_string(cs, "%s%c ", ctxt->constituent[best].type, CLOSE_BRACKET); - } - } - - append_string(cs, "\n"); - p = string_copy(cs); - string_delete(cs); - return p; -} - -static char * do_print_flat_constituents(con_context_t *ctxt, Linkage linkage) -{ - int numcon_total= 0, numcon_subl; - char * q; - Sentence sent = linkage->sent; - - ctxt->phrase_ss = string_set_create(); - generate_misc_word_info(ctxt, linkage); - - if (NULL == sent->constituent_pp) /* First time for this sentence */ - sent->constituent_pp = post_process_new(sent->dict->hpsg_knowledge); - - do_post_process(sent->constituent_pp, linkage, linkage->is_sent_long); - - /** No-op. If we wanted to debug domain names, we could do this... 
- * linkage_free_pp_info(linkage); - * linkage_set_domain_names(sent->constituent_pp, linkage); - */ - numcon_subl = read_constituents_from_domains(ctxt, linkage, numcon_total); - numcon_total += numcon_subl; - assert (numcon_total < ctxt->conlen, "Too many constituents (c)"); - numcon_total = merge_constituents(ctxt, linkage, numcon_total); - assert (numcon_total < ctxt->conlen, "Too many constituents (d)"); - numcon_total = new_style_conjunctions(ctxt, linkage, numcon_total); - assert (numcon_total < ctxt->conlen, "Too many constituents (e)"); - numcon_total = last_minute_fixes(ctxt, linkage, numcon_total); - assert (numcon_total < ctxt->conlen, "Too many constituents (f)"); - q = exprint_constituent_structure(ctxt, linkage, numcon_total); - string_set_delete(ctxt->phrase_ss); - ctxt->phrase_ss = NULL; - - post_process_free_data(&sent->constituent_pp->pp_data); - - return q; -} - -static char * print_flat_constituents(Linkage linkage) -{ - size_t wts = linkage->num_words * sizeof(WType); - size_t cns = (linkage->num_links + linkage->num_words) * sizeof(constituent_t); - - con_context_t *ctxt = (con_context_t *) alloca(sizeof(con_context_t)); - memset(ctxt, 0, sizeof(con_context_t)); - ctxt->wordtype = (WType *) alloca(wts); - memset(ctxt->wordtype, 0, wts); - ctxt->conlen = linkage->num_links + linkage->num_words; - ctxt->constituent = (constituent_t *) alloca(cns); - memset(ctxt->constituent, 0, cns); - - return do_print_flat_constituents(ctxt, linkage); -} - -static CType token_type (char *token) -{ - if ((token[0] == OPEN_BRACKET) && (strlen(token) > 1)) - return OPEN_TOK; - if ((strlen(token) > 1) && (token[strlen(token) - 1] == CLOSE_BRACKET)) - return CLOSE_TOK; - return WORD_TOK; -} - -static CNode * make_CNode(char *q) -{ - CNode * cn; - cn = (CNode *) exalloc(sizeof(CNode)); - cn->label = (char *) exalloc(sizeof(char)*(strlen(q)+1)); - strcpy(cn->label, q); - cn->child = cn->next = (CNode *) NULL; - cn->next = (CNode *) NULL; - cn->start = cn->end = -1; 
- return cn; -} - -static CNode * parse_string(CNode * n, char **saveptr) -{ - char *q; - CNode *m, *last_child=NULL; - - while ((q = strtok_r(NULL, " ", saveptr))) { - switch (token_type(q)) { - case CLOSE_TOK : - q[strlen(q)-1]='\0'; - assert(strcmp(q, n->label)==0, - "Constituent tree: Labels do not match."); - return n; - break; - case OPEN_TOK: - m = make_CNode(q+1); - m = parse_string(m, saveptr); - break; - case WORD_TOK: - m = make_CNode(q); - break; - default: - assert(0, "Constituent tree: Illegal token type"); - } - if (n->child == NULL) { - last_child = n->child = m; - } - else { - last_child->next = m; - last_child = m; - } - } - assert(0, "Constituent tree: Constituent did not close"); - return NULL; -} - -static void print_tree(String * cs, int indent, CNode * n, int o1, int o2) -{ - int i, child_offset; - CNode * m; - - if (n == NULL) return; - - if (indent) - for (i = 0; i < o1; ++i) - append_string(cs, " "); - append_string(cs, "(%s ", n->label); - child_offset = o2 + strlen(n->label) + 2; - - for (m = n->child; m != NULL; m = m->next) - { - if (m->child == NULL) - { - char * p; - /* If the original string has left or right parens in it, - * the printed string will be messed up by these ... - * so replace them by curly braces. What else can one do? 
- */ - p = strchr(m->label, '('); - while(p) - { - *p = '{'; - p = strchr(p, '('); - } - - p = strchr(m->label, ')'); - while(p) - { - *p = '}'; - p = strchr(p, ')'); - } - - append_string(cs, "%s", m->label); - if ((m->next != NULL) && (m->next->child == NULL)) - append_string(cs, " "); - } - else - { - if (m != n->child) - { - if (indent) append_string(cs, "\n"); - else append_string(cs, " "); - print_tree(cs, indent, m, child_offset, child_offset); - } - else - { - print_tree(cs, indent, m, 0, child_offset); - } - if ((m->next != NULL) && (m->next->child == NULL)) - { - if (indent) - { - append_string(cs, "\n"); - for (i = 0; i < child_offset; ++i) - append_string(cs, " "); - } - else append_string(cs, " "); - } - } - } - append_string(cs, ")"); -} - -static int assign_spans(CNode * n, int start) -{ - int num_words=0; - CNode * m=NULL; - if (n==NULL) return 0; - n->start = start; - if (n->child == NULL) { - n->end = start; - return 1; - } - else { - for (m=n->child; m!=NULL; m=m->next) { - num_words += assign_spans(m, start+num_words); - } - n->end = start+num_words-1; - } - return num_words; -} - -static CNode * linkage_constituent_tree(Linkage linkage) -{ - char *p, *q, *saveptr; - int len; - CNode * root; - - p = print_flat_constituents(linkage); - - len = strlen(p); - q = strtok_r(p, " ", &saveptr); - assert(token_type(q) == OPEN_TOK, "Illegal beginning of string"); - root = make_CNode(q+1); - root = parse_string(root, &saveptr); - assign_spans(root, 0); - exfree(p, sizeof(char)*(len+1)); - return root; -} - -/* Make the compiler shut up about the deprecated functions */ -/* -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -*/ - -static void linkage_free_constituent_tree(CNode * n) -{ - CNode *m, *x; - for (m=n->child; m!=NULL; m=x) { - x=m->next; - linkage_free_constituent_tree(m); - } - exfree(n->label, sizeof(char)*(strlen(n->label)+1)); - exfree(n, sizeof(CNode)); -} - -/** - * Print out the constituent tree. 
- * mode 1: treebank-style constituent tree - * mode 2: flat, bracketed tree [A like [B this B] A] - * mode 3: flat, treebank-style tree (A like (B this)) - */ -char * linkage_print_constituent_tree(Linkage linkage, ConstituentDisplayStyle mode) -{ - String * cs; - CNode * root; - char * p; - - if (!linkage) return NULL; - if (mode == NO_DISPLAY) - { - return NULL; - } - else if (mode == MULTILINE || mode == SINGLE_LINE) - { - cs = string_new(); - root = linkage_constituent_tree(linkage); - print_tree(cs, (mode==1), root, 0, 0); - linkage_free_constituent_tree(root); - append_string(cs, "\n"); - p = string_copy(cs); - string_delete(cs); - return p; - } - else if (mode == BRACKET_TREE) - { - return print_flat_constituents(linkage); - } - prt_error("Warning: Illegal mode %d for printing constituents\n" - "Allowed values: %d to %d\n", mode, NO_DISPLAY, MAX_STYLES); - return NULL; -} - -void linkage_free_constituent_tree_str(char * s) -{ - exfree(s, strlen(s)+1); -} diff -Nru link-grammar-5.3.16/link-grammar/corpus/cluster.c link-grammar-5.5.0/link-grammar/corpus/cluster.c --- link-grammar-5.3.16/link-grammar/corpus/cluster.c 2014-12-27 04:49:23.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/corpus/cluster.c 2017-06-17 01:29:32.000000000 +0000 @@ -2,7 +2,7 @@ * cluster.c * * Data for related-word clusters. Meant to expand disjunct coverage - * for the case where a parse cannot be completed without ommitting + * for the case where a parse cannot be completed without omitting * a word. 
* * Copyright (c) 2009 Linas Vepstas @@ -79,7 +79,7 @@ "\tWas looking for: " DBNAME); } else - { + { prt_error("Warning: Can't open database: %s\n" "\tWas looking for: " DBNAME, sqlite3_errmsg(c->dbconn)); @@ -88,7 +88,7 @@ } /* Now prepare the statements we plan to use */ - rc = sqlite3_prepare_v2(c->dbconn, + rc = sqlite3_prepare_v2(c->dbconn, "SELECT cluster_name FROM ClusterMembers " "WHERE inflected_word = ?;", -1, &c->clu_query, NULL); @@ -98,7 +98,7 @@ sqlite3_errmsg(c->dbconn)); } - rc = sqlite3_prepare_v2(c->dbconn, + rc = sqlite3_prepare_v2(c->dbconn, "SELECT disjunct, cost FROM ClusterDisjuncts " "WHERE cluster_name = ?;", -1, &c->dj_query, NULL); @@ -114,7 +114,7 @@ /** * lg_cluster_delete -- shut down the cluster statistics subsystem. - */ + */ void lg_cluster_delete(Cluster *c) { if (NULL == c) return; @@ -153,7 +153,7 @@ Exp *p1, *p2; E_list *l, *lhead = NULL; size_t len; - const char *sp = strchr (djstr, ' '); + const char *sp = strchr (djstr, ' '); Exp *e = (Exp *) malloc(sizeof(Exp)); e->multi = 0; diff -Nru link-grammar-5.3.16/link-grammar/corpus/cluster.h link-grammar-5.5.0/link-grammar/corpus/cluster.h --- link-grammar-5.3.16/link-grammar/corpus/cluster.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/corpus/cluster.h 2017-06-17 01:29:32.000000000 +0000 @@ -1,8 +1,8 @@ /* * cluster.h * - * Data for related-word clusters. Meant to expand disjunct covereage - * for the case where a parse cannot be completed without ommitting + * Data for related-word clusters. Meant to expand disjunct coverage + * for the case where a parse cannot be completed without omitting * a word. * * Copyright (c) 2009 Linas Vepstas @@ -16,6 +16,9 @@ #include "../api-types.h" #include "../link-includes.h" +/* Upper bound on the cost of any connector. 
*/ +#define MAX_CONNECTOR_COST 1000.0 + Cluster * lg_cluster_new(void); void lg_cluster_delete(Cluster *); diff -Nru link-grammar-5.3.16/link-grammar/corpus/corpus.c link-grammar-5.5.0/link-grammar/corpus/corpus.c --- link-grammar-5.3.16/link-grammar/corpus/corpus.c 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/corpus/corpus.c 2017-06-17 01:29:32.000000000 +0000 @@ -3,7 +3,7 @@ * * Data for corpus statistics, used to provide a parse ranking * to drive the SAT solver, as well as parse ranking with the - * ordinary solver. + * ordinary solver. * * Copyright (c) 2008, 2009 Linas Vepstas */ @@ -87,7 +87,7 @@ c->errmsg); } else - { + { prt_error("Warning: Can't open database: %s\n" "\tWas looking for: " DBNAME "\n", c->errmsg); @@ -96,7 +96,7 @@ } /* Now prepare the statements we plan to use */ - rc = sqlite3_prepare_v2(c->dbconn, + rc = sqlite3_prepare_v2(c->dbconn, "SELECT log_cond_probability FROM Disjuncts " "WHERE inflected_word = ? AND disjunct = ?;", -1, &c->rank_query, NULL); @@ -108,7 +108,7 @@ /* Results are returned in sorted order .. would it be faster * to sort locally? Don't know ... */ - rc = sqlite3_prepare_v2(c->dbconn, + rc = sqlite3_prepare_v2(c->dbconn, "SELECT word_sense, log_cond_probability FROM DisjunctSenses " "WHERE inflected_word = ? AND disjunct = ? " "ORDER BY log_cond_probability ASC;", @@ -125,7 +125,7 @@ /** * lg_corpus_delete -- shut down the corpus statistics subsystem. - */ + */ void lg_corpus_delete(Corpus *c) { if (NULL == c) return; @@ -168,7 +168,7 @@ /** * get_disjunct_score -- get log probability of observing disjunt. * - * Given an "inflected" word and a disjunct, thris routine returns the + * Given an "inflected" word and a disjunct, this routine returns the * -log_2 conditional probability prob(d|w) of seeing the disjunct 'd' * given that the word 'w' was observed. 
Here, "inflected word" means * the link-grammar dictionary entry, complete with its trailing period @@ -237,7 +237,7 @@ * lg_corpus_score -- compute parse-ranking score for sentence. * * Given a parsed sentence, this routine will compute a parse ranking - * score, based on the probabilites of observing the indicated set of + * score, based on the probabilities of observing the indicated set of * disjuncts in the statistics database. * * The score is stored in the Linkage_info->corpus_cost struct member. @@ -315,7 +315,7 @@ * lg_corpus_senses -- Given word and disjunct, look up senses. * * Given a particular disjunct for a word, look up its most - * likely sense assignments from the database. + * likely sense assignments from the database. */ static Sense * lg_corpus_senses(Corpus *corp, @@ -350,8 +350,8 @@ { sense = sqlite3_column_text(corp->sense_query, 0); log_prob = sqlite3_column_double(corp->sense_query, 1); - // printf ("Word=%s dj=%s sense=%s score=%f\n", - // inflected_word, disjunct, sense, log_prob); + // printf ("Word=%s dj=%s sense=%s score=%f\n", + // inflected_word, disjunct, sense, log_prob); sns = (Sense *) malloc(sizeof(Sense)); sns->next = head; @@ -380,8 +380,8 @@ /** * lg_corpus_linkage_senses -- Given a linkage, look up senses. * - * Given a particular linakge, look up the most likely sense - * assignments from the database. + * Given a particular linkage, look up the most likely sense + * assignments from the database. * * This function is not used to guide the parsing process; it is * only an informational look-up. 
@@ -391,7 +391,7 @@ { const char * infword; Sentence sent = lkg->sent; - Dictionary dict = sent->dict; + Dictionary dict = sent->dict; Corpus *corp = dict->corpus; int nwords = lkg->num_words; int w; @@ -420,7 +420,7 @@ } infword = disj->string; - lkg->sense_list[w] = lg_corpus_senses(corp, infword, + lkg->sense_list[w] = lg_corpus_senses(corp, infword, lkg->disjunct_list_str[w], w); } } diff -Nru link-grammar-5.3.16/link-grammar/corpus/corpus.h link-grammar-5.5.0/link-grammar/corpus/corpus.h --- link-grammar-5.3.16/link-grammar/corpus/corpus.h 2016-10-14 23:03:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/corpus/corpus.h 2017-05-29 08:07:13.000000000 +0000 @@ -15,8 +15,7 @@ #ifdef USE_CORPUS -#include "../api-types.h" -#include "../utilities.h" +#include "api-types.h" Corpus * lg_corpus_new(void); void lg_corpus_delete(Corpus *); diff -Nru link-grammar-5.3.16/link-grammar/corpus/Makefile.in link-grammar-5.5.0/link-grammar/corpus/Makefile.in --- link-grammar-5.3.16/link-grammar/corpus/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/corpus/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -215,6 +215,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -243,6 +244,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -251,13 +253,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ 
@@ -324,12 +329,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/link-grammar/corpus/README link-grammar-5.5.0/link-grammar/corpus/README --- link-grammar-5.3.16/link-grammar/corpus/README 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/corpus/README 2017-06-17 01:29:32.000000000 +0000 @@ -8,7 +8,7 @@ is used because it is "administration-free" for the user, and because its license is compatbile with the current link-grammar license. -This directory also contains code for "broadening" word linakges. +This directory also contains code for "broadening" word linkages. See data/sql/README for more info. This directory contains one administrative tool, "cluster-pop", diff -Nru link-grammar-5.3.16/link-grammar/count.c link-grammar-5.5.0/link-grammar/count.c --- link-grammar-5.3.16/link-grammar/count.c 2017-03-14 05:43:05.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/count.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,546 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2013,2014,2015 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#include -#include "link-includes.h" -#include "api-structures.h" -#include "count.h" -#include "disjunct-utils.h" -#include "fast-match.h" -#include "prune.h" -#include "resources.h" -#include "structures.h" -#include "word-utils.h" - -/* This file contains the exhaustive search algorithm. */ - -typedef struct Table_connector_s Table_connector; -struct Table_connector_s -{ - Table_connector *next; - Connector *le, *re; - Count_bin count; - short lw, rw; - unsigned short null_count; -}; - -struct count_context_s -{ - Word * local_sent; - /* int null_block; */ /* not used, always 1 */ - bool islands_ok; - bool null_links; - bool exhausted; - int checktimer; /* Avoid excess system calls */ - int table_size; - int log2_table_size; - Table_connector ** table; - Resources current_resources; -}; - -static void free_table(count_context_t *ctxt) -{ - int i; - Table_connector *t, *x; - - for (i=0; itable_size; i++) - { - for(t = ctxt->table[i]; t!= NULL; t=x) - { - x = t->next; - xfree((void *) t, sizeof(Table_connector)); - } - } - xfree(ctxt->table, ctxt->table_size * sizeof(Table_connector*)); - ctxt->table = NULL; - ctxt->table_size = 0; -} - -static void init_table(count_context_t *ctxt, size_t sent_len) -{ - unsigned int shift; - /* A piecewise exponential function determines the size of the - * hash table. Probably should make use of the actual number of - * disjuncts, rather than just the number of words. 
- */ - if (ctxt->table) free_table(ctxt); - - if (sent_len >= 10) - { - shift = 12 + (sent_len) / 4 ; - } - else - { - shift = 12; - } - - /* Clamp at max 4*(1<<24) == 64 MBytes */ - if (24 < shift) shift = 24; - ctxt->table_size = (1U << shift); - ctxt->log2_table_size = shift; - ctxt->table = (Table_connector**) - xalloc(ctxt->table_size * sizeof(Table_connector*)); - memset(ctxt->table, 0, ctxt->table_size*sizeof(Table_connector*)); -} - -/** - * Stores the value in the table. Assumes it's not already there. - */ -static Table_connector * table_store(count_context_t *ctxt, - int lw, int rw, - Connector *le, Connector *re, - unsigned int null_count) -{ - Table_connector *t, *n; - unsigned int h; - - n = (Table_connector *) xalloc(sizeof(Table_connector)); - n->lw = lw; n->rw = rw; n->le = le; n->re = re; n->null_count = null_count; - h = pair_hash(ctxt->table_size, lw, rw, le, re, null_count); - t = ctxt->table[h]; - n->next = t; - ctxt->table[h] = n; - - return n; -} - -/** returns the pointer to this info, NULL if not there */ -static Table_connector * -find_table_pointer(count_context_t *ctxt, - int lw, int rw, - Connector *le, Connector *re, - unsigned int null_count) -{ - Table_connector *t; - unsigned int h = pair_hash(ctxt->table_size,lw, rw, le, re, null_count); - t = ctxt->table[h]; - for (; t != NULL; t = t->next) { - if ((t->lw == lw) && (t->rw == rw) - && (t->le == le) && (t->re == re) - && (t->null_count == null_count)) return t; - } - - /* Create a new connector only if resources are exhausted. - * (???) Huh? I guess we're in panic parse mode in that case. - * checktimer is a device to avoid a gazillion system calls - * to get the timer value. On circa-2009 machines, it results - * in maybe 5-10 timer calls per second. 
- */ - ctxt->checktimer ++; - if (ctxt->exhausted || ((0 == ctxt->checktimer%1450100) && - (ctxt->current_resources != NULL) && - resources_exhausted(ctxt->current_resources))) - { - ctxt->exhausted = true; - t = table_store(ctxt, lw, rw, le, re, null_count); - t->count = hist_zero(); - return t; - } - else return NULL; -} - -/** returns the count for this quintuple if there, -1 otherwise */ -Count_bin* table_lookup(count_context_t * ctxt, - int lw, int rw, Connector *le, Connector *re, - unsigned int null_count) -{ - Table_connector *t = find_table_pointer(ctxt, lw, rw, le, re, null_count); - - if (t == NULL) return NULL; else return &t->count; -} - -/** - * psuedocount is used to check to see if a parse is even possible, - * so that we don't waste cpu time performing an actual count, only - * to discover that it is zero. - * - * Returns false if and only if this entry is in the hash table - * with a count value of 0. If an entry is not in the hash table, - * we have to assume the worst case: that the count might be non-zero, - * and since we don't know, we return true. However, if the entry is - * in the hash table, and its zero, then we know, for sure, that the - * count is zero. - */ -static bool pseudocount(count_context_t * ctxt, - int lw, int rw, Connector *le, Connector *re, - unsigned int null_count) -{ - Count_bin * count = table_lookup(ctxt, lw, rw, le, re, null_count); - if (NULL == count) return true; - if (hist_total(count) == 0) return false; - return true; -} - -/** - * Return the number of optional words strictly between w1 and w2. 
- */ -static int num_optional_words(count_context_t *ctxt, int w1, int w2) -{ - int n = 0; - - for (int w = w1+1; w < w2; w++) - if (ctxt->local_sent[w].optional) n++; - - return n; -} - -static Count_bin do_count(fast_matcher_t *mchxt, - count_context_t *ctxt, - int lw, int rw, - Connector *le, Connector *re, - int null_count) -{ - Count_bin zero = hist_zero(); - Count_bin total; - int start_word, end_word, w; - Table_connector *t; - - assert (0 <= null_count, "Bad null count"); - - t = find_table_pointer(ctxt, lw, rw, le, re, null_count); - - if (t) return t->count; - - /* Create the table entry with a tentative null count of 0. - * This count must be updated before we return. */ - t = table_store(ctxt, lw, rw, le, re, null_count); - - if (rw == 1+lw) - { - /* lw and rw are neighboring words */ - /* You can't have a linkage here with null_count > 0 */ - if ((le == NULL) && (re == NULL) && (null_count == 0)) - { - t->count = hist_one(); - } - else - { - t->count = zero; - } - return t->count; - } - - /* The left and right connectors are null, but the two words are - * NOT next to each-other. */ - if ((le == NULL) && (re == NULL)) - { - if (!ctxt->islands_ok && (lw != -1)) - { - /* If we don't allow islands (a set of words linked together - * but separate from the rest of the sentence) then the - * null_count of skipping n words is just n. */ - if (null_count == (rw-lw-1) - num_optional_words(ctxt, lw, rw)) - { - t->count = hist_one(); - } - else - { - t->count = zero; - } - return t->count; - } - if (null_count == 0) - { - /* There is no solution without nulls in this case. 
There is - * a slight efficiency hack to separate this null_count==0 - * case out, but not necessary for correctness */ - if ((rw-lw-1) == num_optional_words(ctxt, lw, rw)) - { - t->count = hist_one(); - } - else - { - t->count = zero; - } - } - else - { - t->count = zero; - Disjunct * d; - int w = lw + 1; - for (d = ctxt->local_sent[w].d; d != NULL; d = d->next) - { - if (d->left == NULL) - { - hist_accumv(&t->count, d->cost, - do_count(mchxt, ctxt, w, rw, d->right, NULL, null_count-1)); - } - } - hist_accumv(&t->count, 0.0, - do_count(mchxt, ctxt, w, rw, NULL, NULL, null_count-1)); - } - return t->count; - } - - if (le == NULL) - { - start_word = lw+1; - } - else - { - start_word = le->nearest_word; - } - - if (re == NULL) - { - end_word = rw; - } - else - { - end_word = re->nearest_word +1; - } - - total = zero; - - for (w = start_word; w < end_word; w++) - { - size_t mlb, mle; - mle = mlb = form_match_list(mchxt, w, le, lw, re, rw); -#ifdef VERIFY_MATCH_LIST - int id = get_match_list_element(mchxt, mlb) ? - get_match_list_element(mchxt, mlb)->match_id : 0; -#endif - for (; get_match_list_element(mchxt, mle) != NULL; mle++) - { - unsigned int lnull_cnt, rnull_cnt; - Disjunct *d = get_match_list_element(mchxt, mle); - bool Lmatch = d->match_left; - bool Rmatch = d->match_right; - -#ifdef VERIFY_MATCH_LIST - assert(id == d->match_id, "Modified id (%d!=%d)", id, d->match_id); -#endif - /* _p1 avoids a gcc warning about unsafe loop opt */ - unsigned int null_count_p1 = null_count + 1; - - for (lnull_cnt = 0; lnull_cnt < null_count_p1; lnull_cnt++) - { - bool leftpcount = false; - bool rightpcount = false; - bool pseudototal = false; - - rnull_cnt = null_count - lnull_cnt; - /* Now lnull_cnt and rnull_cnt are the costs we're assigning - * to those parts respectively */ - - /* Now, we determine if (based on table only) we can see that - the current range is not parsable. */ - - /* First, perform pseudocounting as an optimization. 
If - * the pseudocount is zero, then we know that the true - * count will be zero, and so skip counting entirely, - * in that case. - */ - if (Lmatch) - { - leftpcount = pseudocount(ctxt, lw, w, le->next, d->left->next, lnull_cnt); - if (!leftpcount && le->multi) - leftpcount = - pseudocount(ctxt, lw, w, le, d->left->next, lnull_cnt); - if (!leftpcount && d->left->multi) - leftpcount = - pseudocount(ctxt, lw, w, le->next, d->left, lnull_cnt); - if (!leftpcount && le->multi && d->left->multi) - leftpcount = - pseudocount(ctxt, lw, w, le, d->left, lnull_cnt); - } - - if (Rmatch) - { - rightpcount = pseudocount(ctxt, w, rw, d->right->next, re->next, rnull_cnt); - if (!rightpcount && d->right->multi) - rightpcount = - pseudocount(ctxt, w,rw, d->right, re->next, rnull_cnt); - if (!rightpcount && re->multi) - rightpcount = - pseudocount(ctxt, w, rw, d->right->next, re, rnull_cnt); - if (!rightpcount && d->right->multi && re->multi) - rightpcount = - pseudocount(ctxt, w, rw, d->right, re, rnull_cnt); - } - - /* Total number where links are used on both sides */ - pseudototal = leftpcount && rightpcount; - - if (!pseudototal && leftpcount) { - /* Evaluate using the left match, but not the right. */ - pseudototal = - pseudocount(ctxt, w, rw, d->right, re, rnull_cnt); - } - if (!pseudototal && (le == NULL) && rightpcount) { - /* Evaluate using the right match, but not the left. */ - pseudototal = - pseudocount(ctxt, lw, w, le, d->left, lnull_cnt); - } - - /* If pseudototal is zero (false), that implies that - * we know that the true total is zero. So we don't - * bother counting at all, in that case. 
*/ - if (pseudototal) - { - Count_bin leftcount = zero; - Count_bin rightcount = zero; - if (Lmatch) { - leftcount = do_count(mchxt, ctxt, lw, w, le->next, d->left->next, lnull_cnt); - if (le->multi) - hist_accumv(&leftcount, d->cost, - do_count(mchxt, ctxt, lw, w, le, d->left->next, lnull_cnt)); - if (d->left->multi) - hist_accumv(&leftcount, d->cost, - do_count(mchxt, ctxt, lw, w, le->next, d->left, lnull_cnt)); - if (le->multi && d->left->multi) - hist_accumv(&leftcount, d->cost, - do_count(mchxt, ctxt, lw, w, le, d->left, lnull_cnt)); - } - - if (Rmatch) { - rightcount = do_count(mchxt, ctxt, w, rw, d->right->next, re->next, rnull_cnt); - if (d->right->multi) - hist_accumv(&rightcount, d->cost, - do_count(mchxt, ctxt, w, rw, d->right,re->next, rnull_cnt)); - if (re->multi) - hist_accumv(&rightcount, d->cost, - do_count(mchxt, ctxt, w, rw, d->right->next, re, rnull_cnt)); - if (d->right->multi && re->multi) - hist_accumv(&rightcount, d->cost, - do_count(mchxt, ctxt, w, rw, d->right, re, rnull_cnt)); - } - - /* Total number where links are used on both sides */ - hist_muladd(&total, &leftcount, 0.0, &rightcount); - - if (0 < hist_total(&leftcount)) - { - /* Evaluate using the left match, but not the right */ - hist_muladdv(&total, &leftcount, d->cost, - do_count(mchxt, ctxt, w, rw, d->right, re, rnull_cnt)); - } - if ((le == NULL) && (0 < hist_total(&rightcount))) - { - /* Evaluate using the right match, but not the left */ - hist_muladdv(&total, &rightcount, d->cost, - do_count(mchxt, ctxt, lw, w, le, d->left, lnull_cnt)); - } - - /* Sigh. Overflows can and do occur, esp for the ANY language. 
*/ - if (INT_MAX < hist_total(&total)) - { -#ifdef PERFORM_COUNT_HISTOGRAMMING - total.total = INT_MAX; -#else - total = INT_MAX; -#endif /* PERFORM_COUNT_HISTOGRAMMING */ - t->count = total; - pop_match_list(mchxt, mlb); - return total; - } - } - } - } - pop_match_list(mchxt, mlb); - } - t->count = total; - return total; -} - - -/** - * Returns the number of ways the sentence can be parsed with the - * specified null count. Assumes that the fast-matcher and the count - * context have already been initialized, and will be freed later. The - * "null_count" argument is the number of words that are allowed to - * have no links to them. - * - * This the full-fledged parser, but it only 'counts', in order to - * avoid an explosion of allocated memory structures to hold each - * possible parse. Thus, to see an 'actual' parse, a second pass - * must be made, with build_parse_set(), to get actual parse structures. - * - * The work is split up this way for two reasons: - * 1) A given sentence may have thousands of parses, and the user is - * interested in only a few. - * 2) A given sentence may have billions of parses, in which case, - * allocating for each would blow out RAM. - * So, basically, its good to know how many parses to expect, before - * starting to allocate parse structures. - * - * The count returned here is meant to be completely accurate; it is - * not an approximation! - * - * Currently, the code has been designed to maintain a histogram of - * the cost of each of the parses. The number and width of the bins - * is adjustable in histogram.c. At this time, the histogram is not - * used anywhere, and a 3-5% speedup is available if it is avoided. - * We plan to use this histogram, later .... 
- */ -Count_bin do_parse(Sentence sent, - fast_matcher_t *mchxt, - count_context_t *ctxt, - int null_count, Parse_Options opts) -{ - Count_bin hist; - - ctxt->current_resources = opts->resources; - ctxt->exhausted = false; - ctxt->checktimer = 0; - ctxt->local_sent = sent->word; - - /* consecutive blocks of this many words are considered as - * one null link. */ - /* ctxt->null_block = 1; */ - ctxt->islands_ok = opts->islands_ok; - - hist = do_count(mchxt, ctxt, -1, sent->length, NULL, NULL, null_count+1); - - ctxt->local_sent = NULL; - ctxt->current_resources = NULL; - ctxt->checktimer = 0; - return hist; -} - -void delete_unmarked_disjuncts(Sentence sent) -{ - size_t w; - Disjunct *d_head, *d, *dx; - - for (w=0; wlength; w++) { - d_head = NULL; - for (d=sent->word[w].d; d != NULL; d=dx) { - dx = d->next; - if (d->marked) { - d->next = d_head; - d_head = d; - } else { - d->next = NULL; - free_disjuncts(d); - } - } - sent->word[w].d = d_head; - } -} - -/* sent_length is used only as a hint for the hash table size ... 
*/ -count_context_t * alloc_count_context(size_t sent_length) -{ - count_context_t *ctxt = (count_context_t *) xalloc (sizeof(count_context_t)); - memset(ctxt, 0, sizeof(count_context_t)); - - init_table(ctxt, sent_length); - return ctxt; -} - -void free_count_context(count_context_t *ctxt) -{ - free_table(ctxt); - xfree(ctxt, sizeof(count_context_t)); -} diff -Nru link-grammar-5.3.16/link-grammar/count.h link-grammar-5.5.0/link-grammar/count.h --- link-grammar-5.3.16/link-grammar/count.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/count.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,25 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _COUNT_H -#define _COUNT_H - -#include "api-structures.h" -#include "histogram.h" /* for s64 */ - -Count_bin* table_lookup(count_context_t *, int, int, Connector *, Connector *, unsigned int); -Count_bin do_parse(Sentence, fast_matcher_t*, count_context_t*, int null_count, Parse_Options); -void delete_unmarked_disjuncts(Sentence sent); - -count_context_t* alloc_count_context(size_t); -void free_count_context(count_context_t*); -#endif /* _COUNT_H */ diff -Nru link-grammar-5.3.16/link-grammar/dict-api.h link-grammar-5.5.0/link-grammar/dict-api.h --- link-grammar-5.3.16/link-grammar/dict-api.h 2014-12-17 18:26:36.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-api.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,46 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _LG_DICT_API_H_ -#define _LG_DICT_API_H_ - -#include "dict-structures.h" -#include "link-includes.h" - -LINK_BEGIN_DECLS - -/** - * Declaration of dictionary-related functions that link-grammar users - * are free to use in their applications. That is, these are a part of - * the public API to the link-parser system. 
- */ - -Dictionary dictionary_create_from_utf8(const char * input); - -bool boolean_dictionary_lookup(const Dictionary, const char *); - -Dict_node * abridged_lookup_list(const Dictionary, const char *); -Dict_node * dictionary_lookup_list(const Dictionary, const char *); - -bool find_word_in_dict(const Dictionary, const char *); - -void free_lookup_list(const Dictionary, Dict_node *); - -/* XXX the below probably does not belong ... ?? */ -Dict_node * insert_dict(Dictionary dict, Dict_node * n, Dict_node * newnode); - -void print_expression(const Exp *); - -LINK_END_DECLS - -#endif /* _LG_DICT_API_H_ */ diff -Nru link-grammar-5.3.16/link-grammar/dict-common/dict-affix.h link-grammar-5.5.0/link-grammar/dict-common/dict-affix.h --- link-grammar-5.3.16/link-grammar/dict-common/dict-affix.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/dict-affix.h 2017-08-11 14:52:16.000000000 +0000 @@ -0,0 +1,88 @@ +/*************************************************************************/ +/* Copyright (c) 2014 Amir Plivatsky */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _LG_DICT_AFFIX_H_ +#define _LG_DICT_AFFIX_H_ + +#include "dict-common.h" + +/* The functions here are intended for use by the tokenizer, only, + * and pretty much no one else. If you are not the tokenizer, you + * probably don't need these. 
*/ + +/* Connector names for the affix class lists in the affix file */ + +typedef enum { + AFDICT_RPUNC=1, + AFDICT_LPUNC, + AFDICT_MPUNC, + AFDICT_UNITS, + AFDICT_SUF, + AFDICT_PRE, + AFDICT_MPRE, + AFDICT_QUOTES, + AFDICT_BULLETS, + AFDICT_INFIXMARK, + AFDICT_STEMSUBSCR, + AFDICT_SANEMORPHISM, + + /* The below are used only for random morphology via regex */ + AFDICT_REGPRE, + AFDICT_REGMID, + AFDICT_REGSUF, + AFDICT_REGALTS, + AFDICT_REGPARTS, + + /* Have to have one last entry, to get the array size correct */ + AFDICT_LAST_ENTRY, + AFDICT_NUM_ENTRIES +} afdict_classnum; + +#define AFDICT_CLASSNAMES1 \ + "invalid classname", \ + "RPUNC", \ + "LPUNC", \ + "MPUNC", \ + "UNITS", \ + "SUF", /* SUF is used in the Russian dict */ \ + "PRE", /* PRE is not used anywhere, yet... */ \ + "MPRE", /* Multi-prefix, currently for Hebrew */ \ + "QUOTES", \ + "BULLETS", \ + "INFIXMARK", /* Prepended to suffixes, appended to prefixes */ \ + "STEMSUBSCR", /* Subscripts for stems */ \ + "SANEMORPHISM", /* Regex for sane_morphism() */ + +/* The regexes below are used only for random morphology generation */ +#define AFDICT_CLASSNAMES2 \ + "REGPRE", /* Regex for prefix */ \ + "REGMID", /* Regex for middle parts */ \ + "REGSUF", /* Regex for suffix */ \ + "REGALTS", /* Min&max number of alternatives to issue for a word */\ + "REGPARTS", /* Max number of word partitions */ + +#define AFDICT_CLASSNAMES AFDICT_CLASSNAMES1 AFDICT_CLASSNAMES2 "last classname" +#define AFCLASS(afdict, class) (&afdict->afdict_class[class]) + +/* Suffixes start with it. + * This is needed to distinguish suffixes that were stripped off from + * ordinary words that just happen to be the same as the suffix. + * Kind-of a weird hack, but I'm not sure what else to do... + * Similarly, prefixes end with it. + */ +#define INFIX_MARK(afdict) \ + ((NULL == afdict) ? 
'\0' : (AFCLASS(afdict, AFDICT_INFIXMARK)->string[0][0])) + + +Afdict_class * afdict_find(Dictionary, const char *, bool); +bool is_stem(const char *); + +#endif /* _LG_DICT_AFFIX_H_ */ diff -Nru link-grammar-5.3.16/link-grammar/dict-common/dict-api.h link-grammar-5.5.0/link-grammar/dict-common/dict-api.h --- link-grammar-5.3.16/link-grammar/dict-common/dict-api.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/dict-api.h 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,49 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _LG_DICT_API_H_ +#define _LG_DICT_API_H_ + +#include "dict-structures.h" +#include "link-includes.h" + +LINK_BEGIN_DECLS + +/** + * Declaration of dictionary-related functions that link-grammar users + * are free to use in their applications. That is, these are a part of + * the public API to the link-parser system. + */ + +#ifdef USE_VITERBI +Dictionary dictionary_create_from_utf8(const char * input); +#endif + +bool boolean_dictionary_lookup(const Dictionary, const char *); + +Dict_node * dictionary_lookup_list(const Dictionary, const char *); +Dict_node * dictionary_lookup_wild(const Dictionary, const char *); + +bool find_word_in_dict(const Dictionary, const char *); + +void free_lookup_list(const Dictionary, Dict_node *); + +/* XXX the below probably does not belong ... ?? 
*/ +Dict_node * insert_dict(Dictionary dict, Dict_node * n, Dict_node * newnode); + +void print_expression(const Exp *); +char *expression_stringify(const Exp *); + +LINK_END_DECLS + +#endif /* _LG_DICT_API_H_ */ diff -Nru link-grammar-5.3.16/link-grammar/dict-common/dict-common.c link-grammar-5.5.0/link-grammar/dict-common/dict-common.c --- link-grammar-5.3.16/link-grammar/dict-common/dict-common.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/dict-common.c 2018-02-15 03:03:12.000000000 +0000 @@ -0,0 +1,337 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright 2008, 2009, 2012-2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include "connectors.h" // for connector_set_delete +#include "dict-affix.h" +#include "dict-api.h" +#include "dict-common.h" +#include "dict-defines.h" +#include "file-utils.h" +#include "post-process/pp_knowledge.h" // Needed only for pp_close !!?? +#include "regex-morph.h" +#include "string-set.h" +#include "tokenize/anysplit.h" +#include "tokenize/spellcheck.h" + +#include "dict-sql/read-sql.h" +#include "dict-file/read-dict.h" +#include "dict-file/word-file.h" + +/* Stems, by definition, end with ".=x" (when x is usually an empty + * string, i.e. ".="). The STEMSUBSCR definition in the affix file + * may include endings with other x values, when x serves as a word + * subscript, e.g. ".=a". 
*/ +#define STEM_MARK '=' + +/* ======================================================================== */ +/* Affix type finding */ + +/** + * Return TRUE if the word seems to be in stem form. + * Stems are signified by including = sign which is preceded by the subscript + * mark. Examples (. represented the subscript mark): word.= word.=[!] + */ +bool is_stem(const char* w) +{ + const char *subscrmark = strchr(w, SUBSCRIPT_MARK); + + if (NULL == subscrmark) return false; + if (subscrmark == w) return false; + if (STEM_MARK != subscrmark[1]) return false; + return true; +} + +/* ======================================================================== */ + +Dictionary dictionary_create_default_lang(void) +{ + Dictionary dictionary = NULL; + char * lang = get_default_locale(); /* E.g. ll_CC.UTF_8 or ll-CC */ + + if (lang && *lang) + { + lang[strcspn(lang, "_-")] = '\0'; + dictionary = dictionary_create_lang(lang); + } + free(lang); + + /* Fall back to English if no default locale or no matching dict. */ + if (NULL == dictionary) + { + dictionary = dictionary_create_lang("en"); + } + + return dictionary; +} + +Dictionary dictionary_create_lang(const char * lang) +{ + Dictionary dictionary = NULL; + + object_open(NULL, NULL, NULL); /* Invalidate the directory path cache */ + + /* If an sql database exists, try to read that. */ + if (check_db(lang)) + { + dictionary = dictionary_create_from_db(lang); + } + + /* Fallback to a plain-text dictionary */ + if (NULL == dictionary) + { + dictionary = dictionary_create_from_file(lang); + } + + return dictionary; +} + +const char * dictionary_get_lang(Dictionary dict) +{ + if (!dict) return ""; + return dict->lang; +} + +/* ======================================================================== */ +/* Dictionary lookup stuff */ + +/** + * dictionary_lookup_list() - get list of matching words in the dictionary. + * + * Returns a pointer to a list of dict_nodes for matching words in the + * dictionary. 
+ * + * This list is made up of Dict_nodes, linked by their right pointers. + * The exp, file and string fields are copied from the dictionary. + * + * The returned list must be freed with free_lookup_list(). + */ +Dict_node * dictionary_lookup_list(const Dictionary dict, const char *s) +{ + return dict->lookup_list(dict, s); +} + +Dict_node * dictionary_lookup_wild(const Dictionary dict, const char *s) +{ + return dict->lookup_wild(dict, s); +} + +void free_lookup_list(const Dictionary dict, Dict_node *llist) +{ + dict->free_lookup(dict, llist); +} + +bool boolean_dictionary_lookup(const Dictionary dict, const char *s) +{ + return dict->lookup(dict, s); +} + +/** + * Return true if word is in dictionary, or if word is matched by + * regex. + */ +bool find_word_in_dict(const Dictionary dict, const char * word) +{ + const char * regex_name; + if (boolean_dictionary_lookup (dict, word)) return true; + + regex_name = match_regex(dict->regex_root, word); + if (NULL == regex_name) return false; + + return boolean_dictionary_lookup(dict, regex_name); +} + +/* ======================================================================== */ +/* the following functions are for handling deletion */ + +#ifdef USEFUL_BUT_NOT_CURRENTLY_USED +/** + * Returns true if it finds a non-idiom dict_node in a file that matches + * the string s. + * + * Also sets parent and to_be_deleted appropriately. + * Note: this function is used in only one place: delete_dictionary_words() + * which is, itself, not currently used ... 
+ */ +static bool find_one_non_idiom_node(Dict_node * p, Dict_node * dn, + const char * s, + Dict_node **parent, Dict_node **to_be_deleted) +{ + int m; + if (dn == NULL) return false; + m = dict_order_bare(s, dn); + if (m <= 0) { + if (find_one_non_idiom_node(dn, dn->left, s, parent, to_be_deleted)) return true; + } +/* if ((m == 0) && (!is_idiom_word(dn->string)) && (dn->file != NULL)) { */ + if ((m == 0) && (!is_idiom_word(dn->string))) { + *to_be_deleted = dn; + *parent = p; + return true; + } + if (m >= 0) { + if (find_one_non_idiom_node(dn, dn->right, s, parent, to_be_deleted)) return true; + } + return false; +} + +static void set_parent_of_node(Dictionary dict, + Dict_node *p, + Dict_node * del, + Dict_node * newnode) +{ + if (p == NULL) { + dict->root = newnode; + } else { + if (p->left == del) { + p->left = newnode; + } else if (p->right == del) { + p->right = newnode; + } else { + assert(false, "Dictionary broken?"); + } + } +} + +/** + * This deletes all the non-idiom words of the dictionary that match + * the given string. Returns true if some deleted, false otherwise. + * + * XXX Note: this function is not currently used anywhere in the code, + * but it could be useful for general dictionary editing. 
+ */ +int delete_dictionary_words(Dictionary dict, const char * s) +{ + Dict_node *pred, *pred_parent; + Dict_node *parent, *to_be_deleted; + + if (!find_one_non_idiom_node(NULL, dict->root, s, &parent, &to_be_deleted)) return false; + for(;;) { + /* now parent and to_be_deleted are set */ + if (to_be_deleted->file != NULL) { + to_be_deleted->file->changed = true; + } + if (to_be_deleted->left == NULL) { + set_parent_of_node(dict, parent, to_be_deleted, to_be_deleted->right); + free(to_be_deleted); + } else { + pred_parent = to_be_deleted; + pred = to_be_deleted->left; + while(pred->right != NULL) { + pred_parent = pred; + pred = pred->right; + } + to_be_deleted->string = pred->string; + to_be_deleted->file = pred->file; + to_be_deleted->exp = pred->exp; + set_parent_of_node(dict, pred_parent, pred, pred->left); + free(pred); + } + if (!find_one_non_idiom_node(NULL, dict->root, s, &parent, &to_be_deleted)) return true; + } +} +#endif /* USEFUL_BUT_NOT_CURRENTLY_USED */ + +/** + * The following two functions free the Exp s and the + * E_lists of the dictionary. Not to be confused with + * free_E_list in word-utils.c. 
+ */ +static void free_Elist(E_list * l) +{ + E_list * l1; + + for (; l != NULL; l = l1) { + l1 = l->next; + free(l); + } +} + +void free_Exp_list(Exp_list * eli) +{ + Exp * e1; + Exp * e = eli->exp_list; + for (; e != NULL; e = e1) + { + e1 = e->next; + if (e->type != CONNECTOR_type) + { + free_Elist(e->u.l); + } + free(e); + } +} + +static void free_dict_node_recursive(Dict_node * dn) +{ + if (dn == NULL) return; + free_dict_node_recursive(dn->left); + free_dict_node_recursive(dn->right); + free(dn); +} + +static void free_dictionary(Dictionary dict) +{ + free_dict_node_recursive(dict->root); + free_Word_file(dict->word_file_header); + free_Exp_list(&dict->exp_list); +} + +static void affix_list_delete(Dictionary dict) +{ + int i; + Afdict_class * atc; + for (i=0, atc = dict->afdict_class; i < AFDICT_NUM_ENTRIES; i++, atc++) + { + if (atc->string) free(atc->string); + } + free(dict->afdict_class); + dict->afdict_class = NULL; +} + +void dictionary_delete(Dictionary dict) +{ + if (!dict) return; + + if (verbosity > 0) { + prt_error("Info: Freeing dictionary %s\n", dict->name); + } + +#ifdef USE_CORPUS + lg_corpus_delete(dict->corpus); +#endif + + if (dict->affix_table != NULL) { + affix_list_delete(dict->affix_table); + dictionary_delete(dict->affix_table); + } + spellcheck_destroy(dict->spell_checker); + if ((locale_t) 0 != dict->lctype) { + freelocale(dict->lctype); + } + + condesc_delete(dict); + + if (dict->close) dict->close(dict); + + pp_knowledge_close(dict->base_knowledge); + pp_knowledge_close(dict->hpsg_knowledge); + string_set_delete(dict->string_set); + free((void *)dict->suppress_warning); + free_regexs(dict->regex_root); + free_anysplit(dict); + free_dictionary(dict); + free(dict); + object_open(NULL, NULL, NULL); /* Free the directory path cache */ +} + +/* ======================================================================== */ diff -Nru link-grammar-5.3.16/link-grammar/dict-common/dict-common.h 
link-grammar-5.5.0/link-grammar/dict-common/dict-common.h --- link-grammar-5.3.16/link-grammar/dict-common/dict-common.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/dict-common.h 2018-02-15 03:03:12.000000000 +0000 @@ -0,0 +1,145 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2013, 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _LG_DICT_COMMON_H_ +#define _LG_DICT_COMMON_H_ + +#include "api-types.h" // for pp_knowledge +#include "connectors.h" // for condest_t +#include "dict-structures.h" +#include "utilities.h" // for locale_t + +#define EMPTY_CONNECTOR "ZZZ" +#define UNLIMITED_CONNECTORS_WORD ("UNLIMITED-CONNECTORS") +#define LIMITED_CONNECTORS_WORD ("LENGTH-LIMIT-") + +/* Forward decls */ +typedef struct Afdict_class_struct Afdict_class; +typedef struct Exp_list_s Exp_list; +typedef struct Regex_node_s Regex_node; + +/* Used for memory management */ +struct Exp_list_s +{ + Exp * exp_list; +}; + +typedef struct X_node_struct X_node; +struct X_node_struct +{ + const char * string; /* the word itself */ + Exp * exp; + X_node *next; + const Gword *word; /* originating Wordgraph word */ +}; + +/* The regexes are stored as a linked list of the following nodes. */ +struct Regex_node_s +{ + char *name; /* The identifying name of the regex */ + char *pattern; /* The regular expression pattern */ + bool neg; /* Negate the match */ + void *re; /* The compiled regex. 
void * to avoid + having re library details invading the + rest of the LG system; regex-morph.c + takes care of all matching. + */ + Regex_node *next; +}; + +struct Afdict_class_struct +{ + size_t mem_elems; /* number of memory elements allocated */ + size_t length; /* number of strings */ + char const ** string; +}; + +#define MAX_TOKEN_LENGTH 250 /* Maximum number of chars in a token */ +#define IDIOM_LINK_SZ 5 + +struct Dictionary_s +{ + Dict_node * root; + Regex_node * regex_root; + const char * name; + const char * lang; + const char * version; + const char * locale; /* Locale name */ + locale_t lctype; /* Locale argument for the *_l() functions */ + int num_entries; + + bool use_unknown_word; + bool unknown_word_defined; + bool left_wall_defined; + bool right_wall_defined; + bool shuffle_linkages; + + /* Affixes are used during the tokenization stage. */ + Dictionary affix_table; + Afdict_class * afdict_class; + + /* Random morphology generator */ + struct anysplit_params * anysplit; + + /* If not null, then use spelling guesser for unknown words */ + void * spell_checker; /* spell checker handle */ +#if USE_CORPUS + Corpus * corpus; /* Statistics database */ +#endif +#ifdef HAVE_SQLITE + void * db_handle; /* database handle */ +#endif + + void (*insert_entry)(Dictionary, Dict_node *, int); + Dict_node* (*lookup_list)(Dictionary, const char*); + Dict_node* (*lookup_wild)(Dictionary, const char*); + void (*free_lookup)(Dictionary, Dict_node*); + bool (*lookup)(Dictionary, const char*); + void (*close)(Dictionary); + + pp_knowledge * base_knowledge; /* Core post-processing rules */ + pp_knowledge * hpsg_knowledge; /* Head-Phrase Structure rules */ + Connector_set * unlimited_connector_set; /* NULL=everything is unlimited */ + String_set * string_set; /* Set of link names in the dictionary */ + Word_file * word_file_header; + ConTable contable; + + /* exp_list links together all the Exp structs that are allocated + * in reading this dictionary. 
Needed for freeing the dictionary + */ + Exp_list exp_list; + + /* Private data elements that come in play only while the + * dictionary is being read, and are not otherwise used. + */ + const char * input; + const char * pin; + bool recursive_error; + const char * suppress_warning; + bool is_special; + int already_got_it; /* For char, but needs to hold EOF */ + int line_number; + char current_idiom[IDIOM_LINK_SZ]; + char token[MAX_TOKEN_LENGTH]; +}; +/* The functions here are intended for use by the tokenizer, only, + * and pretty much no one else. If you are not the tokenizer, you + * probably don't need these. */ + +bool find_word_in_dict(const Dictionary dict, const char *); + +Exp * Exp_create(Exp_list *); +void add_empty_word(Dictionary const, X_node *); +void free_Exp_list(Exp_list *); + +#endif /* _LG_DICT_COMMON_H_ */ diff -Nru link-grammar-5.3.16/link-grammar/dict-common/dict-defines.h link-grammar-5.5.0/link-grammar/dict-common/dict-defines.h --- link-grammar-5.3.16/link-grammar/dict-common/dict-defines.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/dict-defines.h 2018-04-27 00:39:43.000000000 +0000 @@ -0,0 +1,50 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _DICT_DEFINES_H_ +#define _DICT_DEFINES_H_ + +/* The following define the names of the special strings in the dictionary. 
*/ +#define LEFT_WALL_WORD ("LEFT-WALL") +#define RIGHT_WALL_WORD ("RIGHT-WALL") + +#define UNKNOWN_WORD "" + +/* Some size definitions. Reduce these for small machines */ +/* MAX_WORD is large, because Unicode entries can use a lot of space */ +#define MAX_WORD 180 /* maximum number of bytes in a word */ + +/* Word subscripts come after the subscript mark (ASCII ETX) + * In the dictionary, a dot is used; but that dot interferes with dots + * in the input stream, and so we convert dictionary dots into the + * subscript mark, which we don't expect to see in user input. + */ +#define SUBSCRIPT_MARK '\3' +#define SUBSCRIPT_DOT '.' + +/* A dictionary directive to suppress dictionary check warnings. + * For example: + * %suppress: DUP-BARE SOME-OTHER ... optional comment + * The chosen symbols for warning suppression should not overlap, + * e.g. if there is a symbol DUP-BARE, there should no symbol "DUP". + * Their effect is until the ending ';' of the following expression. + */ +#define SUPPRESS "suppress: " +#define DUP_BASE "DUP-BASE" /* Allow a base-word + subscripted-same-word. */ + +static inline const char *subscript_mark_str(void) +{ + static const char sm[] = { SUBSCRIPT_MARK, '\0' }; + return sm; +} +#endif diff -Nru link-grammar-5.3.16/link-grammar/dict-common/dict-impl.c link-grammar-5.5.0/link-grammar/dict-common/dict-impl.c --- link-grammar-5.3.16/link-grammar/dict-common/dict-impl.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/dict-impl.c 2018-02-15 03:03:12.000000000 +0000 @@ -0,0 +1,677 @@ + +/*************************************************************************/ +/* Copyright 2013, 2014 Linas Vepstas */ +/* Copyright 2014, 2015 Amir Plivatsky */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. 
*/ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include + +#include "api-types.h" +#include "connectors.h" +#include "dict-affix.h" +#include "dict-api.h" +#include "dict-defines.h" +#include "dict-impl.h" +#include "regex-morph.h" +#include "dict-structures.h" +#include "string-set.h" +#include "utilities.h" + +/* ======================================================================= */ + +// WindowsXP workaround - missing GetLocaleInfoEx +#ifdef _WIN32 +int callGetLocaleInfoEx(LPCWSTR lpLocaleName, LCTYPE LCType, LPWSTR lpLCData, int cchData) +{ + int rc = -1; + + // Normal call + int (WINAPI * pfnGetLocaleInfoEx)(LPCWSTR, LCTYPE, LPWSTR, int); + *(FARPROC*)&pfnGetLocaleInfoEx = GetProcAddress(GetModuleHandleA("Kernel32" ), "GetLocaleInfoEx" ); + if (pfnGetLocaleInfoEx) + { + rc = pfnGetLocaleInfoEx(lpLocaleName, LCType, lpLCData, cchData); + } + else + { + // Workaround for missing GetLocaleInfoEx + HMODULE module = LoadLibraryA("Mlang"); + HRESULT (WINAPI * pfnRfc1766ToLcidW)(LCID*, LPCWSTR); + *(FARPROC*)&pfnRfc1766ToLcidW = GetProcAddress(module, "Rfc1766ToLcidW" ); + if (pfnRfc1766ToLcidW) + { + LCID lcid; + if (SUCCEEDED(pfnRfc1766ToLcidW(&lcid, lpLocaleName))) + { + rc = GetLocaleInfoW(lcid, LCType, lpLCData, cchData); + } + } + FreeLibrary(module); + } + + return rc; +} +#endif //_WIN32 + +/* ======================================================================= */ + +/** + * Format the given locale for use in setlocale(). + * POSIX systems and Windows use different conventions. + * On Windows, convert to full language and territory names, because the + * short ones don't work for some reason on every system (including MinGW). + * @param dict Used for putting the returned value in a string-set. + * @param ll Locale 2-letter language code. 
+ * @param cc Locale 2-letter territory code. + * @return The formatted locale, directly usable in setlocale(). + */ +static const char * format_locale(Dictionary dict, + const char *ll, const char *cc) +{ + unsigned char *locale_ll = (unsigned char *)strdupa(ll); + unsigned char *locale_cc = (unsigned char *)strdupa(cc); + + for (unsigned char *p = locale_ll; '\0' != *p; p++) *p = tolower(*p); + for (unsigned char *p = locale_cc; '\0' != *p; p++) *p = toupper(*p); + +#ifdef _WIN32 + const int locale_size = strlen(ll) + 1 + strlen(cc) + 1; + char *locale = alloca(locale_size); + snprintf(locale, locale_size, "%s-%s", locale_ll, locale_cc); + + wchar_t wlocale[LOCALE_NAME_MAX_LENGTH]; + wchar_t wtmpbuf[LOCALE_NAME_MAX_LENGTH]; + char tmpbuf[LOCALE_NAME_MAX_LENGTH]; + char locale_buf[LOCALE_NAME_MAX_LENGTH]; + size_t r; + + r = mbstowcs(wlocale, locale, LOCALE_NAME_MAX_LENGTH); + if ((size_t)-1 == r) + { + prt_error("Error: Error converting %s to wide character.\n", locale); + return NULL; + } + wlocale[LOCALE_NAME_MAX_LENGTH-1] = L'\0'; + + if (0 >= callGetLocaleInfoEx(wlocale, LOCALE_SENGLISHLANGUAGENAME, + wtmpbuf, LOCALE_NAME_MAX_LENGTH)) + { + prt_error("Error: GetLocaleInfoEx LOCALE_SENGLISHLANGUAGENAME Locale=%s: \n" + "Error %d", locale, (int)GetLastError()); + return NULL; + } + r = wcstombs(tmpbuf, wtmpbuf, LOCALE_NAME_MAX_LENGTH); + if ((size_t)-1 == r) + { + prt_error("Error: Error converting locale language from wide character.\n"); + return NULL; + } + tmpbuf[LOCALE_NAME_MAX_LENGTH-1] = '\0'; + if (0 == strncmp(tmpbuf, "Unknown", 7)) + { + prt_error("Error: Unknown territory code in locale \"%s\"\n", locale); + return NULL; + } + strcpy(locale_buf, tmpbuf); + strcat(locale_buf, "_"); + + if (0 >= callGetLocaleInfoEx(wlocale, LOCALE_SENGLISHCOUNTRYNAME, + wtmpbuf, LOCALE_NAME_MAX_LENGTH)) + { + prt_error("Error: GetLocaleInfoEx LOCALE_SENGLISHCOUNTRYNAME Locale=%s: \n" + "Error %d", locale, (int)GetLastError()); + return NULL; + } + r = wcstombs(tmpbuf, 
wtmpbuf, LOCALE_NAME_MAX_LENGTH); + if ((size_t)-1 == r) + { + prt_error("Error: Error converting locale territory from wide character.\n"); + return NULL; + } + tmpbuf[LOCALE_NAME_MAX_LENGTH-1] = '\0'; + if (0 == strncmp(tmpbuf, "Unknown", 7)) + { + prt_error("Error: Unknown territory code in locale \"%s\"\n", locale); + return NULL; + } + locale = strcat(locale_buf, tmpbuf); +#else /* Assuming POSIX */ + const int locale_size = strlen(ll) + 1 + strlen(cc) + sizeof(".UTF-8"); + char *locale = alloca(locale_size); + snprintf(locale, locale_size, "%s_%s.UTF-8", locale_ll, locale_cc); +#endif + + return string_set_add(locale, dict->string_set); +} + +/* ======================================================================= */ + +/** + * Return a locale for the given dictionary, in the OS format. + * - If is defined, use it. + * - Else use the locale from the environment. + * - On Windows, if no environment locale use the default locale. + * + * : LL4cc+; + * LL is the ISO639 language code in uppercase, + * cc is the ISO3166 territory code in lowercase. + * This particular capitalization is needed for the value to be a + * valid LG connector. + * For transliterated dictionaries: + * : C+; + * + * @param dict The dictionary for which the locale is needed. + * @return The locale, in a format suitable for use by setlocale(). 
+ */ +const char * linkgrammar_get_dict_locale(Dictionary dict) +{ + if (dict->locale) return dict->locale; + + const char *locale; + Dict_node *dn = dict->lookup_list(dict, ""); + + if (NULL == dn) + { + lgdebug(D_USER_FILES, "Debug: Dictionary '%s': Locale is not defined.\n", + dict->name); + goto locale_error; + } + + if (0 == strcmp(dn->exp->u.condesc->string, "C")) + { + locale = string_set_add("C", dict->string_set); + } + else + { + char c; + char locale_ll[4], locale_cc[3]; + int locale_numelement = sscanf(dn->exp->u.condesc->string, "%3[A-Z]4%2[a-z]%c", + locale_ll, locale_cc, &c); + if (2 != locale_numelement) + { + prt_error("Error: \": %s\" " + "should be in the form LL4cc+\n" + "\t(LL: language code; cc: territory code) " + "\tor C+ for transliterated dictionaries.\n", + dn->exp->u.condesc->string); + goto locale_error; + } + + locale = format_locale(dict, locale_ll, locale_cc); + + if (!try_locale(locale)) + { + prt_error("Debug: Dictionary \"%s\": Locale \"%s\" unknown\n", + dict->name, locale); + goto locale_error; + } + } + + dict->free_lookup(dict, dn); + lgdebug(D_USER_FILES, "Debug: Dictionary locale: \"%s\"\n", locale); + dict->locale = locale; + return locale; + +locale_error: + { + dict->free_lookup(dict, dn); + + locale = get_default_locale(); + if (NULL == locale) return NULL; + const char *sslocale = string_set_add(locale, dict->string_set); + free((void *)locale); + prt_error("Info: Dictionary '%s': No locale definition - " + "\"%s\" will be used.\n", dict->name, sslocale); + if (!try_locale(sslocale)) + { + lgdebug(D_USER_FILES, "Debug: Unknown locale \"%s\"...\n", sslocale); + return NULL; + } + return sslocale; + } +} + +/* ======================================================================= */ + +const char * linkgrammar_get_version(void) +{ + const char *s = "link-grammar-" LINK_VERSION_STRING; + return s; +} + +/* ======================================================================= */ + +const char * 
linkgrammar_get_dict_version(Dictionary dict) +{ + char * ver; + char * p; + Dict_node *dn; + Exp *e; + + if (dict->version) return dict->version; + + /* The newer dictionaries should contain a macro of the form: + * : V4v6v6+; + * which would indicate dictionary version 4.6.6 + * Older dictionaries contain no version info. + */ + dn = dict->lookup_list(dict, ""); + if (NULL == dn) return "[unknown]"; + + e = dn->exp; + ver = strdup(&e->u.condesc->string[1]); + p = strchr(ver, 'v'); + while (p) + { + *p = '.'; + p = strchr(p+1, 'v'); + } + + dict->free_lookup(dict, dn); + dict->version = string_set_add(ver, dict->string_set); + free(ver); + return dict->version; +} + +/* ======================================================================= */ + +void dictionary_setup_locale(Dictionary dict) +{ + /* Get the locale for the dictionary. The first one of the + * following which exists, is used: + * 1. The locale which is defined in the dictionary. + * 2. The locale from the environment. + * 3. On Windows - the user's default locale. + * NULL is returned if the locale is not valid. + * Note: + * If we don't have locale_t, as a side effect of checking the locale + * it is set as the program's locale (as desired). However, in that + * case if it is not valid and this is the first dictionary which is + * opened, the program's locale may remain the initial one, i.e. "C" + * (unless the API user changed it). */ + dict->locale = linkgrammar_get_dict_locale(dict); + + /* If the program's locale doesn't have a UTF-8 codeset (e.g. it is + * "C", or because the API user has set it incorrectly) set it to one + * that has it. */ + set_utf8_program_locale(); + + /* If the dictionary locale couldn't be established - then set + * dict->locale so that it is consistent with the current program's + * locale. It will be used as the intended locale of this + * dictionary, and the locale of the compiled regexs. 
*/ + if (NULL == dict->locale) + { + dict->locale = setlocale(LC_CTYPE, NULL); + prt_error("Warning: Couldn't set dictionary locale! " + "Using current program locale \"%s\"\n", dict->locale); + } + + /* setlocale() returns a string owned by the system. Copy it. */ + dict->locale = string_set_add(dict->locale, dict->string_set); + +#ifdef HAVE_LOCALE_T + /* Since linkgrammar_get_dict_locale() (which is called above) + * validates the locale, the following call is guaranteed to succeed. */ + dict->lctype = newlocale_LC_CTYPE(dict->locale); + + /* If dict->locale is still not set, there is a bug. + * Without this assert(), the program may SEGFAULT when it + * uses the isw*() functions. */ + assert((locale_t) 0 != dict->lctype, "Dictionary locale is not set."); +#else + dict->lctype = 0; +#endif /* HAVE_LOCALE_T */ + + /* setlocale() returns a string owned by the system. Copy it. */ + dict->locale = string_set_add(dict->locale, dict->string_set); +} + +void dictionary_setup_defines(Dictionary dict) +{ + dict->left_wall_defined = boolean_dictionary_lookup(dict, LEFT_WALL_WORD); + dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD); + + dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD); + dict->use_unknown_word = true; + + dict->shuffle_linkages = false; + + set_all_condesc_length_limit(dict); +} + +/* ======================================================================= */ +/* ======================================================================= */ + +/* The affix dictionary is represented as a dynamically allocated array with + * an element for each class (connector type) in the affix file. Each element + * has a pointer to an array of strings which are the punctuation/affix + * names. 
*/ + +const char * afdict_classname[] = { AFDICT_CLASSNAMES }; + +/** initialize the affix class table */ +void afclass_init(Dictionary dict) +{ + size_t i; + + dict->afdict_class = + malloc(sizeof(*dict->afdict_class) * AFDICT_NUM_ENTRIES); + for (i = 0; i < AFDICT_NUM_ENTRIES; i++) + { + dict->afdict_class[i].mem_elems = 0; + dict->afdict_class[i].length = 0; + dict->afdict_class[i].string = NULL; + } +} + +/** + * Find the affix table entry for given connector name. + * If the connector name is not in the table, return NULL. + */ +Afdict_class * afdict_find(Dictionary afdict, const char * con, bool notify_err) +{ + const char ** ac; + + for (ac = afdict_classname; + ac < &afdict_classname[ARRAY_SIZE(afdict_classname)]; ac++) + { + if (0 == strcmp(*ac, con)) + return &afdict->afdict_class[ac - afdict_classname]; + } + if (notify_err) { + prt_error("Warning: Unknown class name %s found near line %d of %s.\n" + "\tThis class name will be ignored.\n", + con, afdict->line_number, afdict->name); + } + return NULL; +} + +#define AFFIX_COUNT_MEM_INCREMENT 64 + +void affix_list_add(Dictionary afdict, Afdict_class * ac, + const char * affix) +{ + if (NULL == ac) return; /* ignore unknown class name */ + if (ac->mem_elems <= ac->length) + { + size_t new_sz; + ac->mem_elems += AFFIX_COUNT_MEM_INCREMENT; + new_sz = ac->mem_elems * sizeof(const char *); + ac->string = (char const **) realloc((void *)ac->string, new_sz); + } + ac->string[ac->length] = string_set_add(affix, afdict->string_set); + ac->length++; +} + +#ifdef AFDICT_ORDER_NOT_PRESERVED +static int revcmplen(const void *a, const void *b) +{ + return strlen(*(char * const *)b) - strlen(*(char * const *)a); +} +#endif /* AFDICT_ORDER_NOT_PRESERVED */ + +/** + * Traverse the main dict in dictionary order, and extract all the suffixes + * and prefixes - every time we see a new suffix/prefix (the previous one is + * remembered by w_last), we save it in the corresponding affix-class list. 
+ * The saved affixes don't include the infix mark. + */ +static void get_dict_affixes(Dictionary dict, Dict_node * dn, + char infix_mark, char * w_last) +{ + const char *w; /* current dict word */ + const char *w_sm; /* SUBSCRIPT_MARK position in the dict word */ + size_t w_len; /* length of the dict word */ + Dictionary afdict = dict->affix_table; + + if (dn == NULL) return; + get_dict_affixes(dict, dn->right, infix_mark, w_last); + + w = dn->string; + w_sm = strrchr(w, SUBSCRIPT_MARK); + w_len = (NULL == w_sm) ? strlen(w) : (size_t)(w_sm - w); + if (w_len > MAX_WORD) + { + prt_error("Error: word '%s' too long (%zd), program may malfunction\n", + w, w_len); + w_len = MAX_WORD; + } + /* (strlen(w_last) can be cached for speedup) */ + if ((strlen(w_last) != w_len) || (0 != strncmp(w_last, w, w_len))) + { + strncpy(w_last, w, w_len); + w_last[w_len] = '\0'; + + if (infix_mark == w_last[0]) + { + affix_list_add(afdict, &afdict->afdict_class[AFDICT_SUF], w_last+1); + } + else + if (infix_mark == w_last[w_len-1]) + { + w_last[w_len-1] = '\0'; + affix_list_add(afdict, &afdict->afdict_class[AFDICT_PRE], w_last); + w_last[w_len-1] = infix_mark; + } + } + + get_dict_affixes(dict, dn->left, infix_mark, w_last); +} + +/** + * Concatenate the definitions for the given affix class. + * This allows specifying the characters in different definitions + * instead in a one long string, e.g. instead of: + * ""«»《》【】『』`„": QUOTES+; + * One can specify (note the added spaces): + * """ «» 《》 【】 『』 ` „: QUOTES+; + * Or even: + * """: QUOTES+; + * «» : QUOTES+; + * etc. + * Note that if there are no definitions or only one definition, there is + * nothing to do. + * The result is written to the first entry. + * @param classno The given affix class. 
+ */ +static void concat_class(Dictionary afdict, int classno) +{ + Afdict_class * ac; + size_t i; + dyn_str * qs; + + ac = AFCLASS(afdict, classno); + if (1 >= ac->length) return; + + qs = dyn_str_new(); + for (i = 0; i < ac->length; i++) + dyn_strcat(qs, ac->string[i]); + + ac->string[0] = string_set_add(qs->str, afdict->string_set); + dyn_str_delete(qs); +} + +/** + * Compare lengths of strings, for affix class qsort. + * Sort order: + * 1. Longest base words first. + * 2. Equal base words one after the other. + */ +static int split_order(const void *a, const void *b) +{ + const char * const *sa = a; + const char * const *sb = b; + + size_t len_a = strcspn(*sb, subscript_mark_str()); + size_t len_b = strcspn(*sa, subscript_mark_str()); + + int len_order = (int)(len_a - len_b); + if (0 == len_order) return strncmp(*sa, *sb, len_a); + + return len_order; +} + +/** + * Initialize several classes. + * In case of a future dynamic change of the affix table, this function needs to + * be invoked again after the affix table is re-constructed (changes may be + * needed - especially to first free memory and initialize the affix dict + * structure.). + */ +#define D_AI (D_DICT+1) +bool afdict_init(Dictionary dict) +{ + Afdict_class * ac; + Dictionary afdict = dict->affix_table; + + /* FIXME: read_entry() builds word lists in reverse order (can we + * just create the list top-down without breaking anything?). Unless + * it is fixed to preserve the order, reverse here the word list for + * each affix class. 
*/ + for (ac = afdict->afdict_class; + ac < &afdict->afdict_class[ARRAY_SIZE(afdict_classname)]; ac++) + { + int i; + int l = ac->length - 1; + const char * t; + + for (i = 0; i < l; i++, l--) + { + t = ac->string[i]; + ac->string[i] = ac->string[l]; + ac->string[l] = t; + } + } + + /* Create the affix lists */ + ac = AFCLASS(afdict, AFDICT_INFIXMARK); + if ((1 < ac->length) || ((1 == ac->length) && (1 != strlen(ac->string[0])))) + { + prt_error("Error: afdict_init: Invalid value for class %s in file %s" + " (should have been one ASCII punctuation - ignored)\n", + afdict_classname[AFDICT_INFIXMARK], afdict->name); + free((void *)ac->string); + ac->length = 0; + ac->mem_elems = 0; + ac->string = NULL; + } + /* XXX For now there is a possibility to use predefined SUF and PRE lists. + * So if SUF or PRE are defined, don't extract any of them from the dict. */ + if (1 == ac->length) + { + if ((0 == AFCLASS(afdict, AFDICT_PRE)->length) && + (0 == AFCLASS(afdict, AFDICT_SUF)->length)) + { + char last_entry[MAX_WORD+1] = ""; + get_dict_affixes(dict, dict->root, ac->string[0][0], last_entry); + } + } + else + { + /* No INFIX_MARK - create a dummy one that always mismatches */ + affix_list_add(afdict, &afdict->afdict_class[AFDICT_INFIXMARK], ""); + } + + /* Store the SANEMORPHISM regex in the unused (up to now) + * regex_root element of the affix dictionary, and precompile it */ + assert(NULL == afdict->regex_root, "SM regex is already assigned"); + ac = AFCLASS(afdict, AFDICT_SANEMORPHISM); + if (0 != ac->length) + { + int rc; + + Regex_node *sm_re = malloc(sizeof(*sm_re)); + dyn_str *rebuf = dyn_str_new(); + + /* The regex used to be converted to: ^((original-regex)b)+$ + * In the initial wordgraph version word boundaries are not supported, + * so instead it is converted to: ^(original-regex)+$ */ +#ifdef WORD_BOUNDARIES + dyn_strcat(rebuf, "^(("); +#else + dyn_strcat(rebuf, "^("); +#endif + dyn_strcat(rebuf, ac->string[0]); +#ifdef WORD_BOUNDARIES + dyn_strcat(rebuf, 
")b)+$"); +#else + dyn_strcat(rebuf, ")+$"); +#endif + sm_re->pattern = strdup(rebuf->str); + dyn_str_delete(rebuf); + + afdict->regex_root = sm_re; + sm_re->name = strdup(afdict_classname[AFDICT_SANEMORPHISM]); + sm_re->re = NULL; + sm_re->next = NULL; + sm_re->neg = false; + rc = compile_regexs(afdict->regex_root, afdict); + if (rc) { + prt_error("Error: afdict_init: Failed to compile " + "regex '%s' in file %s, return code %d\n", + afdict_classname[AFDICT_SANEMORPHISM], afdict->name, rc); + return false; + } + lgdebug(+D_AI, "%s regex %s\n", + afdict_classname[AFDICT_SANEMORPHISM], sm_re->pattern); + } + + /* Sort the affix-classes of tokens to be stripped. */ + /* Longer unit names must get split off before shorter ones. + * This prevents single-letter splits from screwing things + * up. e.g. split 7gram before 7am before 7m. + * Another example: The ellipsis "..." must appear before the dot ".". + */ + afdict_classnum af[] = {AFDICT_UNITS, AFDICT_LPUNC, AFDICT_RPUNC, AFDICT_MPUNC}; + for (size_t i = 0; i < ARRAY_SIZE(af); i++) + { + ac = AFCLASS(afdict, af[i]); + if (0 < ac->length) + { + qsort(ac->string, ac->length, sizeof(char *), split_order); + } + } + +#ifdef AFDICT_ORDER_NOT_PRESERVED + /* pre-sort the MPRE list */ + ac = AFCLASS(afdict, AFDICT_MPRE); + if (0 < ac->length) + { + /* Longer subwords have priority over shorter ones, + * reverse-sort by length. + * XXX mprefix_split() for Hebrew depends on that. 
*/ + qsort(ac->string, ac->length, sizeof(char *), revcmplen); + } +#endif /* AFDICT_ORDER_NOT_PRESERVED */ + + concat_class(afdict, AFDICT_QUOTES); + concat_class(afdict, AFDICT_BULLETS); + + if (verbosity_level(D_AI)) + { + size_t l; + + for (ac = afdict->afdict_class; + ac < &afdict->afdict_class[ARRAY_SIZE(afdict_classname)]; ac++) + { + if (0 == ac->length) continue; + lgdebug(+0, "Class %s, %zd items:", + afdict_classname[ac-afdict->afdict_class], ac->length); + for (l = 0; l < ac->length; l++) + lgdebug(0, " '%s'", ac->string[l]); + lgdebug(0, "\n\\"); + } + lg_error_flush(); + } + + return true; +} +#undef D_AI diff -Nru link-grammar-5.3.16/link-grammar/dict-common/dict-impl.h link-grammar-5.5.0/link-grammar/dict-common/dict-impl.h --- link-grammar-5.3.16/link-grammar/dict-common/dict-impl.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/dict-impl.h 2017-11-10 19:30:23.000000000 +0000 @@ -0,0 +1,19 @@ + +#include "link-includes.h" +#include "dict-common/dict-common.h" // For Afdict_class +#include "utilities.h" + +// Already declared in link-includes.h +// const char * linkgrammar_get_dict_locale(Dictionary dict); +// const char * linkgrammar_get_version(void); +// const char * linkgrammar_get_dict_version(Dictionary dict); + +void dictionary_setup_locale(Dictionary dict); +void dictionary_setup_defines(Dictionary dict); +void afclass_init(Dictionary dict); +bool afdict_init(Dictionary dict); +void affix_list_add(Dictionary afdict, Afdict_class *, const char *); + +#ifdef __MINGW32__ +int callGetLocaleInfoEx(LPCWSTR, LCTYPE, LPWSTR, int); +#endif /* __MINGW32__ */ diff -Nru link-grammar-5.3.16/link-grammar/dict-common/dict-structures.h link-grammar-5.5.0/link-grammar/dict-common/dict-structures.h --- link-grammar-5.3.16/link-grammar/dict-common/dict-structures.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/dict-structures.h 2018-04-26 02:38:14.000000000 +0000 @@ -0,0 +1,88 @@ 
+/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _LG_DICT_STRUCTURES_H_ +#define _LG_DICT_STRUCTURES_H_ + +#include "link-grammar/link-features.h" +#include "link-includes.h" + +LINK_BEGIN_DECLS + +/* Forward decls */ +typedef struct Dict_node_struct Dict_node; +typedef struct Exp_struct Exp; +typedef struct E_list_struct E_list; +typedef struct Word_file_struct Word_file; +typedef struct condesc_struct condesc_t; + +/** + * Types of Exp_struct structures + */ +typedef enum +{ + OR_type = 1, + AND_type, + CONNECTOR_type +} Exp_type; + +/** + * The E_list and Exp structures defined below comprise the expression + * trees that are stored in the dictionary. The expression has a type + * (OR_type, AND_type or CONNECTOR_type). If it is not a terminal it + * has a list (an E_list) of children. Else "string" is the connector, + * and "dir" indicates its direction. + */ +struct Exp_struct +{ + Exp * next; /* Used only for memory management, for freeing */ + Exp_type type; /* One of three types: AND, OR, or connector. */ + char dir; /* The connector connects to: '-': the left; '+': the right */ + bool multi; /* TRUE if a multi-connector (for connector) */ + union { + E_list * l; /* Only needed for non-terminals */ + condesc_t * condesc; /* Only needed if it's a connector */ + } u; + double cost; /* The cost of using this expression. 
+ Only used for non-terminals */ +}; + +struct E_list_struct +{ + E_list * next; + Exp * e; +}; + +/* API to access the above structure. */ +static inline Exp_type lg_exp_get_type(const Exp* exp) { return exp->type; } +static inline char lg_exp_get_dir(const Exp* exp) { return exp->dir; } +static inline bool lg_exp_get_multi(const Exp* exp) { return exp->multi; } +const char* lg_exp_get_string(const Exp*); +static inline double lg_exp_get_cost(const Exp* exp) { return exp->cost; } + +/** + * The dictionary is stored as a binary tree comprised of the following + * nodes. A list of these (via right pointers) is used to return + * the result of a dictionary lookup. + */ +struct Dict_node_struct +{ + const char * string; /* The word itself */ + Word_file * file; /* The file the word came from (NULL if dict file) */ + Exp * exp; + Dict_node *left, *right; +}; + +LINK_END_DECLS + +#endif /* _LG_DICT_STRUCTURES_H_ */ diff -Nru link-grammar-5.3.16/link-grammar/dict-common/dict-utils.c link-grammar-5.5.0/link-grammar/dict-common/dict-utils.c --- link-grammar-5.3.16/link-grammar/dict-common/dict-utils.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/dict-utils.c 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,344 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ +/* + * Miscellaneous utilities for dealing with word types. 
+ */ + +#include // for fabs() + +#include "connectors.h" +#include "dict-api.h" +#include "string-set.h" +#include "dict-utils.h" + +/* ======================================================== */ +/* Public API ... */ + +const char * lg_exp_get_string(const Exp* exp) +{ + return exp->u.condesc->string; +} + +/* ======================================================== */ +/* Exp utilities ... */ + +void free_E_list(E_list *); +void free_Exp(Exp * e) +{ + // Exp might be null if the user has a bad dict. e.g. badly formed + // SQL dict. + if (NULL == e) return; + if (e->type != CONNECTOR_type) { + free_E_list(e->u.l); + } + free(e); +} + +void free_E_list(E_list * l) +{ + if (l == NULL) return; + free_E_list(l->next); + free_Exp(l->e); + free(l); +} + +/* Returns the number of connectors in the expression e */ +int size_of_expression(Exp * e) +{ + int size; + E_list * l; + if (e->type == CONNECTOR_type) return 1; + size = 0; + for (l=e->u.l; l!=NULL; l=l->next) { + size += size_of_expression(l->e); + } + return size; +} + +/** + * Build a copy of the given expression (don't copy strings, of course) + */ +static E_list * copy_E_list(E_list * l); +Exp * copy_Exp(Exp * e) +{ + Exp * n; + if (e == NULL) return NULL; + n = malloc(sizeof(Exp)); + *n = *e; + if (e->type != CONNECTOR_type) { + n->u.l = copy_E_list(e->u.l); + } + return n; +} + +static E_list * copy_E_list(E_list * l) +{ + E_list * nl; + if (l == NULL) return NULL; + nl = malloc(sizeof(E_list)); + nl->next = copy_E_list(l->next); + nl->e = copy_Exp(l->e); + return nl; +} + +/** + * Compare two expressions, return true for equal, false for unequal + */ +static bool exp_compare(Exp * e1, Exp * e2) +{ + E_list *el1, *el2; + + if ((e1 == NULL) && (e2 == NULL)) + return 1; /* they are equal */ + if ((e1 == NULL) || (e2 == NULL)) + return 0; /* they are not equal */ + if (e1->type != e2->type) + return 0; + if (fabs (e1->cost - e2->cost) > 0.001) + return 0; + if (e1->type == CONNECTOR_type) + { + if (e1->dir != 
e2->dir) + return 0; + /* printf("%s %s\n",e1->u.condesc->string,e2->u.condesc->string); */ + if (e1->u.condesc != e2->u.condesc) + return 0; + } + else + { + el1 = e1->u.l; + el2 = e2->u.l; + /* while at least 1 is non-null */ + for (;(el1!=NULL)||(el2!=NULL);) { + /*fail if 1 is null */ + if ((el1==NULL)||(el2==NULL)) + return 0; + /* fail if they are not compared */ + if (!exp_compare(el1->e, el2->e)) + return 0; + el1 = el1->next; + el2 = el2->next; + } + } + return 1; /* if never returned 0, return 1 */ +} + +/** + * Sub-expression matcher -- return 1 if sub is non-NULL and + * contained in super, 0 otherwise. + */ +static int exp_contains(Exp * super, Exp * sub) +{ + E_list * el; + +#if 0 /* DEBUG */ + printf("SUP: "); + if (super) print_expression(super); + printf("\n"); +#endif + + if (sub==NULL || super==NULL) + return 0; + if (exp_compare(sub,super)) + return 1; + if (super->type==CONNECTOR_type) + return 0; /* super is a leaf */ + + /* proceed through supers children and return 1 if sub + is contained in any of them */ + for(el = super->u.l; el!=NULL; el=el->next) { + if (exp_contains(el->e, sub)==1) + return 1; + } + return 0; +} + +/* ======================================================== */ +/* X_node utilities ... */ +/** + * frees the list of X_nodes pointed to by x, and all of the expressions + */ +void free_X_nodes(X_node * x) +{ + X_node * y; + for (; x!= NULL; x = y) { + y = x->next; + free_Exp(x->exp); + free(x); + } +} + +/** + * Destructively catenates the two disjunct lists d1 followed by d2. + * Doesn't change the contents of the disjuncts. + * Traverses the first list, but not the second. + */ +X_node * catenate_X_nodes(X_node *d1, X_node *d2) +{ + X_node * dis = d1; + + if (d1 == NULL) return d2; + if (d2 == NULL) return d1; + while (dis->next != NULL) dis = dis->next; + dis->next = d2; + return d1; +} + +/* ======================================================== */ +/* More connector utilities ... 
*/ + +/** + * word_has_connector() -- return TRUE if dictionary expression has connector + * This function takes a dict_node (corresponding to an entry in a + * given dictionary), a string (representing a connector), and a + * direction (+ = right-pointing, '-' = left-pointing); it returns true + * if the dictionary expression for the word includes the connector, + * false otherwise. This can be used to see if a word is in a certain + * category (checking for a category connector in a table), or to see + * if a word has a connector in a normal dictionary. The connector + * check uses a "smart-match", the same kind used by the parser. + */ +#if CRAZY_OBESE_CHECKING_AGLO +bool word_has_connector(Dict_node * dn, const char * cs, char direction) +{ + Connector * c2 = NULL; + Disjunct *d, *d0; + if (dn == NULL) return false; + d0 = d = build_disjuncts_for_dict_node(dn); + if (d == NULL) return false; + for (; d != NULL; d = d->next) { + if (direction == '+') c2 = d->right; + if (direction == '-') c2 = d->left; + for (; c2 != NULL; c2 = c2->next) { + if (easy_match(c2->string, cs)) { + free_disjuncts(d0); + return true; + } + } + } + free_disjuncts(d0); + return false; +} +#else /* CRAZY_OBESE_CHECKING_AGLO */ + +/** + * Return true if the given expression has the given connector. + * The connector cs argument must originally be in the dictionary string set. + */ +static bool exp_has_connector(const Exp * e, int depth, const char * cs, + char direction, bool smart_match) +{ + E_list * el; + if (e->type == CONNECTOR_type) + { + if (direction != e->dir) return false; + return smart_match ? 
easy_match(e->u.condesc->string, cs) + : string_set_cmp(e->u.condesc->string, cs); + } + + if (depth == 0) return false; + if (depth > 0) depth--; + + for (el = e->u.l; el != NULL; el = el->next) + { + if (exp_has_connector(el->e, depth, cs, direction, smart_match)) + return true; + } + return false; +} + +bool word_has_connector(Dict_node * dn, const char * cs, char direction) +{ + return exp_has_connector(dn->exp, -1, cs, direction, /*smart_match*/true); +} +#endif /* CRAZY_OBESE_CHECKING_AGLO */ + +/** + * Find if an expression has a connector ZZZ- (that an empty-word has). + * This is a costly way to find it. To reduce the overhead, the + * exp_has_connector() "depth" argument limits the expression depth check, + * supposing the ZZZ- connectors are not deep in the word expression. + * FIXME? A cheaper way is to have a dictionary entry which lists such + * words, or to mark such words at dictionary read time. + **/ +bool is_exp_like_empty_word(Dictionary dict, Exp *exp) +{ + const char *cs = string_set_lookup(EMPTY_CONNECTOR, dict->string_set); + if (NULL == cs) return false; + return exp_has_connector(exp, 2, cs, '-', /*smart_match*/false); +} + +/** + * If word has a connector, return it. + * If word has more than one connector, return NULL. + */ +const char * word_only_connector(Dict_node * dn) +{ + Exp * e = dn->exp; + if (CONNECTOR_type == e->type) + return e->u.condesc->string; + return NULL; +} + +/* ======================================================== */ +/* Dictionary utilities ... 
*/ + +static bool dn_word_contains(Dictionary dict, + Dict_node * w_dn, const char * macro) +{ + Exp * m_exp; + Dict_node *m_dn; + + if (w_dn == NULL) return false; + + m_dn = dictionary_lookup_list(dict, macro); + if (m_dn == NULL) return false; + + m_exp = m_dn->exp; + +#if 0 /* DEBUG */ + printf("\nWORD: "); + print_expression(w_dn->exp); + printf("\nMACR: "); + print_expression(m_exp); + printf("\n"); +#endif + + for (;w_dn != NULL; w_dn = w_dn->right) + { + if (1 == exp_contains(w_dn->exp, m_exp)) + { + free_lookup_list(dict, m_dn); + return true; + } + } + free_lookup_list(dict, m_dn); + return false; +} + +/** + * word_contains: return true if the word may involve application of + * a rule. + * + * @return: true if word's expression contains macro's expression, + * false otherwise. + */ +bool word_contains(Dictionary dict, const char * word, const char * macro) +{ + Dict_node *w_dn = dictionary_lookup_list(dict, word); + bool ret = dn_word_contains(dict, w_dn, macro); + free_lookup_list(dict, w_dn); + return ret; +} + +/* ========================= END OF FILE ============================== */ diff -Nru link-grammar-5.3.16/link-grammar/dict-common/dict-utils.h link-grammar-5.5.0/link-grammar/dict-common/dict-utils.h --- link-grammar-5.3.16/link-grammar/dict-common/dict-utils.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/dict-utils.h 2018-01-24 01:57:32.000000000 +0000 @@ -0,0 +1,35 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2009, 2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#ifndef _DICT_UTILS_H_ +#define _DICT_UTILS_H_ + +#include "dict-common.h" + +/* Exp utilities ... */ +void free_Exp(Exp *); +void free_E_list(E_list *); +int size_of_expression(Exp *); +Exp * copy_Exp(Exp *); +bool is_exp_like_empty_word(Dictionary dict, Exp *); + +/* X_node utilities ... */ +X_node * catenate_X_nodes(X_node *, X_node *); +void free_X_nodes(X_node *); + +/* Dictionary utilities ... */ +bool word_has_connector(Dict_node *, const char *, char); +const char * word_only_connector(Dict_node *); +bool word_contains(Dictionary dict, const char * word, const char * macro); + +#endif /* _DICT_UTILS_H_ */ diff -Nru link-grammar-5.3.16/link-grammar/dict-common/file-utils.c link-grammar-5.5.0/link-grammar/dict-common/file-utils.c --- link-grammar-5.3.16/link-grammar/dict-common/file-utils.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/file-utils.c 2018-04-25 17:04:50.000000000 +0000 @@ -0,0 +1,394 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright 2008, 2009, 2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#include +#include +#include // fstat() + +#ifndef _WIN32 + #include +#else + #include + #include // PathRemoveFileSpecA() + #include // getcwd() +#endif /* _WIN32 */ + +#include +#include + +#include "file-utils.h" +#include "error.h" // verbosity_level() +#include "link-includes.h" +#include "utilities.h" + +#ifdef _WIN32 + #define DIR_SEPARATOR "\\" +#else + #define DIR_SEPARATOR "/" +#endif /*_WIN32 */ + +#define IS_DIR_SEPARATOR(ch) (DIR_SEPARATOR[0] == (ch)) +#if !defined(DICTIONARY_DIR) || defined(__MINGW32__) + #define DEFAULTPATH NULL +#else + #define DEFAULTPATH DICTIONARY_DIR +#endif + +/* =========================================================== */ +/* File path and dictionary open routines below */ + +#define MAX_PATH_NAME 200 /* file names (including paths) + should not be longer than this */ + +char * join_path(const char * prefix, const char * suffix) +{ + char * path; + size_t path_len, prel; + + path_len = strlen(prefix) + 1 /* len(DIR_SEPARATOR) */ + strlen(suffix); + path = (char *) malloc(path_len + 1); + + strcpy(path, prefix); + + /* Windows is allergic to multiple path separators, so append one + * only if the prefix isn't already terminated by a path sep. + */ + prel = strlen(path); + if (0 < prel && path[prel-1] != DIR_SEPARATOR[0]) + { + path[prel] = DIR_SEPARATOR[0]; + path[prel+1] = '\0'; + } + strcat(path, suffix); + + return path; +} + +/* global - but that's OK, since this is set only during initialization, + * and is is thenceforth a read-only item. So it doesn't need to be + * locked. XXX Assuming the user actually does set it only once, + * and doesn't hammer on this inside of a thread loop, in which case + * it will race and abort with a double-free error. 
+ */ +static char * custom_data_dir = NULL; + +static void free_custom_data_dir(void) +{ + free(custom_data_dir); +} + +void dictionary_set_data_dir(const char * path) +{ + if (custom_data_dir) + free(custom_data_dir); + else + atexit(free_custom_data_dir); + custom_data_dir = safe_strdup(path); +} + +char * dictionary_get_data_dir(void) +{ + char * data_dir = NULL; + + if (custom_data_dir != NULL) { + data_dir = safe_strdup(custom_data_dir); + return data_dir; + } + +#ifdef _WIN32 + /* Dynamically locate invocation directory of our program. + * Non-ASCII characters are not supported (files will not be found). */ + char prog_path[MAX_PATH_NAME]; + + if (!GetModuleFileNameA(NULL, prog_path, sizeof(prog_path))) + { + prt_error("Warning: GetModuleFileName error %d\n", (int)GetLastError()); + } + else + { + if (NULL == prog_path) + { + /* Can it happen? */ + prt_error("Warning: GetModuleFileName returned a NULL program path!\n"); + } + else + { + if (!PathRemoveFileSpecA(prog_path)) + { + prt_error("Warning: Cannot get directory from program path '%s'!\n", + prog_path); + } + else + { + /* Unconvertible characters are marked as '?' */ + const char *unsupported = (NULL != strchr(prog_path, '?')) ? + " (containing unsupported character)" : ""; + + lgdebug(D_USER_FILES, "Debug: Directory of executable: %s%s\n", + unsupported, prog_path); + data_dir = safe_strdup(prog_path); + } + } + } +#endif /* _WIN32 */ + + return data_dir; +} + +static void *dict_file_open(const char *fullname, const void *how) +{ + return fopen(fullname, how); +} + +/** + * Locate a data file and open it. + * + * This function is used to open a dictionary file or a word file, + * or any associated data file (like a post process knowledge file). + * + * It works as follows. If the file name begins with a "/", then + * it's assumed to be an absolute file name and it tries to open + * that exact file. 
+ * + * Otherwise, it looks for the file in a sequence of directories, as + * specified in the dictpath array, until it finds it. + * + * If it is still not found, it may be that the user specified a relative + * path, so it tries to open the exact file. + * + * Associated data files are looked in the *same* directory in which the + * first one was found (typically "en/4.0.dict"). The private static + * "path_found" serves as a directory path cache which records where the + * first file was found. The goal here is to avoid insanity due to + * user's fractured installs. + * If the filename argument is NULL, the function just invalidates this + * directory path cache. + */ +#define NOTFOUND(fp) ((NULL == (fp)) ? " (Not found)" : "") +void * object_open(const char *filename, + void * (*opencb)(const char *, const void *), + const void * user_data) +{ + /* Dictionary data directory path cache -- per-thread storage. */ + static TLS char *path_found; + char *completename = NULL; + void *fp = NULL; + char *data_dir = NULL; + const char **path = NULL; + + if (NULL == filename) + { + /* Invalidate the dictionary data directory path cache. */ + char *pf = path_found; + path_found = NULL; + free(pf); + return NULL; + } + + if (NULL == path_found) + { + data_dir = dictionary_get_data_dir(); + if (verbosity_level(D_USER_FILES)) + { + char cwd[MAX_PATH_NAME]; + char *cwdp = getcwd(cwd, sizeof(cwd)); + prt_error("Debug: Current directory: %s\n", NULL == cwdp ? "NULL": cwdp); + prt_error("Debug: Last-resort data directory: %s\n", + data_dir ? data_dir : "NULL"); + } + } + + /* Look for absolute filename. + * Unix: starts with leading slash. + * Windows: starts with C:\ except that the drive letter may differ. */ + if ((filename[0] == '/') +#ifdef _WIN32 + || ((filename[1] == ':') + && ((filename[2] == '\\') || (filename[2] == '/'))) + || (filename[0] == '\\') /* UNC path */ +#endif /* _WIN32 */ + ) + { + /* opencb() returns NULL if the file does not exist. 
*/ + fp = opencb(filename, user_data); + lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", filename, NOTFOUND(fp)); + } + else + { + /* A path list in which to search for dictionaries. + * path_found, data_dir or DEFAULTPATH may be NULL. */ + const char *dictpath[] = + { + path_found, + ".", + "." DIR_SEPARATOR "data", + "..", + ".." DIR_SEPARATOR "data", + data_dir, + DEFAULTPATH, + }; + size_t i = sizeof(dictpath)/sizeof(dictpath[0]); + + for (path = dictpath; i-- > 0; path++) + { + if (NULL == *path) continue; + + free(completename); + completename = join_path(*path, filename); + fp = opencb(completename, user_data); + lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", completename, NOTFOUND(fp)); + if ((NULL != fp) || (NULL != path_found)) break; + } + } + + if (NULL == fp) + { + fp = opencb(filename, user_data); + lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", filename, NOTFOUND(fp)); + } + else if (NULL == path_found) + { + char *pfnd = strdup((NULL != completename) ? completename : filename); + if ((0 < verbosity) && (dict_file_open == opencb)) + prt_error("Info: Dictionary found at %s\n", pfnd); + for (size_t i = 0; i < 2; i++) + { + char *root = strrchr(pfnd, DIR_SEPARATOR[0]); + if (NULL != root) *root = '\0'; + } + path_found = pfnd; + } + + free(data_dir); + free(completename); + return fp; +} +#undef NOTFOUND + +FILE *dictopen(const char *filename, const char *how) +{ + return object_open(filename, dict_file_open, how); +} + +/* + * XXX - dict_file_open() cannot be used due to the Info printout + * of opening a dictionary. + */ +static void *data_file_open(const char *fullname, const void *how) +{ + return fopen(fullname, how); +} + +/** + * Open a file in the dictionary search path. + * Experimental API (may be unstable). + * @param filename Filename to be opened. + * @return FILE pointer, or NULL if the file was no found. 
+ */ +FILE *linkgrammar_open_data_file(const char *filename) +{ + object_open(NULL, NULL, NULL); /* Invalidate the directory path cache */ + return object_open(filename, data_file_open, "r"); +} + +/* ======================================================== */ + +/** + * Check to see if a file exists. + */ +bool file_exists(const char * dict_name) +{ + bool retval = false; + int fd; + struct stat buf; + + /* On Windows, 'b' (binary mode) is mandatory, otherwise fstat file length + * is confused by crlf counted as one byte. POSIX systems just ignore it. */ + FILE *fp = dictopen(dict_name, "rb"); + + if (fp == NULL) + return false; + + /* Get the file size, in bytes. */ + fd = fileno(fp); + fstat(fd, &buf); + if (0 < buf.st_size) retval = true; + + fclose(fp); + return retval; +} + +/** + * Read in the whole stinkin file. This routine returns + * malloced memory, which should be freed as soon as possible. + */ +char *get_file_contents(const char * dict_name) +{ + int fd; + size_t tot_size; + size_t tot_read = 0; + struct stat buf; + char * contents; + + /* On Windows, 'b' (binary mode) is mandatory, otherwise fstat file length + * is confused by crlf counted as one byte. POSIX systems just ignore it. */ + FILE *fp = dictopen(dict_name, "rb"); + + if (fp == NULL) + return NULL; + + /* Get the file size, in bytes. */ + fd = fileno(fp); + fstat(fd, &buf); + tot_size = buf.st_size; + + contents = (char *) malloc(sizeof(char) * (tot_size+7)); + + /* Now, read the whole file. + * Normally, a single fread() call below reads the whole file. 
*/ + while (1) + { + size_t read_size = fread(contents, 1, tot_size+7, fp); + + if (0 == read_size) + { + bool err = (0 != ferror(fp)); + + if (err) + { + char errbuf[64]; + + strerror_r(errno, errbuf, sizeof(errbuf)); + fclose(fp); + prt_error("Error: %s: Read error (%s)\n", dict_name, errbuf); + free(contents); + return NULL; + } + fclose(fp); + break; + } + tot_read += read_size; + } + + if (tot_read > tot_size+6) + { + prt_error("Error: %s: File size is insane (%zu)!\n", dict_name, tot_size); + free(contents); + return NULL; + } + + contents[tot_read] = '\0'; + return contents; +} + +/* ============================================================= */ diff -Nru link-grammar-5.3.16/link-grammar/dict-common/file-utils.h link-grammar-5.5.0/link-grammar/dict-common/file-utils.h --- link-grammar-5.3.16/link-grammar/dict-common/file-utils.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/file-utils.h 2017-05-28 11:14:43.000000000 +0000 @@ -0,0 +1,29 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2009-2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ +#ifndef _DICT_FILE_UTILITIES_H_ +#define _DICT_FILE_UTILITIES_H_ + +#include +#include + +char * join_path(const char * prefix, const char * suffix); + +FILE * dictopen(const char *filename, const char *how); +void * object_open(const char *filename, + void * (*opencb)(const char *, const void *), + const void * user_data); + +bool file_exists(const char * dict_name); +char * get_file_contents(const char *filename); + +#endif /* _DICT_FILE_UTILITIES_H_ */ diff -Nru link-grammar-5.3.16/link-grammar/dict-common/idiom.c link-grammar-5.5.0/link-grammar/dict-common/idiom.c --- link-grammar-5.3.16/link-grammar/dict-common/idiom.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/idiom.c 2018-04-25 17:04:50.000000000 +0000 @@ -0,0 +1,336 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include "api-structures.h" +#include "api-types.h" +#include "dict-api.h" +#include "dict-common.h" +#include "dict-defines.h" // For MAX_WORD +#include "error.h" +#include "idiom.h" +#include "string-set.h" + +/** + * Find if a string signifies an idiom. + * Returns true if the string contains an underbar character. + * The check of s[0] prevents inclusion of '_'. In that case no check for + * length=1 is done because it is not going to be a valid idiom anyway. 
+ * + * If the underbar character is preceded by a backslash, it is not + * considered. The subscript, if exists, is not checked. + * + * FIXME: Words with '\' escaped underbars that contain also unescaped + * ones are not supported. + */ +bool contains_underbar(const char * s) +{ + if ((s[0] == '_') || (s[0] == '\0')) return false; + while (*++s != '\0') + { + if (*s == SUBSCRIPT_MARK) return false; + if ((*s == '_') && (s[-1] != '\\')) return true; + } + return false; +} + +/** + * Returns false if it is not a correctly formed idiom string. + * Such a string is correct if it consists of non-empty strings + * separated by '_'. + */ +static bool is_idiom_string(const char * s) +{ + size_t len; + const char * t; + + len = strlen(s); + if ((s[0] == '_') || (s[len-1] == '_')) + { + return false; + } + + for (t = s; *t != '\0'; t++) + { + if (*s == SUBSCRIPT_MARK) return true; + if ((*t == '_') && (*(t+1) == '_')) return false; + } + return true; +} + +/** + * Return true if the string s is a sequence of digits. + */ +static bool is_number(const char *s) +{ + while(*s != '\0') { + if (!isdigit(*s)) return false; + s++; + } + return true; +} + +/** + * If the string contains a SUBSCIPT_MARK, and ends in ".Ix" where + * x is a number, return x. Return -1 if not of this form. + */ +static int numberfy(const char * s) +{ + s = strrchr(s, SUBSCRIPT_MARK); + if (NULL == s) return -1; + if (*++s != 'I') return -1; + if (!is_number(++s)) return -1; + return atoi(s); +} + +/** + * Look for words that end in ".Ix" where x is a number. + * Return the largest x found. + */ +static int max_postfix_found(Dict_node * d) +{ + int i, j; + i = 0; + while(d != NULL) { + j = numberfy(d->string); + if (j > i) i = j; + d = d->right; + } + return i; +} + +/** + * build_idiom_word_name() -- return idiomized name of given string. + * + * Allocates string space and returns a pointer to it. + * In this string is placed the idiomized name of the given string s. 
+ * This is the same as s, but with a postfix of ".Ix", where x is an + * appropriate number. x is the minimum number that distinguishes + * this word from others in the dictionary. + */ +static const char * build_idiom_word_name(Dictionary dict, const char * s) +{ + char buff[2*MAX_WORD]; + size_t bufsz = 2*MAX_WORD; + int count; + + Dict_node *dn = dictionary_lookup_list(dict, s); + count = max_postfix_found(dn) + 1; + free_lookup_list(dict, dn); + + size_t l = lg_strlcpy(buff, s, bufsz); + snprintf(buff+l, bufsz-l, "%cI%d", SUBSCRIPT_MARK, count); + + return string_set_add(buff, dict->string_set); +} + +/** + * Tear the idiom string apart. + * Put the parts into a list of Dict_nodes (connected by their right pointers) + * Sets the string fields of these Dict_nodes pointing to the + * fragments of the string s. Later these will be replaced by + * correct names (with .Ix suffixes). + * The list is reversed from the way they occur in the string. + * A pointer to this list is returned. + * This function is called after is_idiom_string() ensures the validity + * of the given string. + */ +static Dict_node * make_idiom_Dict_nodes(Dictionary dict, const char * string) +{ + Dict_node * dn = NULL; + char * s = strdupa(string); + const char * t; + const char *sm = strchr(s, SUBSCRIPT_MARK); + + for (t = s; NULL != s; t = s) + { + s = strchr(s, '_'); + if ((NULL != sm) && (s > sm)) s = NULL; + if (NULL != s) *s++ = '\0'; + Dict_node *dn_new = (Dict_node *) malloc(sizeof (Dict_node)); + dn_new->right = dn; + dn = dn_new; + dn->string = string_set_add(t, dict->string_set); + dn->file = NULL; + } + + return dn; +} + +static void increment_current_name(Dictionary dict) +{ + int i = IDIOM_LINK_SZ-2; + + do + { + dict->current_idiom[i]++; + if (dict->current_idiom[i] <= 'Z') return; + dict->current_idiom[i] = 'A'; + } while (i-- > 0); + assert(0, "increment_current_name: Overflow"); +} + +/** + * Generate a new connector name obtained from the current_name. 
+ * allocate string space for it. + * @return a pointer to connector name. + */ +static const char * generate_id_connector(Dictionary dict) +{ + char buff[IDIOM_LINK_SZ+4]; + unsigned int i; + char * t; + + for (i=0; dict->current_idiom[i] == 'A'; i++) + ; + /* i is now the number of characters of current_name to skip */ + t = buff; + + /* All idiom connector names start with the two letters "ID" */ + *t++ = 'I'; + *t++ = 'D'; + for (; i < IDIOM_LINK_SZ; i++ ) + { + *t++ = dict->current_idiom[i] ; + } + *t++ = '\0'; + return string_set_add(buff, dict->string_set); +} + +/** + * Takes as input a pointer to a Dict_node. + * The string of this Dict_node is an idiom string. + * This string is torn apart, and its components are inserted into the + * dictionary as special idiom words (ending in .I*, where * is a number). + * The expression of this Dict_node (its node field) has already been + * read and constructed. This will be used to construct the special idiom + * expressions. + * The given dict node is freed. The string is also freed. 
+ */ +void insert_idiom(Dictionary dict, Dict_node * dn) +{ + Exp * nc, * no, * n1; + E_list *ell, *elr; + const char * s; + Dict_node * dn_list, * xdn, * start_dn_list; + + no = dn->exp; + s = dn->string; + + if (!is_idiom_string(s)) + { + prt_error("Warning: Word \"%s\" on line %d " + "is not a correctly formed idiom string.\n" + "\tThis word will be ignored\n", + s, dict->line_number); + + return; + } + + dn_list = start_dn_list = make_idiom_Dict_nodes(dict, s); + + assert(dn_list->right != NULL, "Idiom string with only one connector"); + + /* first make the nodes for the base word of the idiom (last word) */ + /* note that the last word of the idiom is first in our list */ + + /* ----- this code just sets up the node fields of the dn_list ----*/ + nc = Exp_create(&dict->exp_list); + nc->u.condesc = condesc_add(&dict->contable, generate_id_connector(dict)); + nc->dir = '-'; + nc->multi = false; + nc->type = CONNECTOR_type; + nc->cost = 0; + + n1 = Exp_create(&dict->exp_list); + n1->u.l = ell = (E_list *) malloc(sizeof(E_list)); + ell->next = elr = (E_list *) malloc(sizeof(E_list)); + elr->next = NULL; + ell->e = nc; + elr->e = no; + n1->type = AND_type; + n1->cost = 0; + + dn_list->exp = n1; + + dn_list = dn_list->right; + + while(dn_list->right != NULL) + { + /* generate the expression for a middle idiom word */ + + n1 = Exp_create(&dict->exp_list); + n1->type = AND_type; + n1->cost = 0; + n1->u.l = ell = (E_list *) malloc(sizeof(E_list)); + ell->next = elr = (E_list *) malloc(sizeof(E_list)); + elr->next = NULL; + + nc = Exp_create(&dict->exp_list); + nc->u.condesc = condesc_add(&dict->contable, generate_id_connector(dict)); + nc->dir = '+'; + nc->multi = false; + nc->type = CONNECTOR_type; + nc->cost = 0; + elr->e = nc; + + increment_current_name(dict); + + nc = Exp_create(&dict->exp_list); + nc->u.condesc = condesc_add(&dict->contable, generate_id_connector(dict)); + nc->dir = '-'; + nc->multi = false; + nc->type = CONNECTOR_type; + nc->cost = 0; + + ell->e 
= nc; + + dn_list->exp = n1; + + dn_list = dn_list->right; + } + /* now generate the last one */ + + nc = Exp_create(&dict->exp_list); + nc->u.condesc = condesc_add(&dict->contable, generate_id_connector(dict)); + nc->dir = '+'; + nc->multi = false; + nc->type = CONNECTOR_type; + nc->cost = 0; + + dn_list->exp = nc; + + increment_current_name(dict); + + /* ---- end of the code alluded to above ---- */ + + /* now its time to insert them into the dictionary */ + + dn_list = start_dn_list; + + while (dn_list != NULL) + { + xdn = dn_list->right; + dn_list->left = dn_list->right = NULL; + dn_list->string = build_idiom_word_name(dict, dn_list->string); + dict->root = insert_dict(dict, dict->root, dn_list); + dict->num_entries++; + dn_list = xdn; + } +} + +/** + * returns true if this is a word ending in ".Ix", where x is a number. + */ +bool is_idiom_word(const char * s) +{ + return (numberfy(s) != -1) ; +} diff -Nru link-grammar-5.3.16/link-grammar/dict-common/idiom.h link-grammar-5.5.0/link-grammar/dict-common/idiom.h --- link-grammar-5.3.16/link-grammar/dict-common/idiom.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/idiom.h 2018-01-11 08:10:07.000000000 +0000 @@ -0,0 +1,18 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#include "dict-structures.h" +#include "link-includes.h" + +void insert_idiom(Dictionary dict, Dict_node *); +bool contains_underbar(const char *); +bool is_idiom_word(const char *); diff -Nru link-grammar-5.3.16/link-grammar/dict-common/print-dict.c link-grammar-5.5.0/link-grammar/dict-common/print-dict.c --- link-grammar-5.3.16/link-grammar/dict-common/print-dict.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/print-dict.c 2018-03-11 23:13:19.000000000 +0000 @@ -0,0 +1,398 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright 2013, 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include + +#include "api-structures.h" // for Parse_Options_s (seems hacky to me) +#include "dict-common.h" +#include "dict-defines.h" +#include "print/print.h" +#include "print/print-util.h" +#include "regex-morph.h" +#include "dict-file/word-file.h" +#include "dict-file/read-dict.h" + + +/* ======================================================================== */ + +/* INFIX_NOTATION is always defined; we simply never use the format below. */ +/* #if ! 
defined INFIX_NOTATION */ +#if 0 +/** + * print the expression, in prefix-style + */ +void print_expression(Exp * n) +{ + E_list * el; + int i, icost; + + if (n == NULL) + { + printf("NULL expression"); + return; + } + + icost = (int) (n->cost); + if (n->type == CONNECTOR_type) + { + for (i=0; imulti) printf("@"); + printf("%s%c", n->u.string, n->dir); + for (i=0; i 0) printf(" "); + } + else + { + for (i=0; itype == AND_type) printf("& "); + if (n->type == OR_type) printf("or "); + for (el = n->u.l; el != NULL; el = el->next) + { + print_expression(el->e); + } + for (i=0; i 0) printf(" "); + if (icost == 0) printf(") "); + } +} + +#else /* INFIX_NOTATION */ + +#define COST_FMT "%.3f" +/** + * print the expression, in infix-style + */ +static dyn_str *print_expression_parens(dyn_str *e, + const Exp * n, int need_parens) +{ + E_list * el; + int i, icost; + double dcost; + + if (n == NULL) + { + append_string(e, "NULL expression"); + return e; + } + + icost = (int) (n->cost); + dcost = n->cost - icost; + if (dcost > 10E-4) + { + dcost = n->cost; + icost = 1; + } + else + { + dcost = 0; + } + + /* print the connector only */ + if (n->type == CONNECTOR_type) + { + for (i=0; imulti) dyn_strcat(e, "@"); + append_string(e, "%s%c", n->u.condesc?n->u.condesc->string:"(null)", n->dir); + for (i=0; iu.l; + if (el == NULL) + { + for (i=0; itype == OR_type) && el->e && (NULL == el->e->u.l)) + { + dyn_strcat(e, "{"); + if (NULL == el->next) dyn_strcat(e, "error-no-next"); + else print_expression_parens(e, el->next->e, false); + append_string(e, "}"); + return e; + } + + if ((icost == 0) && need_parens) dyn_strcat(e, "("); + + /* print left side of binary expr */ + print_expression_parens(e, el->e, true); + + /* get a funny "and optional" when its a named expression thing. 
*/ + if ((n->type == AND_type) && (el->next == NULL)) + { + for (i=0; itype == AND_type) dyn_strcat(e, " & "); + if (n->type == OR_type) dyn_strcat(e, " or "); + + /* print right side of binary expr */ + el = el->next; + if (el == NULL) + { + dyn_strcat(e, "()"); + } + else + { + if (el->e->type == n->type) + { + print_expression_parens(e, el->e, false); + } + else + { + print_expression_parens(e, el->e, true); + } + if (el->next != NULL) + { + // dyn_strcat(e, "\nERROR! Unexpected list!\n"); + /* The SAT parser just naively joins all X_node expressions + * using "or", and this check used to give an error due to that, + * preventing a convenient debugging. + * Just accept it (but mark it with '!'). */ + if (n->type == AND_type) dyn_strcat(e, " &! "); + if (n->type == OR_type) dyn_strcat(e, " or! "); + print_expression_parens(e, el->next->e, true); + } + } + + for (i=0; itype == AND_type) + { + /* multiplicative combinatorial explosion */ + cnt = 1; + for (e_list = e->u.l; e_list != NULL; e_list = e_list->next) + cnt *= count_clause(e_list->e); + } + else if (e->type == OR_type) + { + /* Just additive */ + for (e_list = e->u.l; e_list != NULL; e_list = e_list->next) + cnt += count_clause(e_list->e); + } + else if (e->type == CONNECTOR_type) + { + return 1; + } + else + { + assert(false, "an expression node with no type"); + } + + return cnt; +} + +/** + * Count number of disjuncts given the dict node dn. + */ +static unsigned int count_disjunct_for_dict_node(Dict_node *dn) +{ + return (NULL == dn) ? 
0 : count_clause(dn->exp); +} + +#define DJ_COL_WIDTH sizeof(" ") + +/** + * Display the number of disjuncts associated with this dict node + */ +static char *display_counts(const char *word, Dict_node *dn) +{ + dyn_str *s = dyn_str_new(); + + append_string(s, "matches:\n"); + for (; dn != NULL; dn = dn->right) + { + append_string(s, " %-*s %8u disjuncts", + display_width(DJ_COL_WIDTH, dn->string), dn->string, + count_disjunct_for_dict_node(dn)); + + if (dn->file != NULL) + { + append_string(s, " <%s>", dn->file->file); + } + append_string(s, "\n\n"); + } + return dyn_str_take(s); +} + +/** + * Display the number of disjuncts associated with this dict node + */ +static char *display_expr(const char *word, Dict_node *dn) +{ + dyn_str *s = dyn_str_new(); + + append_string(s, "expressions:\n"); + for (; dn != NULL; dn = dn->right) + { + char *expstr = expression_stringify(dn->exp); + + append_string(s, " %-*s %s", + display_width(DJ_COL_WIDTH, dn->string), dn->string, + expstr); + free(expstr); + append_string(s, "\n\n"); + } + return dyn_str_take(s); +} + +static char *display_word_info(Dictionary dict, const char * word) +{ + const char * regex_name; + Dict_node *dn_head; + + dn_head = dictionary_lookup_wild(dict, word); + if (dn_head) + { + char *out = display_counts(word, dn_head); + free_lookup_list(dict, dn_head); + return out; + } + + /* Recurse, if it's a regex match */ + regex_name = match_regex(dict->regex_root, word); + if (regex_name) + { + return display_word_info(dict, regex_name); + } + + return NULL; +} + +static char *display_word_expr(Dictionary dict, const char * word) +{ + const char * regex_name; + Dict_node *dn_head; + + dn_head = dictionary_lookup_wild(dict, word); + if (dn_head) + { + char *out = display_expr(word, dn_head); + free_lookup_list(dict, dn_head); + return out; + } + + /* Recurse, if it's a regex match */ + regex_name = match_regex(dict->regex_root, word); + if (regex_name) + { + return display_word_expr(dict, regex_name); + } + + 
return NULL; +} + +/** + * dict_display_word_info() - display the information about the given word. + */ +char *dict_display_word_info(Dictionary dict, const char * word, + Parse_Options opts) +{ + return display_word_split(dict, word, opts, display_word_info); +} + +/** + * dict_display_word_expr() - display the connector info for a given word. + */ +char *dict_display_word_expr(Dictionary dict, const char * word, Parse_Options opts) +{ + return display_word_split(dict, word, opts, display_word_expr); +} diff -Nru link-grammar-5.3.16/link-grammar/dict-common/regex-morph.c link-grammar-5.5.0/link-grammar/dict-common/regex-morph.c --- link-grammar-5.3.16/link-grammar/dict-common/regex-morph.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/regex-morph.c 2018-03-04 14:56:46.000000000 +0000 @@ -0,0 +1,162 @@ +/*************************************************************************/ +/* Copyright (c) 2005 Sampo Pyysalo */ +/* Copyright (c) 2009 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +/* On MS Windows, regex.h fails to pull in size_t, so work around this by + * including before ( is not enough) */ +#include +#if HAVE_TRE_TRE_H +#include +#else +#include +#endif +#include "api-structures.h" +#include "error.h" /* verbosity */ +#include "externs.h" /* lgdebug() */ +#include "dict-common/dict-api.h" +#include "dict-common/dict-common.h" +#include "dict-common/regex-morph.h" +#include "link-includes.h" + + +/** + * Support for the regular-expression based token matching system + * using standard POSIX regex. 
+ */ + +/** + * Notify an error message according to the error code. + */ +static void prt_regerror(const char *msg, const Regex_node *re, int rc) +{ + const size_t errbuf_size = regerror(rc, re->re, NULL, 0); + char * const errbuf = malloc(errbuf_size); + + /* + prt_error("Error: Failed to compile regex '%s' (%s) at %d: %s\n", + re->pattern, re->name, erroroffset, error); + */ + regerror(rc, re->re, errbuf, errbuf_size); + prt_error("Error: %s: \"%s\" (%s): %s\n", msg, re->pattern, re->name, errbuf); + free(errbuf); +} + +/** + * Compiles all the given regexs. Returns 0 on success, + * else an error code. + */ +int compile_regexs(Regex_node *re, Dictionary dict) +{ + regex_t *preg; + int rc; + + while (re != NULL) + { + /* If re->re non-null, assume compiled already. */ + if(re->re == NULL) + { + /* Compile with default options (0) and default character + * tables (NULL). */ + /* re->re = pcre_compile(re->pattern, 0, &error, &erroroffset, NULL); */ + preg = (regex_t *) malloc (sizeof(regex_t)); + re->re = preg; + + /* REG_ENHANCED is needed for OS X to support \w etc. */ +#ifndef REG_ENHANCED +#define REG_ENHANCED 0 +#endif + rc = regcomp(preg, re->pattern, REG_NOSUB|REG_EXTENDED|REG_ENHANCED); + if (rc) + { + prt_regerror("Failed to compile regex", re, rc); + return rc; + } + + /* Check that the regex name is defined in the dictionary. */ + if ((NULL != dict) && !boolean_dictionary_lookup(dict, re->name)) + { + /* TODO: better error handing. Maybe remove the regex? */ + prt_error("Error: Regex name %s not found in dictionary!\n", + re->name); + } + } + re = re->next; + } + return 0; +} + +/** + * Tries to match each regex in turn to word s. + * On match, returns the name of the first matching regex. + * If no match is found, returns NULL. + */ +#define D_MRE 6 +const char *match_regex(const Regex_node *re, const char *s) +{ + int rc; + const char *nre_name; + + while (re != NULL) + { + /* Make sure the regex has been compiled. 
*/ + assert(re->re); + +#if 0 + /* Try to match with no extra data (NULL), whole str + * (0 to strlen(s)), and default options (second 0). */ + int rc = pcre_exec(re->re, NULL, s, strlen(s), 0, + 0, ovector, PCRE_OVEC_SIZE); +#endif + + rc = regexec((regex_t*) re->re, s, 0, NULL, 0); + if (0 == rc) + { + lgdebug(+D_MRE, "%s%s %s\n", &"!"[!re->neg], re->name, s); + if (!re->neg) + return re->name; /* Match found - return--no multiple matches. */ + + /* Negative match - skip this regex name. */ + for (nre_name = re->name; re->next != NULL; re = re->next) + { + if (strcmp(nre_name, re->next->name) != 0) break; + } + } + else if (rc != REG_NOMATCH) + { + /* We have an error. */ + prt_regerror("Regex matching error", re, rc); + } + re = re->next; + } + return NULL; /* No matches. */ +} +#undef D_MRE + +/** + * Delete associated storage + */ +void free_regexs(Regex_node *re) +{ + while (re != NULL) + { + Regex_node *next = re->next; + + /* Prevent a crash in regfree() in case of a regex compilation error. */ + if (NULL != re->re) + regfree((regex_t *)re->re); + + free(re->re); + free(re->name); + free(re->pattern); + free(re); + re = next; + } +} diff -Nru link-grammar-5.3.16/link-grammar/dict-common/regex-morph.h link-grammar-5.5.0/link-grammar/dict-common/regex-morph.h --- link-grammar-5.3.16/link-grammar/dict-common/regex-morph.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common/regex-morph.h 2017-05-31 07:51:33.000000000 +0000 @@ -0,0 +1,20 @@ +/*************************************************************************/ +/* Copyright (c) 2005 Sampo Pyysalo */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#ifndef _REGEX_MORPH_H +#define _REGEX_MORPH_H + +#include "dict-common.h" + +int compile_regexs(Regex_node *, Dictionary); +const char *match_regex(const Regex_node *, const char *); +void free_regexs(Regex_node *); +#endif /* _REGEX_MORPH_H */ diff -Nru link-grammar-5.3.16/link-grammar/dict-common.c link-grammar-5.5.0/link-grammar/dict-common.c --- link-grammar-5.3.16/link-grammar/dict-common.c 2017-02-17 21:27:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-common.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,542 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright 2008, 2009, 2012-2014 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include "anysplit.h" -#include "dict-api.h" -#include "dict-common.h" -#include "externs.h" -#include "pp_knowledge.h" -#include "regex-morph.h" -#include "spellcheck.h" -#include "string-set.h" -#include "structures.h" -#include "word-utils.h" -#include "dict-sql/read-sql.h" -#include "dict-file/read-dict.h" -#include "dict-file/word-file.h" - -/* ======================================================================== */ -/* Affix type finding */ - -/** - * Return TRUE if the word is a suffix. - * - * Suffixes have the form =asdf.asdf or possibly just =asdf without - * the dot (subscript mark). The "null" suffixes have the form - * =.asdf (always with the subscript mark, as there are several). 
- * Ordinary equals signs appearing in regular text are either = or =[!]. - */ -bool is_suffix(const char infix_mark, const char* w) -{ - if (infix_mark != w[0]) return false; - if (1 >= strlen(w)) return false; - if (0 == strncmp("[!", w+1, 2)) return false; -#if SUBSCRIPT_MARK == '.' - /* Hmmm ... equals signs look like suffixes, but they are not ... */ - if (0 == strcmp("=.v", w)) return false; - if (0 == strcmp("=.eq", w)) return false; -#endif - return true; -} - -/** - * Return TRUE if the word seems to be in stem form. - * Stems are signified by including = sign which is preceded by the subscript - * mark. Examples (. represented the subscript mark): word.= word.=[!] - */ -bool is_stem(const char* w) -{ - const char *subscrmark = strchr(w, SUBSCRIPT_MARK); - - if (NULL == subscrmark) return false; - if (subscrmark == w) return false; - if (STEM_MARK != subscrmark[1]) return false; - return true; -} - -/* ======================================================================== */ -/* Replace the right-most dot with SUBSCRIPT_MARK */ -void patch_subscript(char * s) -{ - char *ds, *de; - int dp; - ds = strrchr(s, SUBSCRIPT_DOT); - if (!ds) return; - - /* a dot at the end or a dot followed by a number is NOT - * considered a subscript */ - de = ds + 1; - if (*de == '\0') return; - dp = (int) *de; - - /* If its followed by a UTF8 char, its NOT a subscript */ - if (127 < dp || dp < 0) return; - /* assert ((0 < dp) && (dp <= 127), "Bad dictionary entry!"); */ - if (isdigit(dp)) return; - *ds = SUBSCRIPT_MARK; -} - -/* ======================================================================== */ - -Dictionary dictionary_create_default_lang(void) -{ - Dictionary dictionary = NULL; - char * lang = get_default_locale(); /* E.g. ll_CC.UTF_8 or ll-CC */ - - if (lang && *lang) - { - lang[strcspn(lang, "_-")] = '\0'; - dictionary = dictionary_create_lang(lang); - } - free(lang); - - /* Fall back to English if no default locale or no matching dict. 
*/ - if (NULL == dictionary) - { - dictionary = dictionary_create_lang("en"); - } - - return dictionary; -} - -Dictionary dictionary_create_lang(const char * lang) -{ - Dictionary dictionary = NULL; - - object_open(NULL, NULL, NULL); /* Invalidate the directory path cache */ - - /* If an sql database exists, try to read that. */ - if (check_db(lang)) - { - dictionary = dictionary_create_from_db(lang); - } - - /* Fallback to a plain-text dictionary */ - if (NULL == dictionary) - { - dictionary = dictionary_create_from_file(lang); - } - - return dictionary; -} - -const char * dictionary_get_lang(Dictionary dict) -{ - if (!dict) return ""; - return dict->lang; -} - -/* ======================================================================== */ -/* Dictionary lookup stuff */ - -/** - * dictionary_lookup_list() - get list of matching words in the dictionary. - * - * Returns a pointer to a list of dict_nodes for matching words in the - * dictionary. Matches include words that appear in idioms. To exclude - * idioms, use abridged_lookup_list() to obtain matches. - * - * This list is made up of Dict_nodes, linked by their right pointers. - * The exp, file and string fields are copied from the dictionary. - * - * The returned list must be freed with free_lookup_list(). - */ -Dict_node * dictionary_lookup_list(const Dictionary dict, const char *s) -{ - return dict->lookup_list(dict, s); -} - -void free_lookup_list(const Dictionary dict, Dict_node *llist) -{ - dict->free_lookup(dict, llist); -} - -bool boolean_dictionary_lookup(const Dictionary dict, const char *s) -{ - return dict->lookup(dict, s); -} - -/** - * Return true if word is in dictionary, or if word is matched by - * regex. 
- */ -bool find_word_in_dict(const Dictionary dict, const char * word) -{ - const char * regex_name; - if (boolean_dictionary_lookup (dict, word)) return true; - - regex_name = match_regex(dict->regex_root, word); - if (NULL == regex_name) return false; - - return boolean_dictionary_lookup(dict, regex_name); -} - -/* ======================================================================== */ -/* the following functions are for handling deletion */ - -#ifdef USEFUL_BUT_NOT_CURRENTLY_USED -/** - * Returns true if it finds a non-idiom dict_node in a file that matches - * the string s. - * - * Also sets parent and to_be_deleted appropriately. - * Note: this function is used in only one place: delete_dictionary_words() - * which is, itself, not currently used ... - */ -static bool find_one_non_idiom_node(Dict_node * p, Dict_node * dn, - const char * s, - Dict_node **parent, Dict_node **to_be_deleted) -{ - int m; - if (dn == NULL) return false; - m = dict_order_bare(s, dn); - if (m <= 0) { - if (find_one_non_idiom_node(dn, dn->left, s, parent, to_be_deleted)) return true; - } -/* if ((m == 0) && (!is_idiom_word(dn->string)) && (dn->file != NULL)) { */ - if ((m == 0) && (!is_idiom_word(dn->string))) { - *to_be_deleted = dn; - *parent = p; - return true; - } - if (m >= 0) { - if (find_one_non_idiom_node(dn, dn->right, s, parent, to_be_deleted)) return true; - } - return false; -} - -static void set_parent_of_node(Dictionary dict, - Dict_node *p, - Dict_node * del, - Dict_node * newnode) -{ - if (p == NULL) { - dict->root = newnode; - } else { - if (p->left == del) { - p->left = newnode; - } else if (p->right == del) { - p->right = newnode; - } else { - assert(false, "Dictionary broken?"); - } - } -} - -/** - * This deletes all the non-idiom words of the dictionary that match - * the given string. Returns true if some deleted, false otherwise. 
- * - * XXX Note: this function is not currently used anywhere in the code, - * but it could be useful for general dictionary editing. - */ -int delete_dictionary_words(Dictionary dict, const char * s) -{ - Dict_node *pred, *pred_parent; - Dict_node *parent, *to_be_deleted; - - if (!find_one_non_idiom_node(NULL, dict->root, s, &parent, &to_be_deleted)) return false; - for(;;) { - /* now parent and to_be_deleted are set */ - if (to_be_deleted->file != NULL) { - to_be_deleted->file->changed = true; - } - if (to_be_deleted->left == NULL) { - set_parent_of_node(dict, parent, to_be_deleted, to_be_deleted->right); - free_dict_node(to_be_deleted); - } else { - pred_parent = to_be_deleted; - pred = to_be_deleted->left; - while(pred->right != NULL) { - pred_parent = pred; - pred = pred->right; - } - to_be_deleted->string = pred->string; - to_be_deleted->file = pred->file; - to_be_deleted->exp = pred->exp; - set_parent_of_node(dict, pred_parent, pred, pred->left); - free_dict_node(pred); - } - if (!find_one_non_idiom_node(NULL, dict->root, s, &parent, &to_be_deleted)) return true; - } -} -#endif /* USEFUL_BUT_NOT_CURRENTLY_USED */ - -/** - * The following two functions free the Exp s and the - * E_lists of the dictionary. Not to be confused with - * free_E_list in word-utils.c. 
- */ -static void free_Elist(E_list * l) -{ - E_list * l1; - - for (; l != NULL; l = l1) { - l1 = l->next; - xfree(l, sizeof(E_list)); - } -} - -static inline void exp_free(Exp * e) -{ - xfree((char *)e, sizeof(Exp)); -} - -static inline void free_dict_node(Dict_node *dn) -{ - xfree((char *)dn, sizeof(Dict_node)); -} - -void free_Exp_list(Exp_list * eli) -{ - Exp * e1; - Exp * e = eli->exp_list; - for (; e != NULL; e = e1) - { - e1 = e->next; - if (e->type != CONNECTOR_type) - { - free_Elist(e->u.l); - } - exp_free(e); - } -} - -static void free_dict_node_recursive(Dict_node * dn) -{ - if (dn == NULL) return; - free_dict_node_recursive(dn->left); - free_dict_node_recursive(dn->right); - free_dict_node(dn); -} - -static void free_dictionary(Dictionary dict) -{ - free_dict_node_recursive(dict->root); - free_Word_file(dict->word_file_header); - free_Exp_list(&dict->exp_list); -} - -static void affix_list_delete(Dictionary dict) -{ - int i; - Afdict_class * atc; - for (i=0, atc = dict->afdict_class; i < AFDICT_NUM_ENTRIES; i++, atc++) - { - if (atc->string) free(atc->string); - } - free(dict->afdict_class); - dict->afdict_class = NULL; -} - -void dictionary_delete(Dictionary dict) -{ - if (!dict) return; - - if (verbosity > 0) { - prt_error("Info: Freeing dictionary %s\n", dict->name); - } - -#ifdef USE_CORPUS - lg_corpus_delete(dict->corpus); -#endif - - if (dict->affix_table != NULL) { - affix_list_delete(dict->affix_table); - dictionary_delete(dict->affix_table); - } - spellcheck_destroy(dict->spell_checker); - if ((locale_t) 0 != dict->lctype) { - freelocale(dict->lctype); - } - - connector_set_delete(dict->unlimited_connector_set); - - if (dict->close) dict->close(dict); - - pp_knowledge_close(dict->base_knowledge); - pp_knowledge_close(dict->hpsg_knowledge); - string_set_delete(dict->string_set); - free_regexs(dict->regex_root); - free_anysplit(dict); - free_dictionary(dict); - xfree(dict, sizeof(struct Dictionary_s)); - object_open(NULL, NULL, NULL); /* Free the 
directory path cache */ -} - -/* ======================================================================== */ - -/* INFIX_NOTATION is always defined; we simply never use the format below. */ -/* #if ! defined INFIX_NOTATION */ -#if 0 -/** - * print the expression, in prefix-style - */ -void print_expression(Exp * n) -{ - E_list * el; - int i, icost; - - if (n == NULL) - { - printf("NULL expression"); - return; - } - - icost = (int) (n->cost); - if (n->type == CONNECTOR_type) - { - for (i=0; imulti) printf("@"); - printf("%s%c", n->u.string, n->dir); - for (i=0; i 0) printf(" "); - } - else - { - for (i=0; itype == AND_type) printf("& "); - if (n->type == OR_type) printf("or "); - for (el = n->u.l; el != NULL; el = el->next) - { - print_expression(el->e); - } - for (i=0; i 0) printf(" "); - if (icost == 0) printf(") "); - } -} - -#else /* INFIX_NOTATION */ - -#define COST_FMT "%.3f" -/** - * print the expression, in infix-style - */ -static void print_expression_parens(const Exp * n, int need_parens) -{ - E_list * el; - int i, icost; - double dcost; - - if (n == NULL) - { - err_msg(lg_Debug, "NULL expression"); - return; - } - - icost = (int) (n->cost); - dcost = n->cost - icost; - if (dcost > 10E-4) - { - dcost = n->cost; - icost = 1; - } - else - { - dcost = 0; - } - - /* print the connector only */ - if (n->type == CONNECTOR_type) - { - for (i=0; imulti) err_msg(lg_Debug, "@"); - err_msg(lg_Debug, "%s%c", n->u.string, n->dir); - for (i=0; iu.l; - if (el == NULL) - { - for (i=0; itype == OR_type) && - el && el->e && (NULL == el->e->u.l)) - { - prt_error ("{"); - if (NULL == el->next) err_msg(lg_Debug, "error-no-next"); - else print_expression_parens(el->next->e, false); - prt_error ("}"); - return; - } - - if ((icost == 0) && need_parens) err_msg(lg_Debug, "("); - - /* print left side of binary expr */ - print_expression_parens(el->e, true); - - /* get a funny "and optional" when its a named expression thing. 
*/ - if ((n->type == AND_type) && (el->next == NULL)) - { - for (i=0; itype == AND_type) err_msg(lg_Debug, " & "); - if (n->type == OR_type) err_msg(lg_Debug, " or "); - - /* print right side of binary expr */ - el = el->next; - if (el == NULL) - { - prt_error ("()"); - } - else - { - if (el->e->type == n->type) - { - print_expression_parens(el->e, false); - } - else - { - print_expression_parens(el->e, true); - } - if (el->next != NULL) - { - // prt_error ("\nERROR! Unexpected list!\n"); - /* The SAT parser just naively joins all X_node expressions - * using "or", and this check used to give an error due to that, - * preventing a convenient debugging. - * Just accept it (but mark it with '!'). */ - if (n->type == AND_type) err_msg(lg_Debug, " &! "); - if (n->type == OR_type) err_msg(lg_Debug, " or! "); - print_expression_parens(el->next->e, true); - } - } - - for (i=0; iafdict_class[class]) - -#endif /* _LG_DICT_COMMON_H_ */ diff -Nru link-grammar-5.3.16/link-grammar/dict-file/dictionary.c link-grammar-5.5.0/link-grammar/dict-file/dictionary.c --- link-grammar-5.3.16/link-grammar/dict-file/dictionary.c 2017-01-27 22:07:11.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-file/dictionary.c 2018-04-23 19:49:07.000000000 +0000 @@ -11,23 +11,21 @@ /* */ /*************************************************************************/ -#include "anysplit.h" #include "api-structures.h" -#include "dict-api.h" -#include "dict-common.h" -#include "externs.h" -#include "idiom.h" -#include "pp_knowledge.h" +#include "dict-common/dict-affix.h" +#include "dict-common/dict-api.h" +#include "dict-common/dict-common.h" +#include "dict-common/dict-impl.h" +#include "dict-common/dict-utils.h" +#include "dict-common/file-utils.h" +#include "dict-common/idiom.h" +#include "dict-common/regex-morph.h" +#include "post-process/pp_knowledge.h" #include "read-dict.h" #include "read-regex.h" -#include "regex-morph.h" -#include "spellcheck.h" #include "string-set.h" -#include "structures.h" 
-#include "utilities.h" -#include "word-utils.h" -#include "dict-sql/read-sql.h" /* Temporary hack */ - +#include "tokenize/anysplit.h" // Initialize anysplit here ... +#include "tokenize/spellcheck.h" // Initialize spellcheck here ... /*************************************************************** * @@ -35,74 +33,12 @@ * ****************************************************************/ -/* Units will typically have a ".u" at the end. Get - * rid of it, as otherwise stripping is messed up. */ -static inline char * deinflect(const char * str) -{ - size_t len; - char *s; - char *p = strrchr(str, SUBSCRIPT_MARK); - if (!p || (p == str)) return strdup(str); - - len = p - str; - s = (char *)malloc(len + 1); - strncpy(s, str, len); - s[len] = '\0'; - return s; -} - -/* The affix dictionary is represented as a dynamically allocated array with - * an element for each class (connector type) in the affix file. Each element - * has a pointer to an array of strings which are the punctuation/affix - * names. */ - -const char * afdict_classname[] = { AFDICT_CLASSNAMES }; - -/** - * Find the affix table entry for given connector name. - * If the connector name is not in the table, return NULL. 
- */ -Afdict_class * afdict_find(Dictionary afdict, const char * con, bool notify_err) -{ - const char ** ac; - - for (ac = afdict_classname; - ac < &afdict_classname[ARRAY_SIZE(afdict_classname)]; ac++) - { - if (0 == strcmp(*ac, con)) - return &afdict->afdict_class[ac - afdict_classname]; - } - if (notify_err) { - prt_error("Warning: Unknown class name %s found near line %d of %s.\n" - "\tThis class name will be ignored.\n", - con, afdict->line_number, afdict->name); - } - return NULL; -} - -#define AFFIX_COUNT_MEM_INCREMENT 64 - -static void affix_list_add(Dictionary afdict, Afdict_class * ac, - const char * affix) -{ - if (NULL == ac) return; /* ignore unknown class name */ - if (ac->mem_elems <= ac->length) - { - size_t new_sz; - ac->mem_elems += AFFIX_COUNT_MEM_INCREMENT; - new_sz = ac->mem_elems * sizeof(const char *); - ac->string = (char const **) realloc((void *)ac->string, new_sz); - } - ac->string[ac->length] = string_set_add(affix, afdict->string_set); - ac->length++; -} - static void load_affix(Dictionary afdict, Dict_node *dn, int l) { Dict_node * dnx = NULL; for (; NULL != dn; dn = dnx) { - char *string; + const char *string; const char *con = word_only_connector(dn); if (NULL == con) { @@ -123,277 +59,28 @@ if (contains_underbar(dn->string)) { char *p; - string = strdup(dn->string); - p = string+1; + char *writeable_string = strdupa(dn->string); + p = writeable_string+1; while (*p != '_' && *p != '\0') p++; *p = '\0'; + string = writeable_string; } else { - string = deinflect(dn->string); + string = dn->string; } affix_list_add(afdict, afdict_find(afdict, con, /*notify_err*/true), string); - free(string); dnx = dn->left; - xfree((char *)dn, sizeof(Dict_node)); + free(dn); } } -#ifdef AFDICT_ORDER_NOT_PRESERVED -static int revcmplen(const void *a, const void *b) -{ - return strlen(*(char * const *)b) - strlen(*(char * const *)a); -} -#endif /* AFDICT_ORDER_NOT_PRESERVED */ - -/** - * Traverse the main dict in dictionary order, and extract all the 
suffixes - * and prefixes - every time we see a new suffix/prefix (the previous one is - * remembered by w_last), we save it in the corresponding affix-class list. - * The saved affixes don't include the infix mark. - */ -static void get_dict_affixes(Dictionary dict, Dict_node * dn, - char infix_mark, char * w_last) -{ - const char *w; /* current dict word */ - const char *w_sm; /* SUBSCRIPT_MARK position in the dict word */ - size_t w_len; /* length of the dict word */ - Dictionary afdict = dict->affix_table; - - if (dn == NULL) return; - get_dict_affixes(dict, dn->right, infix_mark, w_last); - - w = dn->string; - w_sm = strrchr(w, SUBSCRIPT_MARK); - w_len = (NULL == w_sm) ? strlen(w) : (size_t)(w_sm - w); - if (w_len > MAX_WORD) - { - prt_error("Error: word '%s' too long (%zd), program may malfunction\n", - w, w_len); - w_len = MAX_WORD; - } - /* (strlen(w_last) can be cached for speedup) */ - if ((strlen(w_last) != w_len) || (0 != strncmp(w_last, w, w_len))) - { - strncpy(w_last, w, w_len); - w_last[w_len] = '\0'; - - if (infix_mark == w_last[0]) - { - affix_list_add(afdict, &afdict->afdict_class[AFDICT_SUF], w_last+1); - } - else - if (infix_mark == w_last[w_len-1]) - { - w_last[w_len-1] = '\0'; - affix_list_add(afdict, &afdict->afdict_class[AFDICT_PRE], w_last); - w_last[w_len-1] = infix_mark; - } - } - - get_dict_affixes(dict, dn->left, infix_mark, w_last); -} - -/** - * Concatenate the definitions for the given affix class. - * This allows specifying the characters in different definitions - * instead in a one long string, e.g. instead of: - * ""«»《》【】『』`„": QUOTES+; - * One can specify (note the added spaces): - * """ «» 《》 【】 『』 ` „: QUOTES+; - * Or even: - * """: QUOTES+; - * «» : QUOTES+; - * etc. - * Note that if there are no definitions or only one definition, there is - * nothing to do. - * The result is written to the first entry. - * @param classno The given affix class. 
- */ -static void concat_class(Dictionary afdict, int classno) -{ - Afdict_class * ac; - size_t i; - dyn_str * qs; - - ac = AFCLASS(afdict, classno); - if (1 >= ac->length) return; - - qs = dyn_str_new(); - for (i = 0; i < ac->length; i++) - dyn_strcat(qs, ac->string[i]); - - ac->string[0] = string_set_add(qs->str, afdict->string_set); - dyn_str_delete(qs); -} - -/* Compare lengths of strings, for qsort */ -static int cmplen(const void *a, const void *b) -{ - const char * const *sa = a; - const char * const *sb = b; - return strlen(*sb) - strlen(*sa); -} - -/** - * Initialize several classes. - * In case of a future dynamic change of the affix table, this function needs to - * be invoked again after the affix table is re-constructed (changes may be - * needed - especially to first free memory and initialize the affix dict - * structure.). - */ -#define D_AI 11 -static bool afdict_init(Dictionary dict) -{ - Afdict_class * ac; - Dictionary afdict = dict->affix_table; - - /* FIXME: read_entry() builds word lists in reverse order (can we - * just create the list top-down without breaking anything?). Unless - * it is fixed to preserve the order, reverse here the word list for - * each affix class. 
*/ - for (ac = afdict->afdict_class; - ac < &afdict->afdict_class[ARRAY_SIZE(afdict_classname)]; ac++) - { - int i; - int l = ac->length - 1; - const char * t; - - for (i = 0; i < l; i++, l--) - { - t = ac->string[i]; - ac->string[i] = ac->string[l]; - ac->string[l] = t; - } - } - - /* Create the affix lists */ - ac = AFCLASS(afdict, AFDICT_INFIXMARK); - if ((1 < ac->length) || ((1 == ac->length) && (1 != strlen(ac->string[0])))) - { - prt_error("Error: afdict_init: Invalid value for class %s in file %s" - " (should have been one ASCII punctuation - ignored)\n", - afdict_classname[AFDICT_INFIXMARK], afdict->name); - free((void *)ac->string); - ac->length = 0; - ac->mem_elems = 0; - ac->string = NULL; - } - /* XXX For now there is a possibility to use predefined SUF and PRE lists. - * So if SUF or PRE are defined, don't extract any of them from the dict. */ - if (1 == ac->length) - { - if ((0 == AFCLASS(afdict, AFDICT_PRE)->length) && - (0 == AFCLASS(afdict, AFDICT_SUF)->length)) - { - char last_entry[MAX_WORD+1] = ""; - get_dict_affixes(dict, dict->root, ac->string[0][0], last_entry); - } - } - else - { - /* No INFIX_MARK - create a dummy one that always mismatches */ - affix_list_add(afdict, &afdict->afdict_class[AFDICT_INFIXMARK], ""); - } - - if (verbosity_level(+D_AI)) - { - size_t l; - - for (ac = afdict->afdict_class; - ac < &afdict->afdict_class[ARRAY_SIZE(afdict_classname)]; ac++) - { - if (0 == ac->length) continue; - lgdebug(+0, "Class %s, %zd items:", - afdict_classname[ac-afdict->afdict_class], ac->length); - for (l = 0; l < ac->length; l++) - lgdebug(0, " '%s'", ac->string[l]); - lgdebug(0, "\n"); - } - } -#undef D_AI - - /* Store the SANEMORPHISM regex in the unused (up to now) - * regex_root element of the affix dictionary, and precompile it */ - assert(NULL == afdict->regex_root, "SM regex is already assigned"); - ac = AFCLASS(afdict, AFDICT_SANEMORPHISM); - if (0 != ac->length) - { - int rc; - - Regex_node *sm_re = malloc(sizeof(*sm_re)); - dyn_str 
*rebuf = dyn_str_new(); - - /* The regex used to be converted to: ^((original-regex)b)+$ - * In the initial wordgraph version word boundaries are not supported, - * so instead it is converted to: ^(original-regex)+$ */ -#ifdef WORD_BOUNDARIES - dyn_strcat(rebuf, "^(("); -#else - dyn_strcat(rebuf, "^("); -#endif - dyn_strcat(rebuf, ac->string[0]); -#ifdef WORD_BOUNDARIES - dyn_strcat(rebuf, ")b)+$"); -#else - dyn_strcat(rebuf, ")+$"); -#endif - sm_re->pattern = strdup(rebuf->str); - dyn_str_delete(rebuf); - - afdict->regex_root = sm_re; - sm_re->name = strdup(afdict_classname[AFDICT_SANEMORPHISM]); - sm_re->re = NULL; - sm_re->next = NULL; - sm_re->neg = false; - rc = compile_regexs(afdict->regex_root, afdict); - if (rc) { - prt_error("Error: afdict_init: Failed to compile " - "regex '%s' in file %s, return code %d\n", - afdict_classname[AFDICT_SANEMORPHISM], afdict->name, rc); - return false; - } - lgdebug(+5, "%s regex %s\n", - afdict_classname[AFDICT_SANEMORPHISM], sm_re->pattern); - } - - /* sort the UNITS list */ - /* Longer unit names must get split off before shorter ones. - * This prevents single-letter splits from screwing things - * up. e.g. split 7gram before 7am before 7m - */ - ac = AFCLASS(afdict, AFDICT_UNITS); - if (0 < ac->length) - { - qsort(ac->string, ac->length, sizeof(char *), cmplen); - } - -#ifdef AFDICT_ORDER_NOT_PRESERVED - /* pre-sort the MPRE list */ - ac = AFCLASS(afdict, AFDICT_MPRE); - if (0 < ac->length) - { - /* Longer subwords have priority over shorter ones, - * reverse-sort by length. - * XXX mprefix_split() for Hebrew depends on that. */ - qsort(ac->string, ac->length, sizeof(char *), revcmplen); - } -#endif /* AFDICT_ORDER_NOT_PRESERVED */ - - concat_class(afdict, AFDICT_QUOTES); - concat_class(afdict, AFDICT_BULLETS); - - if (! 
anysplit_init(afdict)) return false; - - return true; -} - static void free_llist(Dictionary dict, Dict_node *llist) { - free_lookup(llist); + file_free_lookup(llist); } /** @@ -423,9 +110,8 @@ { const char * t; Dictionary dict; - Dict_node *dict_node; - dict = (Dictionary) xalloc(sizeof(struct Dictionary_s)); + dict = (Dictionary) malloc(sizeof(struct Dictionary_s)); memset(dict, 0, sizeof(struct Dictionary_s)); /* Language and file-name stuff */ @@ -436,6 +122,9 @@ lgdebug(D_USER_FILES, "Debug: Language: %s\n", dict->lang); dict->name = string_set_add(dict_name, dict->string_set); + memset(dict->current_idiom, 'A', IDIOM_LINK_SZ-1); + dict->current_idiom[IDIOM_LINK_SZ-1] = 0; + /* * A special setup per dictionary type. The check here assumes the affix * dictionary name contains "affix". FIXME: For not using this @@ -452,32 +141,28 @@ #endif dict->insert_entry = insert_list; - dict->lookup_list = lookup_list; + dict->lookup_list = file_lookup_list; + dict->lookup_wild = file_lookup_wild; dict->free_lookup = free_llist; - dict->lookup = boolean_lookup; + dict->lookup = file_boolean_lookup; + dict->contable.num_con = 1<<13; } else { /* * Affix dictionary. */ - size_t i; - + afclass_init(dict); dict->insert_entry = load_affix; dict->lookup = return_true; - - /* initialize the class table */ - dict->afdict_class = - malloc(sizeof(*dict->afdict_class) * ARRAY_SIZE(afdict_classname)); - for (i = 0; i < ARRAY_SIZE(afdict_classname); i++) - { - dict->afdict_class[i].mem_elems = 0; - dict->afdict_class[i].length = 0; - dict->afdict_class[i].string = NULL; - } + dict->contable.num_con = 1<<9; } dict->affix_table = NULL; + dict->contable.size = 0; + dict->contable.length_limit_def = NULL; + dict->contable.length_limit_def_next = &dict->contable.length_limit_def; + /* Read dictionary from the input string. */ dict->input = input; dict->pin = dict->input; @@ -500,55 +185,7 @@ return dict; } - /* Get the locale for the dictionary. 
The first one of the - * following which exists, is used: - * 1. The locale which is defined in the dictionary. - * 2. The locale from the environment. - * 3. On Windows - the user's default locale. - * NULL is returned if the locale is not valid. - * Note: - * If we don't have locale_t, as a side effect of checking the locale - * it is set as the program's locale (as desired). However, in that - * case if it is not valid and this is the first dictionary which is - * opened, the program's locale may remain the initial one, i.e. "C" - * (unless the API user changed it). */ - dict->locale = linkgrammar_get_dict_locale(dict); - - /* If the program's locale doesn't have a UTF-8 codeset (e.g. it is - * "C", or because the API user has set it incorrectly) set it to one - * that has it. */ - set_utf8_program_locale(); - - /* If the dictionary locale couldn't be established - then set - * dict->locale so that it is consistent with the current program's - * locale. It will be used as the intended locale of this - * dictionary, and the locale of the compiled regexs. */ - if (NULL == dict->locale) - { - dict->locale = setlocale(LC_CTYPE, NULL); - prt_error("Warning: Couldn't set dictionary locale! " - "Using current program locale \"%s\"\n", dict->locale); - } - - /* setlocale() returns a string owned by the system. Copy it. */ - dict->locale = string_set_add(dict->locale, dict->string_set); - - -#ifdef HAVE_LOCALE_T - /* Since linkgrammar_get_dict_locale() (which is called above) - * validates the locale, the following call is guaranteed to succeed. */ - dict->lctype = newlocale_LC_CTYPE(dict->locale); - - /* If dict->locale is still not set, there is a bug. - * Without this assert(), the program may SEGFAULT when it - * uses the isw*() functions. */ - assert((locale_t) 0 != dict->lctype, "Dictionary locale is not set."); -#else - dict->lctype = 0; -#endif /* HAVE_LOCALE_T */ - - /* setlocale() returns a string owned by the system. Copy it. 
*/ - dict->locale = string_set_add(dict->locale, dict->string_set); + dictionary_setup_locale(dict); dict->affix_table = dictionary_six(lang, affix_name, NULL, NULL, NULL, NULL); if (dict->affix_table == NULL) @@ -559,6 +196,9 @@ if (! afdict_init(dict)) goto failure; + if (! anysplit_init(dict->affix_table)) + goto failure; + /* * Process the regex file. * We have to compile regexs using the dictionary locale, @@ -574,6 +214,7 @@ if (compile_regexs(dict->regex_root, dict)) { locale = setlocale(LC_CTYPE, locale); /* Restore the locale. */ + assert(NULL != locale, "Cannot restore program locale"); goto failure; } locale = setlocale(LC_CTYPE, locale); /* Restore the locale. */ @@ -583,27 +224,17 @@ dict->corpus = lg_corpus_new(); #endif - dict->left_wall_defined = boolean_dictionary_lookup(dict, LEFT_WALL_WORD); - dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD); - dict->base_knowledge = pp_knowledge_open(pp_name); - if (NULL == dict->base_knowledge) goto failure; dict->hpsg_knowledge = pp_knowledge_open(cons_name); - if (NULL == dict->hpsg_knowledge) goto failure; - dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD); - dict->use_unknown_word = true; + sort_condesc_by_uc_constring(dict); + dictionary_setup_defines(dict); - dict->shuffle_linkages = false; - if (0 == strcmp(dict->lang, "any") || NULL != dict->affix_table->anysplit) + // Special-case hack. + if ((0 == strncmp(dict->lang, "any", 3)) || + (NULL != dict->affix_table->anysplit)) dict->shuffle_linkages = true; - dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD); - if (dict_node != NULL) - dict->unlimited_connector_set = connector_set_create(dict_node->exp); - - free_lookup(dict_node); - return dict; failure: @@ -673,10 +304,15 @@ } +#ifdef USE_VITERBI /** * Use "string" as the input dictionary. All of the other parts, * including post-processing, affix table, etc, are NULL. * This routine is intended for unit-testing ONLY. + * + * FIXME? 
(Used only by code that is in use for now.) + * 1. get_default_locale() returns locale, not language. + * 2. "lang" memory leak on success. */ Dictionary dictionary_create_from_utf8(const char * input) { @@ -698,4 +334,5 @@ return dictionary; } +#endif // USE_VITERBI diff -Nru link-grammar-5.3.16/link-grammar/dict-file/read-dict.c link-grammar-5.5.0/link-grammar/dict-file/read-dict.c --- link-grammar-5.3.16/link-grammar/dict-file/read-dict.c 2017-01-27 22:01:05.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-file/read-dict.c 2018-04-25 17:04:50.000000000 +0000 @@ -13,230 +13,21 @@ #include -#include "build-disjuncts.h" -#include "dict-common.h" +#include "dict-common/dict-affix.h" // For is_stem() +#include "dict-common/dict-common.h" +#include "dict-common/dict-defines.h" // For SUBSCRIPT_MARK +#include "dict-common/file-utils.h" +#include "dict-common/idiom.h" +#include "dict-common/regex-morph.h" #include "error.h" -#include "print.h" +#include "print/print.h" #include "externs.h" -#include "idiom.h" #include "read-dict.h" -#include "regex-morph.h" #include "string-set.h" +#include "tokenize/tok-structures.h" // needed for MT_WALL #include "utilities.h" #include "word-file.h" -/** - * Format the given locale for use in setlocale(). - * POSIX systems and Windows use different conventions. - * On Windows, convert to full language and territory names, because the - * short ones doesn't work for some reason on every system (including MinGW). - * @param dict Used for putting the returned value in a string-set. - * @param ll Locale 2-letter language code. - * @param cc Locale 2-letter territory code. - * @return The formatted locale, directly usable in setlocale(). 
- */ -static const char * format_locale(Dictionary dict, - const char *ll, const char *cc) -{ - unsigned char *locale_ll = (unsigned char *)strdupa(ll); - unsigned char *locale_cc = (unsigned char *)strdupa(cc); - - for (unsigned char *p = locale_ll; '\0' != *p; p++) *p = tolower(*p); - for (unsigned char *p = locale_cc; '\0' != *p; p++) *p = toupper(*p); - -#ifdef _WIN32 - const int locale_size = strlen(ll) + 1 + strlen(cc) + 1; - char *locale = alloca(locale_size); - snprintf(locale, locale_size, "%s-%s", locale_ll, locale_cc); - - wchar_t wlocale[LOCALE_NAME_MAX_LENGTH]; - wchar_t wtmpbuf[LOCALE_NAME_MAX_LENGTH]; - char tmpbuf[LOCALE_NAME_MAX_LENGTH]; - char locale_buf[LOCALE_NAME_MAX_LENGTH]; - size_t r; - - r = mbstowcs(wlocale, locale, LOCALE_NAME_MAX_LENGTH); - if ((size_t)-1 == r) - { - prt_error("Error: Error converting %s to wide character.\n", locale); - return NULL; - } - wlocale[LOCALE_NAME_MAX_LENGTH-1] = L'\0'; - - if (0 >= GetLocaleInfoEx(wlocale, LOCALE_SENGLISHLANGUAGENAME, - wtmpbuf, LOCALE_NAME_MAX_LENGTH)) - { - prt_error("Error: GetLocaleInfoEx LOCALE_SENGLISHLANGUAGENAME Locale=%s: \n" - "Error %d", locale, (int)GetLastError()); - return NULL; - } - r = wcstombs(tmpbuf, wtmpbuf, LOCALE_NAME_MAX_LENGTH); - if ((size_t)-1 == r) - { - prt_error("Error: Error converting locale language from wide character.\n"); - return NULL; - } - tmpbuf[LOCALE_NAME_MAX_LENGTH-1] = '\0'; - if (0 == strncmp(tmpbuf, "Unknown", 7)) - { - prt_error("Error: Unknown territory code in locale \"%s\"\n", locale); - return NULL; - } - strcpy(locale_buf, tmpbuf); - strcat(locale_buf, "_"); - - if (0 >= GetLocaleInfoEx(wlocale, LOCALE_SENGLISHCOUNTRYNAME, - wtmpbuf, LOCALE_NAME_MAX_LENGTH)) - { - prt_error("Error: GetLocaleInfoEx LOCALE_SENGLISHCOUNTRYNAME Locale=%s: \n" - "Error %d", locale, (int)GetLastError()); - return NULL; - } - r = wcstombs(tmpbuf, wtmpbuf, LOCALE_NAME_MAX_LENGTH); - if ((size_t)-1 == r) - { - prt_error("Error: Error converting locale territory from 
wide character.\n"); - return NULL; - } - tmpbuf[LOCALE_NAME_MAX_LENGTH-1] = '\0'; - if (0 == strncmp(tmpbuf, "Unknown", 7)) - { - prt_error("Error: Unknown territory code in locale \"%s\"\n", locale); - return NULL; - } - locale = strcat(locale_buf, tmpbuf); -#else /* Assuming POSIX */ - const int locale_size = strlen(ll) + 1 + strlen(cc) + sizeof(".UTF-8"); - char *locale = alloca(locale_size); - snprintf(locale, locale_size, "%s_%s.UTF-8", locale_ll, locale_cc); -#endif - - return string_set_add(locale, dict->string_set); -} - -/** - * Return a locale for the given dictionary, in the OS format. - * - If is defined, use it. - * - Else use the locale from the environment. - * - On Windows, if no environment locale use the default locale. - * - * : LL4cc+; - * LL is the ISO639 language code in uppercase, - * cc is the ISO3166 territory code in lowercase. - * This particular capitalization is needed for the value to be a - * valid LG connector. - * For transliterated dictionaries: - * : C+; - * - * @param dict The dictionary for which the locale is needed. - * @return The locale, in a format suitable for use by setlocale(). 
- */ -const char * linkgrammar_get_dict_locale(Dictionary dict) -{ - if (dict->locale) return dict->locale; - - const char *locale; - Dict_node *dn = lookup_list(dict, ""); - - if (NULL == dn) - { - lgdebug(D_USER_FILES, "Debug: Dictionary '%s': Locale is not defined.\n", - dict->name); - goto locale_error; - } - - if (0 == strcmp(dn->exp->u.string, "C")) - { - locale = string_set_add("C", dict->string_set); - } - else - { - char c; - char locale_ll[4], locale_cc[3]; - int locale_numelement = sscanf(dn->exp->u.string, "%3[A-Z]4%2[a-z]%c", - locale_ll, locale_cc, &c); - if (2 != locale_numelement) - { - prt_error("Error: \": %s\" " - "should be in the form LL4cc+\n" - "\t(LL: language code; cc: territory code) " - "\tor C+ for transliterated dictionaries.\n", - dn->exp->u.string); - goto locale_error; - } - - locale = format_locale(dict, locale_ll, locale_cc); - - if (!try_locale(locale)) - { - prt_error("Debug: Dictionary \"%s\": Locale \"%s\" unknown\n", - dict->name, locale); - goto locale_error; - } - } - - free_lookup(dn); - lgdebug(D_USER_FILES, "Debug: Dictionary locale: \"%s\"\n", locale); - dict->locale = locale; - return locale; - -locale_error: - { - free_lookup(dn); - - const char *locale = get_default_locale(); - if (NULL == locale) return NULL; - const char *sslocale = string_set_add(locale, dict->string_set); - free((void *)locale); - prt_error("Info: Dictionary '%s': No locale definition - " - "\"%s\" will be used.\n", dict->name, sslocale); - if (!try_locale(sslocale)) - { - lgdebug(D_USER_FILES, "Debug: Unknown locale \"%s\"...\n", sslocale); - return NULL; - } - return sslocale; - } -} - -const char * linkgrammar_get_version(void) -{ - const char *s = "link-grammar-" LINK_VERSION_STRING; - return s; -} - -const char * linkgrammar_get_dict_version(Dictionary dict) -{ - char * ver; - char * p; - Dict_node *dn; - Exp *e; - - if (dict->version) return dict->version; - - /* The newer dictionaries should contain a macro of the form: - * : V4v6v6+; - * 
which would indicate dictionary version 4.6.6 - * Older dictionaries contain no version info. - */ - dn = lookup_list(dict, ""); - if (NULL == dn) return "[unknown]"; - - e = dn->exp; - ver = strdup(&e->u.string[1]); - p = strchr(ver, 'v'); - while (p) - { - *p = '.'; - p = strchr(p+1, 'v'); - } - - free_lookup(dn); - dict->version = string_set_add(ver, dict->string_set); - free(ver); - return dict->version; -} - /* The dictionary format: @@ -337,28 +128,41 @@ if (dict->recursive_error) return; dict->recursive_error = true; + char token[MAX_TOKEN_LENGTH]; + strcpy(token, dict->token); + bool save_is_special = dict->is_special; + const char * save_input = dict->input; + const char * save_pin = dict->pin; + int save_already_got_it = dict->already_got_it; + int save_line_number = dict->line_number; + tokens[0] = '\0'; for (i=0; i<5 && dict->token[0] != '\0' ; i++) { pos += snprintf(t, ERRBUFLEN, "\"%s\" ", dict->token); strncat(tokens, t, ERRBUFLEN-1-pos); - link_advance(dict); + if (!link_advance(dict)) break; } tokens[pos] = '\0'; + strcpy(dict->token, token); + dict->is_special = save_is_special; + dict->input = save_input; + dict->pin = save_pin; + dict->already_got_it = save_already_got_it; + dict->line_number = save_line_number; + if (s2) { - err_msg(lg_Error, "Error parsing dictionary %s.\n" - "%s %s\n\t line %d, tokens = %s", - dict->name, - s, s2, dict->line_number, tokens); + prt_error("Error: While parsing dictionary %s:\n" + "%s %s\n\t Line %d, next tokens: %s\n", + dict->name, s, s2, dict->line_number, tokens); } else { - err_msg(lg_Error, "Error parsing dictionary %s.\n" - "%s\n\t line %d, tokens = %s", - dict->name, - s, dict->line_number, tokens); + prt_error("Error: While parsing dictionary %s:\n" + "%s\n\t Line %d, next tokens: %s\n", + dict->name, s, dict->line_number, tokens); } dict->recursive_error = false; } @@ -370,8 +174,8 @@ static void warning(Dictionary dict, const char * s) { - err_msg(lg_Warn, "Warning: %s\n" - "\tline %d, current token = 
\"%s\"", + prt_error("Warning: %s\n" + "\tline %d, current token = \"%s\"\n", s, dict->line_number, dict->token); } @@ -380,10 +184,10 @@ * If we're in quote mode, it does not consider the % character for * comments. Note that the returned character is a wide character! */ -typedef char* utf8char; -static utf8char get_character(Dictionary dict, int quote_mode) +#define MAXUTFLEN 7 +typedef char utf8char[MAXUTFLEN]; +static bool get_character(Dictionary dict, int quote_mode, utf8char uc) { - static char uc[7]; int i = 0; while (1) @@ -393,7 +197,18 @@ /* Skip over all comments */ if ((c == '%') && (!quote_mode)) { + if (0 == strncmp(dict->pin, SUPPRESS, sizeof(SUPPRESS)-1)) + { + const char *nl = strchr(dict->pin + sizeof(SUPPRESS)-1, '\n'); + if (NULL != nl) + { + dict->suppress_warning = + strndup(dict->pin + sizeof(SUPPRESS)-1, + nl - dict->pin - sizeof(SUPPRESS) + 1); + } + } while ((c != 0x0) && (c != '\n')) c = *(dict->pin++); + if (c == 0x0) break; dict->line_number++; continue; } @@ -407,12 +222,12 @@ { uc[0] = c; uc[1] = 0x0; - return uc; + return true; } uc[0] = c; i = 1; - while (i < 6) + while (i < MAXUTFLEN-1) { c = *(dict->pin++); /* If we're onto the next char, we're done. 
*/ @@ -420,16 +235,16 @@ { dict->pin--; uc[i] = 0x0; - return uc; + return true; } uc[i] = c; i++; } dict_error(dict, "UTF8 char is too long"); - return NULL; + return false; } uc[0] = 0x0; - return uc; + return true; } @@ -475,7 +290,7 @@ if (dict->already_got_it == EOF) { dict->token[0] = '\0'; } else { - dict->token[0] = dict->already_got_it; /* specials are one byte */ + dict->token[0] = (char)dict->already_got_it; /* specials are one byte */ dict->token[1] = '\0'; } dict->already_got_it = '\0'; @@ -484,8 +299,8 @@ do { - c = get_character(dict, false); - if (NULL == c) return false; + bool ok = get_character(dict, false, c); + if (!ok) return false; } while (lg_isspace(c[0])); @@ -504,7 +319,6 @@ (*dict->pin == ':' || *dict->pin == ';' || lg_isspace(*dict->pin))) { - quote_mode = false; dict->token[i] = '\0'; return true; } @@ -512,6 +326,11 @@ dict_error(dict, "White space inside of token"); return false; } + if (c[0] == '\0') + { + dict_error(dict, "EOF while reading quoted token"); + return false; + } nr = 0; while (c[nr]) {dict->token[i] = c[nr]; i++; nr++; } @@ -549,10 +368,10 @@ while (c[nr]) {dict->token[i] = c[nr]; i++; nr++; } } } - c = get_character(dict, quote_mode); - if (NULL == c) return false; + bool ok = get_character(dict, quote_mode, c); + if (!ok) return false; } - return true; + /* unreachable */ } /** @@ -584,16 +403,16 @@ return false; } if (*s == '@') s++; - if (!isupper((int)*s) && ('h' != *s) && ('d' != *s)) { + if (!isupper((unsigned char)*s) && ('h' != *s) && ('d' != *s)) { dict_error(dict, "The first letter of a connector must be h,d or uppercase."); return false; } - if ((*s == 'I') && (*(s+1) == 'D')) { + if ((*s == 'I') && (*(s+1) == 'D') && isupper((unsigned char)*(s+2))) { dict_error(dict, "Connectors beginning with \"ID\" are forbidden"); return false; } while (*(s+1)) { - if ((!isalnum((int)*s)) && (*s != WILD_TYPE)) { + if ((!isalnum((unsigned char)*s)) && (*s != WILD_TYPE)) { dict_error(dict, "All letters of a connector 
must be ASCII alpha-numeric."); return false; } @@ -657,7 +476,7 @@ /* terse version */ /* If one word contains a dot, the other one must also! */ -static inline int dict_order_strict(const char *s, Dict_node * dn) +static inline int dict_order_strict(const char *s, const Dict_node * dn) { const char * t = dn->string; while (*s != '\0' && *s == *t) {s++; t++;} @@ -707,7 +526,7 @@ * you come to the end of one of them, or until you find unequal * characters. A "*" matches anything before the subscript mark. * Otherwise, replace SUBSCRIPT_MARK by "\0", and take the difference. - * his behavior matches that of the function dict_order_bare(). + * This behavior matches that of the function dict_order_bare(). */ #define D_DOW 6 static inline int dict_order_wild(const char * s, const Dict_node * dn) @@ -728,48 +547,27 @@ /** * dict_match -- return true if strings match, else false. * A "bare" string (one without a subscript) will match any corresponding - * string with a suffix; so, for example, "make" and "make.n" are + * string with a subscript; so, for example, "make" and "make.n" are * a match. If both strings have subscripts, then the subscripts must match. * * A subscript is the part that follows the SUBSCRIPT_MARK. */ static bool dict_match(const char * s, const char * t) { - char *ds, *dt; - ds = strrchr(s, SUBSCRIPT_MARK); - dt = strrchr(t, SUBSCRIPT_MARK); - -#if SUBSCRIPT_MARK == '.' - /* a dot at the end or a dot followed by a number is NOT - * considered a subscript */ - if ((dt != NULL) && ((*(dt+1) == '\0') || - (isdigit((int)*(dt+1))))) dt = NULL; - if ((ds != NULL) && ((*(ds+1) == '\0') || - (isdigit((int)*(ds+1))))) ds = NULL; -#endif + while ((*s != '\0') && (*s == *t)) { s++; t++; } - /* dt is NULL when there's no prefix ... */ - if (dt == NULL && ds != NULL) { - if (((int)strlen(t)) > (ds-s)) return false; /* we need to do this to ensure that */ - return (strncmp(s, t, ds-s) == 0); /* "i.e." 
does not match "i.e" */ - } else if (dt != NULL && ds == NULL) { - if (((int)strlen(s)) > (dt-t)) return false; - return (strncmp(s, t, dt-t) == 0); - } else { - return (strcmp(s, t) == 0); - } + if (*s == *t) return true; /* both are '\0' */ + if ((*s == 0) && (*t == SUBSCRIPT_MARK)) return true; + if ((*s == SUBSCRIPT_MARK) && (*t == 0)) return true; + + return false; } /* ======================================================================== */ static inline Dict_node * dict_node_new(void) { - return (Dict_node*) xalloc(sizeof(Dict_node)); -} - -static inline void free_dict_node(Dict_node *dn) -{ - xfree((char *)dn, sizeof(Dict_node)); + return (Dict_node*) malloc(sizeof(Dict_node)); } /** @@ -794,7 +592,7 @@ } else { - free_dict_node(dn); + free(dn); } } @@ -863,7 +661,7 @@ } /** - * lookup_list() - return list of words in the file-backed dictionary. + * file_lookup_list() - return list of words in the file-backed dictionary. * * Returns a pointer to a lookup list of the words in the dictionary. * Matches include words that appear in idioms. To exclude idioms, use @@ -872,9 +670,9 @@ * This list is made up of Dict_nodes, linked by their right pointers. * The node, file and string fields are copied from the dictionary. * - * The returned list must be freed with free_lookup(). + * The returned list must be freed with file_free_lookup(). 
*/ -Dict_node * lookup_list(const Dictionary dict, const char *s) +Dict_node * file_lookup_list(const Dictionary dict, const char *s) { Dict_node * llist = rdictionary_lookup(NULL, dict->root, s, true, dict_order_bare); @@ -882,21 +680,21 @@ return llist; } -bool boolean_lookup(Dictionary dict, const char *s) +bool file_boolean_lookup(Dictionary dict, const char *s) { - Dict_node *llist = lookup_list(dict, s); + Dict_node *llist = file_lookup_list(dict, s); bool boool = (llist != NULL); - free_lookup(llist); + file_free_lookup(llist); return boool; } -void free_lookup(Dict_node *llist) +void file_free_lookup(Dict_node *llist) { Dict_node * n; while (llist != NULL) { n = llist->right; - free_dict_node(llist); + free(llist); llist = n; } } @@ -907,12 +705,16 @@ while (ilist != NULL) { n = ilist->left; - free_dict_node(ilist); + free(ilist); ilist = n; } } -static Dict_node * dictionary_lookup_wild(Dictionary dict, const char *s) +/** + * file_lookup_wild -- allows for wildcard searches (globs) + * Used to support the !! command in the parser command-line tool. + */ +Dict_node * file_lookup_wild(Dictionary dict, const char *s) { bool lookup_idioms = test_enabled("lookup-idioms"); char * ds = strrchr(s, SUBSCRIPT_DOT); /* Only the rightmost dot is a @@ -943,9 +745,9 @@ * This list is made up of Dict_nodes, linked by their right pointers. * The node, file and string fields are copied from the dictionary. * - * The returned list must be freed with free_lookup(). + * The returned list must be freed with file_free_lookup(). */ -Dict_node * abridged_lookup_list(const Dictionary dict, const char *s) +static Dict_node * abridged_lookup_list(const Dictionary dict, const char *s) { Dict_node *llist; llist = rdictionary_lookup(NULL, dict->root, s, false, dict_order_bare); @@ -953,32 +755,47 @@ return llist; } +/** + * strict_lookup_list() - return exact match in the dictionary + * + * Returns a pointer to a lookup list of the words in the dictionary. 
+ * Excludes any idioms that contain the word. + * + * This list is made up of Dict_nodes, linked by their right pointers. + * The node, file and string fields are copied from the dictionary. + * + * The list normally has 0 or 1 elements, unless the given word + * appears more than once in the dictionary. + * + * The returned list must be freed with file_free_lookup(). + */ +static Dict_node * strict_lookup_list(const Dictionary dict, const char *s) +{ + Dict_node *llist; + llist = rdictionary_lookup(NULL, dict->root, s, false, dict_order_strict); + llist = prune_lookup_list(llist, s); + return llist; +} + /* ======================================================================== */ /** * Allocate a new Exp node and link it into the exp_list for freeing later. */ Exp * Exp_create(Exp_list *eli) { - Exp * e; - e = (Exp *) xalloc(sizeof(Exp)); + Exp * e = malloc(sizeof(Exp)); e->next = eli->exp_list; eli->exp_list = e; return e; } -static inline void exp_free(Exp * e) -{ - xfree((char *)e, sizeof(Exp)); -} - /** * This creates a node with zero children. Initializes * the cost to zero. 
*/ static Exp * make_zeroary_node(Exp_list * eli) { - Exp * n; - n = Exp_create(eli); + Exp * n = Exp_create(eli); n->type = AND_type; /* these must be AND types */ n->cost = 0.0; n->u.l = NULL; @@ -995,7 +812,7 @@ n = Exp_create(eli); n->type = AND_type; /* these must be AND types */ n->cost = 0.0; - n->u.l = (E_list *) xalloc(sizeof(E_list)); + n->u.l = (E_list *) malloc(sizeof(E_list)); n->u.l->next = NULL; n->u.l->e = e; return n; @@ -1014,8 +831,8 @@ n->type = AND_type; n->cost = 0.0; - n->u.l = ell = (E_list *) xalloc(sizeof(E_list)); - ell->next = elr = (E_list *) xalloc(sizeof(E_list)); + n->u.l = ell = (E_list *) malloc(sizeof(E_list)); + ell->next = elr = (E_list *) malloc(sizeof(E_list)); elr->next = NULL; ell->e = nl; @@ -1036,8 +853,8 @@ n->type = OR_type; n->cost = 0.0; - n->u.l = ell = (E_list *) xalloc(sizeof(E_list)); - ell->next = elr = (E_list *) xalloc(sizeof(E_list)); + n->u.l = ell = (E_list *) malloc(sizeof(E_list)); + ell->next = elr = (E_list *) malloc(sizeof(E_list)); elr->next = NULL; ell->e = nl; @@ -1064,18 +881,24 @@ static Exp * make_dir_connector(Dictionary dict, int i) { Exp* n = Exp_create(&dict->exp_list); + char *constring; + n->dir = dict->token[i]; dict->token[i] = '\0'; /* get rid of the + or - */ if (dict->token[0] == '@') { - n->u.string = string_set_add(dict->token+1, dict->string_set); + constring = dict->token+1; n->multi = true; } else { - n->u.string = string_set_add(dict->token, dict->string_set); + constring = dict->token; n->multi = false; } + + n->u.condesc = condesc_add(&dict->contable, + string_set_add(constring, dict->string_set)); + if (NULL == n->u.condesc) return NULL; /* Table ovf */ n->type = CONNECTOR_type; n->cost = 0.0; return n; @@ -1108,14 +931,14 @@ } if (dn == NULL) { - free_lookup(dn_head); - dict_error(dict, "\nPerhaps missing + or - in a connector.\n" + file_free_lookup(dn_head); + dict_error(dict, "Perhaps missing + or - in a connector.\n" "Or perhaps you forgot the subscript on a word.\n" - "Or 
perhaps a word is used before it is defined.\n"); + "Or perhaps a word is used before it is defined."); return NULL; } n = make_unary_node(&dict->exp_list, dn->exp); - free_lookup(dn_head); + file_free_lookup(dn_head); } else { @@ -1128,6 +951,7 @@ { /* A simple, unidirectional connector. Just make that. */ n = make_dir_connector(dict, i); + if (NULL == n) return NULL; } else if (dict->token[i] == ANY_DIR) { @@ -1136,8 +960,10 @@ * Make both a + and a - version, and or them together. */ dict->token[i] = '+'; plu = make_dir_connector(dict, i); + if (NULL == plu) return NULL; dict->token[i] = '-'; min = make_dir_connector(dict, i); + if (NULL == min) return NULL; n = make_or_node(&dict->exp_list, plu, min); } @@ -1150,7 +976,7 @@ if (!link_advance(dict)) { - exp_free(n); + free(n); return NULL; } return n; @@ -1169,6 +995,7 @@ Exp *zn, *an; E_list *elist, *flist; Exp_list eli = { NULL }; + const char *ZZZ = string_set_add(EMPTY_CONNECTOR, dict->string_set); /* The left-wall already has ZZZ-. The right-wall will not arrive here. */ if (MT_WALL == x->word->morpheme_type) return; @@ -1184,19 +1011,19 @@ /* zn points at {ZZZ+} */ zn = Exp_create(&eli); zn->dir = '+'; - zn->u.string = string_set_add(EMPTY_CONNECTOR, dict->string_set); + zn->u.condesc = condesc_add(&dict->contable, ZZZ); zn->multi = false; zn->type = CONNECTOR_type; zn->cost = 0.0; zn = make_optional_node(&eli, zn); /* flist is plain-word-exp */ - flist = (E_list *) xalloc(sizeof(E_list)); + flist = (E_list *) malloc(sizeof(E_list)); flist->next = NULL; flist->e = x->exp; /* elist is {ZZZ+} , (plain-word-exp) */ - elist = (E_list *) xalloc(sizeof(E_list)); + elist = (E_list *) malloc(sizeof(E_list)); elist->next = flist; elist->e = zn; @@ -1214,15 +1041,14 @@ /** * Return true if the string is a (floating point) number. - * Float points can be preceeded by a single plus or minus sign. + * Float points can be proceeded by a single plus or minus sign. 
*/ static bool is_number(const char * str) { if ('+' == str[0] || '-' == str[0]) str++; - if (strspn(str, "0123456789.") == strlen(str)) - return true; + size_t numlen = strspn(str, "0123456789."); - return false; + return str[numlen] == '\0'; } /* ======================================================================== */ @@ -1247,7 +1073,7 @@ n->cost = 0.0; elist = &first; while((!is_equal(dict, ')')) && (!is_equal(dict, ']')) && (!is_equal(dict, '}'))) { - elist->next = (E_list *) xalloc(sizeof(E_list)); + elist->next = (E_list *) malloc(sizeof(E_list)); elist = elist->next; elist->next = NULL; elist->e = expression(dict); @@ -1614,20 +1440,33 @@ /* ======================================================================== */ /** * Insert the new node into the dictionary below node n. - * Give error message if the new element's string is already there. - * Assumes that the "n" field of new is already set, and the left - * and right fields of it are NULL. + * "newnode" left and right fields are NULL, and its string is already + * there. If the string is already found in the dictionary, give an error + * message and effectively ignore it. * * The resulting tree is highly unbalanced. It needs to be rebalanced * before being used. The DSW algo below is ideal for that. */ Dict_node * insert_dict(Dictionary dict, Dict_node * n, Dict_node * newnode) { - int comp; - if (NULL == n) return newnode; - comp = dict_order_strict(newnode->string, n); + static Exp null_exp = { .type = AND_type, .u.l = NULL }; + int comp = dict_order_strict(newnode->string, n); + if (0 == comp && + /* Suppress reporting duplicate idioms until they are fixed. */ + (!contains_underbar(newnode->string) || test_enabled("dup-idioms"))) + { + char t[80+MAX_TOKEN_LENGTH]; + snprintf(t, sizeof(t), + "Ignoring word \"%s\", which has been multiply defined:", + newnode->string); + dict_error(dict, t); + /* Too late to skip insertion - insert it with a null expression. 
*/ + newnode->exp = &null_exp; + comp = -1; + } + if (comp < 0) { if (NULL == n->left) @@ -1636,10 +1475,8 @@ return n; } n->left = insert_dict(dict, n->left, newnode); - return n; - /* return rebalance(n); Uncomment to get an AVL tree */ } - else if (comp > 0) + else { if (NULL == n->right) { @@ -1647,16 +1484,71 @@ return n; } n->right = insert_dict(dict, n->right, newnode); - return n; - /* return rebalance(n); Uncomment to get an AVL tree */ } - else + + return n; + /* return rebalance(n); Uncomment to get an AVL tree */ +} + +/** + * Find if a warning symbol exists in the currently suppress list. + * The warning symbols are constructed in a way that disallow overlap + * matching. + */ +static bool is_warning_suppressed(Dictionary dict, const char *warning_symbol) +{ + if (NULL == dict->suppress_warning) return false; + return (NULL != strstr(dict->suppress_warning, warning_symbol)); +} + +/** + * Remember the length_limit definitions in a list according to their order. + * The order is kept to allow later more specific definitions to override + * already applied ones. 
+ */ +static void add_condesc_length_limit(Dictionary dict, Dict_node *dn, + int length_limit) +{ + length_limit_def_t *lld = malloc(sizeof(*lld)); + lld->next = NULL; + lld->length_limit = length_limit; + lld->defexp = dn->exp; + lld->defword = dn->string; + *dict->contable.length_limit_def_next = lld; + dict->contable.length_limit_def_next = &lld->next; +} + +static void insert_length_limit(Dictionary dict, Dict_node *dn) +{ + int length_limit; + + if (0 == strcmp(UNLIMITED_CONNECTORS_WORD, dn->string)) { - char t[256]; - snprintf(t, 256, "The word \"%s\" has been multiply defined\n", newnode->string); - dict_error(dict, t); - return NULL; + length_limit = UNLIMITED_LEN; } + else + if (0 == strncmp(LIMITED_CONNECTORS_WORD, dn->string, + sizeof(LIMITED_CONNECTORS_WORD)-1)) + { + char *endp; + length_limit = + (int)strtol(dn->string + sizeof(LIMITED_CONNECTORS_WORD)-1, &endp, 10); + if ((length_limit < 0) || (length_limit > MAX_SENTENCE) || + (('\0' != *endp) && (SUBSCRIPT_MARK != *endp))) + { + prt_error("Warning: Word \"%s\" found near line %d of %s.\n" + "\tThis word should end with a number (1-%d).\n" + "\tThis word will be ignored.", + dn->string, dict->line_number, dict->name, MAX_SENTENCE); + return; + } + } + else return; + + /* We cannot set the connectors length_limit yet because the + * needed data structure is not defined yet. For now, just + * remember the definitions in their order. 
*/ + add_condesc_length_limit(dict, dn, length_limit); } /** @@ -1694,39 +1586,62 @@ dn_second_half = dn->left; dn->left = dn->right = NULL; - if (contains_underbar(dn->string)) - { - insert_idiom(dict, dn); - } - else if (is_idiom_word(dn->string)) + if (is_idiom_word(dn->string)) { - err_msg(lg_Warn, "Warning: Word \"%s\" found near line %d of %s.\n" + prt_error("Warning: Word \"%s\" found near line %d of %s.\n" "\tWords ending \".Ix\" (x a number) are reserved for idioms.\n" "\tThis word will be ignored.", dn->string, dict->line_number, dict->name); - free_dict_node(dn); - } - else if ((dn_head = abridged_lookup_list(dict, dn->string)) != NULL) - { - char *u; - Dict_node *dnx; - - u = strchr(dn->string, SUBSCRIPT_MARK); - if (u) *u = SUBSCRIPT_DOT; - prt_error("Warning: The word \"%s\" " - "found near line %d of %s matches the following words:", - dn->string, dict->line_number, dict->name); - for (dnx = dn_head; dnx != NULL; dnx = dnx->right) { - prt_error("\a\t%s", dnx->string); - } - prt_error("\a\n\tThis word will be ignored.\n"); - free_lookup(dn_head); - free_dict_node(dn); + free(dn); } else { + if (contains_underbar(dn->string)) + { + insert_idiom(dict, dn); + } + dict->root = insert_dict(dict, dict->root, dn); + insert_length_limit(dict, dn); dict->num_entries++; + + if ((verbosity_level(D_DICT+0) && !is_warning_suppressed(dict, DUP_BASE)) || + verbosity_level(D_SPEC+3)) + { + /* Warn if there are words with a subscript that match a bare word. 
*/ + const char *sm = strchr(dn->string, SUBSCRIPT_MARK); + char *bareword; + + if (NULL != sm) + { + bareword = strdupa(dn->string); + bareword[sm - dn->string] = '\0'; + } + else + { + bareword = (char *)dn->string; + } + + if ((dn_head = strict_lookup_list(dict, bareword)) != NULL) + { + bool match_found = false; + for (Dict_node *dnx = dn_head; dnx != NULL; dnx = dnx->right) { + if (0 != strcmp(dnx->string, dn->string)) + { + if (!match_found) + { + prt_error("Warning: The word \"%s\" found near line " + "%d of %s\n\t matches the following words:", + dn->string, dict->line_number, dict->name); + match_found = true; + } + prt_error("\t%s", dnx->string); + } + } + if (match_found) prt_error("\n"); + file_free_lookup(dn_head); + } + } } insert_list(dict, p, k); @@ -1767,7 +1682,7 @@ return false; } } - else if ((dict->token[0] == '#') && (0 == strcmp(dict->token, "#include"))) + else if (0 == strcmp(dict->token, "#include")) { bool rc; char* instr; @@ -1776,14 +1691,14 @@ bool save_is_special; const char * save_input; const char * save_pin; - char save_already_got_it; + int save_already_got_it; int save_line_number; size_t skip_slash; if (!link_advance(dict)) goto syntax_error; skip_slash = ('/' == dict->token[0]) ? 1 : 0; - dict_name = strdup(dict->token); + dict_name = strdupa(dict->token); save_name = dict->name; save_is_special = dict->is_special; save_input = dict->input; @@ -1816,7 +1731,6 @@ dict->line_number = save_line_number; free(instr); - free(dict_name); if (!rc) goto syntax_error; /* when we return, point to the next entry */ @@ -1867,12 +1781,6 @@ goto syntax_error; } - /* pass the ; */ - if (!link_advance(dict)) - { - goto syntax_error; - } - /* At this point, dn points to a list of Dict_nodes connected by * their left pointers. 
These are to be inserted into the dictionary */ i = 0; @@ -1882,6 +1790,20 @@ i++; } dict->insert_entry(dict, dn, i); + + if (dict->suppress_warning) + { + free((void *)dict->suppress_warning); + dict->suppress_warning = NULL; + } + + /* pass the ; */ + if (!link_advance(dict)) + { + /* Avoid freeing dn, since it is already inserted into the dict. */ + return false; + } + return true; syntax_error: @@ -1933,160 +1855,3 @@ } /* ======================================================================= */ - -/** - * Display the information about the given word. - * If the word can split, display the information about each part. - * Note that the splits may be invalid grammatically. - * - * Wild-card search is supported; the command-line user can type in !!word* or - * !!word*.sub and get a list of all words that match up to the wild-card. - * In this case no split is done. - * - * FIXME: Errors are printed twice, since display_word_split() is invoked twice - * per word. One way to fix it is to change display_word_split() to return false - * on failure. However, this is a big fix, because the failure is several - * functions deep, all not returning a value or returning a value for another - * purpose. An easy fix, which has advantages for other things, is to add (and - * use here) a "char *last_error" field in the Dictionary structure, serving - * like an "errno" of library calls. - */ - -static void display_word_split(Dictionary dict, - const char * word, - Parse_Options opts, - void (*display)(Dictionary, const char *)) -{ - Sentence sent; - struct Parse_Options_s display_word_opts = *opts; - - if ('\0' == word) return; /* avoid trying null strings */ - - parse_options_set_spell_guess(&display_word_opts, 0); - sent = sentence_create(word, dict); - if (0 == sentence_split(sent, &display_word_opts)) - { - /* List the splits */ - print_sentence_word_alternatives(sent, false, NULL, NULL); - /* List the disjuncts information. 
*/ - print_sentence_word_alternatives(sent, false, display, NULL); - } - sentence_delete(sent); -} - -#define DJ_COL_WIDTH sizeof(" ") - -/** - * Display the number of disjuncts associated with this dict node - */ -static void display_counts(const char *word, Dict_node *dn) -{ - printf("matches:\n"); - - for (; dn != NULL; dn = dn->right) - { - unsigned int len; - char * s; - char * t; - - len = count_disjunct_for_dict_node(dn); - s = strdup(dn->string); - t = strrchr(s, SUBSCRIPT_MARK); - if (t) *t = SUBSCRIPT_DOT; - printf(" "); - left_print_string(stdout, s, DJ_COL_WIDTH); - free(s); - printf(" %8u disjuncts ", len); - if (dn->file != NULL) - { - printf("<%s>", dn->file->file); - } - printf("\n"); - } -} - -/** - * Display the number of disjuncts associated with this dict node - */ -static void display_expr(const char *word, Dict_node *dn) -{ - printf("expressions:\n"); - for (; dn != NULL; dn = dn->right) - { - char * s; - char * t; - - s = strdup(dn->string); - t = strrchr(s, SUBSCRIPT_MARK); - if (t) *t = SUBSCRIPT_DOT; - printf(" "); - left_print_string(stdout, s, DJ_COL_WIDTH); - free(s); - print_expression(dn->exp); - if (NULL != dn->right) /* avoid extra newlines at the end */ - printf("\n\n"); - } -} - -static void display_word_info(Dictionary dict, const char * word) -{ - const char * regex_name; - Dict_node *dn_head; - - dn_head = dictionary_lookup_wild(dict, word); - if (dn_head) - { - display_counts(word, dn_head); - free_lookup(dn_head); - return; - } - - /* Recurse, if it's a regex match */ - regex_name = match_regex(dict->regex_root, word); - if (regex_name) - { - display_word_info(dict, regex_name); - return; - } - printf("matches nothing in the dictionary."); -} - -static void display_word_expr(Dictionary dict, const char * word) -{ - const char * regex_name; - Dict_node *dn_head; - - dn_head = dictionary_lookup_wild(dict, word); - if (dn_head) - { - display_expr(word, dn_head); - free_lookup(dn_head); - return; - } - - /* Recurse, if it's a 
regex match */ - regex_name = match_regex(dict->regex_root, word); - if (regex_name) - { - display_word_expr(dict, regex_name); - return; - } - printf("matches nothing in the dictionary."); -} - -/** - * dict_display_word_info() - display the information about the given word. - */ -void dict_display_word_info(Dictionary dict, const char * word, - Parse_Options opts) -{ - display_word_split(dict, word, opts, display_word_info); -} - -/** - * dict_display_word_expr() - display the connector info for a given word. - */ -void dict_display_word_expr(Dictionary dict, const char * word, Parse_Options opts) -{ - display_word_split(dict, word, opts, display_word_expr); -} diff -Nru link-grammar-5.3.16/link-grammar/dict-file/read-dict.h link-grammar-5.5.0/link-grammar/dict-file/read-dict.h --- link-grammar-5.3.16/link-grammar/dict-file/read-dict.h 2016-09-17 22:56:25.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-file/read-dict.h 2017-07-16 04:55:06.000000000 +0000 @@ -13,7 +13,7 @@ #ifndef _LG_READ_DICT_H_ #define _LG_READ_DICT_H_ -#include "dict-structures.h" +#include "dict-common/dict-structures.h" void print_dictionary_data(Dictionary dict); void print_dictionary_words(Dictionary dict); @@ -21,9 +21,11 @@ Dictionary dictionary_create_from_file(const char * lang); bool read_dictionary(Dictionary dict); -Dict_node * lookup_list(const Dictionary dict, const char *s); -bool boolean_lookup(Dictionary dict, const char *s); -void free_lookup(Dict_node *llist); +Dict_node * file_lookup_list(const Dictionary dict, const char *s); +Dict_node * file_lookup_wild(Dictionary dict, const char *s); +bool file_boolean_lookup(Dictionary dict, const char *s); +void file_free_lookup(Dict_node *llist); + void free_insert_list(Dict_node *ilist); void insert_list(Dictionary dict, Dict_node * p, int l); diff -Nru link-grammar-5.3.16/link-grammar/dict-file/read-regex.c link-grammar-5.5.0/link-grammar/dict-file/read-regex.c --- link-grammar-5.3.16/link-grammar/dict-file/read-regex.c 
2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-file/read-regex.c 2018-04-23 19:49:07.000000000 +0000 @@ -10,8 +10,8 @@ #include #include "link-includes.h" -#include "api-structures.h" -#include "structures.h" +#include "dict-common/dict-common.h" +#include "dict-common/file-utils.h" #include "read-regex.h" /* @@ -65,14 +65,14 @@ { do { - c = lg_fgetc(fp); + c = fgetc(fp); if (c == '\n') { line++; } } while (lg_isspace(c)); if (c == '%') { - while ((c != EOF) && (c != '\n')) { c = lg_fgetc(fp); } + while ((c != EOF) && (c != '\n')) { c = fgetc(fp); } line++; } } @@ -84,13 +84,13 @@ i = 0; do { - if (i > MAX_REGEX_NAME_LENGTH-1) + if (i >= MAX_REGEX_NAME_LENGTH-1) { prt_error("Error: Regex name too long on line %d\n", line); goto failure; } name[i++] = c; - c = lg_fgetc(fp); + c = fgetc(fp); } while ((!lg_isspace(c)) && (c != ':') && (c != EOF)); name[i] = '\0'; @@ -99,7 +99,7 @@ while (lg_isspace(c)) { if (c == '\n') { line++; } - c = lg_fgetc(fp); + c = fgetc(fp); } if (c != ':') { @@ -111,7 +111,7 @@ do { if (c == '\n') { line++; } - c = lg_fgetc(fp); + c = fgetc(fp); } while (lg_isspace(c)); if (c == '!') @@ -120,7 +120,7 @@ do { if (c == '\n') { line++; } - c = lg_fgetc(fp); + c = fgetc(fp); } while (lg_isspace(c)); } @@ -131,7 +131,6 @@ } /* Read in the regex. 
*/ - prev = 0; i = 0; do { @@ -141,7 +140,7 @@ goto failure; } prev = c; - c = lg_fgetc(fp); + c = fgetc(fp); regex[i++] = c; } while ((c != '/' || prev == '\\') && (c != EOF)); diff -Nru link-grammar-5.3.16/link-grammar/dict-file/word-file.c link-grammar-5.5.0/link-grammar/dict-file/word-file.c --- link-grammar-5.3.16/link-grammar/dict-file/word-file.c 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-file/word-file.c 2018-01-22 20:29:37.000000000 +0000 @@ -11,12 +11,34 @@ /***************************************************************************/ #include "error.h" -#include "dict-common.h" +#include "dict-common/dict-common.h" +#include "dict-common/dict-defines.h" // for MAX_WORD +#include "dict-common/file-utils.h" #include "string-set.h" -#include "structures.h" #include "read-dict.h" #include "word-file.h" +/** Replace the right-most dot with SUBSCRIPT_MARK */ +void patch_subscript(char * s) +{ + char *ds, *de; + int dp; + ds = strrchr(s, SUBSCRIPT_DOT); + if (!ds) return; + + /* a dot at the end or a dot followed by a number is NOT + * considered a subscript */ + de = ds + 1; + if (*de == '\0') return; + dp = (int) *de; + + /* If its followed by a UTF8 char, its NOT a subscript */ + if (127 < dp || dp < 0) return; + /* assert ((0 < dp) && (dp <= 127), "Bad dictionary entry!"); */ + if (isdigit(dp)) return; + *ds = SUBSCRIPT_MARK; +} + /** * Reads in one word from the file, allocates space for it, * and returns it. 
@@ -30,14 +52,14 @@ int c, j; do { - c = lg_fgetc(fp); + c = fgetc(fp); } while ((c != EOF) && lg_isspace(c)); if (c == EOF) return NULL; for (j=0; (j <= MAX_WORD-1) && (!lg_isspace(c)) && (c != EOF); j++) { word[j] = c; - c = lg_fgetc(fp); + c = fgetc(fp); } if (j >= MAX_WORD) { @@ -72,7 +94,7 @@ return NULL; } - wf = (Word_file *) xalloc(sizeof (Word_file)); + wf = malloc(sizeof (Word_file)); wf->file = string_set_add(filename, dict->string_set); wf->changed = false; wf->next = dict->word_file_header; @@ -85,7 +107,7 @@ free_insert_list(dn); return NULL; } - Dict_node * dn_new = (Dict_node *) xalloc(sizeof(Dict_node)); + Dict_node * dn_new = malloc(sizeof(Dict_node)); dn_new->left = dn; dn = dn_new; dn->string = s; @@ -101,7 +123,7 @@ for (;wf != NULL; wf = wf1) { wf1 = wf->next; - xfree((char *) wf, sizeof(Word_file)); + free(wf); } } diff -Nru link-grammar-5.3.16/link-grammar/dict-file/word-file.h link-grammar-5.5.0/link-grammar/dict-file/word-file.h --- link-grammar-5.3.16/link-grammar/dict-file/word-file.h 2016-09-20 09:10:11.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-file/word-file.h 2017-05-31 07:51:33.000000000 +0000 @@ -11,8 +11,7 @@ /* */ /*************************************************************************/ -#include "dict-api.h" -#include "structures.h" +#include "dict-common/dict-api.h" /* The structure below stores a list of dictionary word files. 
*/ struct Word_file_struct @@ -26,3 +25,5 @@ Dict_node * read_word_file(Dictionary dict, Dict_node * dn, char * filename); +void patch_subscript(char *); + diff -Nru link-grammar-5.3.16/link-grammar/dict-sql/demo.sql link-grammar-5.5.0/link-grammar/dict-sql/demo.sql --- link-grammar-5.3.16/link-grammar/dict-sql/demo.sql 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-sql/demo.sql 2017-07-16 04:55:06.000000000 +0000 @@ -13,6 +13,29 @@ -- cat demo.sql | sqlite3 dict.db -- +-- These first few lines identify the dictionary version number, +-- and the locale that applies. The are optional but strongly +-- recommended. The locale is used, among other things, to identify +-- capitalized words during tokenization. +-- +INSERT INTO Morphemes VALUES ('', '', ''); +INSERT INTO Disjuncts VALUES ('', 'V5v4v0+', 0.0); + +INSERT INTO Morphemes VALUES ('', '', ''); +INSERT INTO Disjuncts VALUES ('', 'EN4us+', 0.0); + +-- The UNKNOWN-WORD device is needed in order to allow the wild-card +-- query of dictionary contents to work. That is, the user can use the +-- command-line client to type in `!!blah*` and this will search the +-- dictionary for all words whos first four letters are `blah`. You +-- are free to replace `XXXBOGUS+` by something useful, for example, +-- If you use `Ds- & Os-`, than any word will be accepted as the object! +INSERT INTO Morphemes VALUES ('UNKNOWN-WORD', 'UNKNOWN-WORD', 'UNKNOWN-WORD'); +INSERT INTO Disjuncts VALUES ('UNKNOWN-WORD', 'XXXBOGUS+', 0.0); +-- INSERT INTO Disjuncts VALUES ('UNKNOWN-WORD', 'Ds- & Os-', 0.0); + +-- The following should look familier, if you already understand +-- link-grammar basics. 
INSERT INTO Morphemes VALUES ('LEFT-WALL', 'LEFT-WALL', 'LEFT-WALL'); INSERT INTO Disjuncts VALUES ('LEFT-WALL', 'Wd+ & WV+', 0.0); diff -Nru link-grammar-5.3.16/link-grammar/dict-sql/dict.sql link-grammar-5.5.0/link-grammar/dict-sql/dict.sql --- link-grammar-5.3.16/link-grammar/dict-sql/dict.sql 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-sql/dict.sql 2017-07-16 04:55:06.000000000 +0000 @@ -15,12 +15,13 @@ CREATE TABLE Morphemes ( -- For English, the 'morpheme' is the 'word'. A given morpheme - -- may appear mutiple times in this table. This is the field that + -- may appear multiple times in this table. This is the field that -- the tokenizer uses to determine if a token is in the dictionary. morpheme TEXT NOT NULL, -- The subscripted form of the above. The subscripted forms are - -- always unique for the dictionary. + -- always unique for the dictionary. They serve as a debugging tool, + -- unique identifier for the database. subscript TEXT UNIQUE NOT NULL, -- The classname is the set that the subscripted 'word' belongs to. diff -Nru link-grammar-5.3.16/link-grammar/dict-sql/README link-grammar-5.5.0/link-grammar/dict-sql/README --- link-grammar-5.3.16/link-grammar/dict-sql/README 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-sql/README 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ - - SQLite-based dictionary - ----------------------- - -The SQLite-based dictionary storage is meant to provide a simple -machine-readable interface to the Link Grammar dictionaries. - -Traditionally, the Lik Grammar dictionaries are stored in text files -that are hand-edited. This is fine for manual maintenance of the -dictionary entries, but is a stumbling block when it is desired that -the dictionary be maintained in an automated fashion, e.g. by -language-learning tools. The SQLite-based dictionary provides this -simpler interface. 
- -The current interface remains a hybrid: some of the data will remain -in text files, until the system is more fully developed. - -The SQLite format also offers another distinct advantage over the -text-bases system: an improved mechanism for handling non-integer -costs. In an automated language learning system, the costs will -typically be log-liklihoods, and thus have fractional values; this -is in contrast to the current text system, which only offers an -integer-valued cost system. - diff -Nru link-grammar-5.3.16/link-grammar/dict-sql/README.md link-grammar-5.5.0/link-grammar/dict-sql/README.md --- link-grammar-5.3.16/link-grammar/dict-sql/README.md 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-sql/README.md 2017-08-13 22:11:23.000000000 +0000 @@ -0,0 +1,59 @@ + +SQLite-based dictionary +----------------------- + +The SQLite-based dictionary storage is meant to provide a simple +machine-readable interface to the Link Grammar dictionaries. + +Traditionally, the Link Grammar dictionaries are stored in text files +that are hand-edited. This is fine for manual maintenance of the +dictionary entries, but is a stumbling block when it is desired that +the dictionary be maintained in an automated fashion, e.g. by +language-learning tools. The SQLite-based dictionary provides this +automation interface. + +Database format +--------------- +The current interface remains a hybrid: the affix, regex and +post-processing data will remain in text files, at least until the +system is more fully developed. (Or possibly forever, depending on +future plans and outcomes). + +The current design for the entries (rows) of the SQL table is that +they should hold one disjunct only, each. Thus, the `or` keyword +is not supported in the database entries. You should use multiple +rows if you want to `or` them together. + +Also not supported: the multi-connector at-sign `@` and the optional +connector braces `{}`. 
Again, these should be expanded into multi-row +entries. It would be fairly easy to add this support, but this misses +the point: the SQL database is NOT supposed to be "just like the text +files, but different". Its really meant to support a different way of +doing data management. The at-sign and the braces can be thought of as +a form of compression, a way of making the dictionary entries more +human-readable, more human-friendly. By contrast, the SQL backend is +meant to be machine-readable, and machine-friendly, so that it can be +more easily updated by machine-learning algorithms. + +Existing demo: +-------------- +Run the existing demo: +``` + $ link-parser demo-sql +``` +This can parse the following sentences: "this is a test", "this is +another test", "this is a dog", "this is a cat". All other sentences +will fail to parse (intentionally so). + +Creating a demo dictionary: +--------------------------- +Use the following commands, modified as desired: +``` +mkdir data/foo +cp data/demo-sql/4.0.* data/foo +cat dict.sql |sqlite3 data/foo/dict.db +cat demo.sql |sqlite3 data/foo/dict.db +link-parser foo +``` +The above should result in a dictionary that can parse the same sentences +as the demo database. 
diff -Nru link-grammar-5.3.16/link-grammar/dict-sql/read-sql.c link-grammar-5.5.0/link-grammar/dict-sql/read-sql.c --- link-grammar-5.3.16/link-grammar/dict-sql/read-sql.c 2017-01-27 22:01:05.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-sql/read-sql.c 2018-03-11 23:13:19.000000000 +0000 @@ -20,21 +20,31 @@ #include #include "api-structures.h" -#include "dict-api.h" -#include "dict-common.h" -#include "dict-structures.h" +#include "connectors.h" +#include "dict-common/dict-api.h" +#include "dict-common/dict-common.h" +#include "dict-common/dict-defines.h" // for LEFT_WALL_WORD +#include "dict-common/dict-impl.h" +#include "dict-common/dict-structures.h" +#include "dict-common/file-utils.h" #include "externs.h" -#include "spellcheck.h" +#include "lg_assert.h" #include "string-set.h" -#include "structures.h" +#include "tokenize/spellcheck.h" #include "utilities.h" -#include "word-utils.h" #include "read-sql.h" /* ========================================================= */ /* Mini expression-parsing library. This is a simplified subset of * what can be found in the file-backed dictionary. + * + * This does NOT support braces {} used to indicate optional connectors, + * nor does it support the multi-connector at-sign @ and so using these + * in the SQL tables is NOT valid! It also does not support cost + * brackets [] and it does no support 'or'. This is really really + * really supposed to be one unique disjunct per SQL table row! + * This is NOT meant to be "just like the text files, but different". 
*/ static Exp * make_expression(Dictionary dict, const char *exp_str) @@ -48,21 +58,23 @@ const char * p = exp_str; const char * con_start = NULL; - /* search for the start of a conector */ + /* search for the start of a connector */ while (*p && (lg_isspace(*p) || '&' == *p)) p++; con_start = p; if (0 == *p) return NULL; - /* search for the end of a conector */ + /* search for the end of a connector */ while (*p && (isalnum(*p) || '*' == *p)) p++; - + + if (0 == *p) return NULL; + /* Connectors always end with a + or - */ assert (('+' == *p) || ('-' == *p), "Missing direction character in connector string: %s", con_start); /* Create an expression to hold the connector */ - e = (Exp *) xalloc(sizeof(Exp)); + e = malloc(sizeof(Exp)); e->dir = *p; e->type = CONNECTOR_type; e->cost = 0.0; @@ -80,7 +92,8 @@ /* We have to use the string set, mostly because copy_Exp * in build_disjuncts fails to copy the string ... */ - e->u.string = string_set_add(constr, dict->string_set); + e->u.condesc = condesc_add(&dict->contable, + string_set_add(constr, dict->string_set)); free(constr); rest = make_expression(dict, ++p); @@ -88,11 +101,11 @@ return e; /* Join it all together with an AND node */ - and = (Exp *) xalloc(sizeof(Exp)); + and = malloc(sizeof(Exp)); and->type = AND_type; and->cost = 0.0; - and->u.l = ell = (E_list *) xalloc(sizeof(E_list)); - ell->next = elr = (E_list *) xalloc(sizeof(E_list)); + and->u.l = ell = malloc(sizeof(E_list)); + ell->next = elr = malloc(sizeof(E_list)); elr->next = NULL; ell->e = e; @@ -103,7 +116,7 @@ /* ========================================================= */ -/* Dictionary word lookup proceedures. */ +/* Dictionary word lookup procedures. 
*/ typedef struct { @@ -121,9 +134,8 @@ Exp *e; dn = llist->right; e = llist->exp; - if (e) - xfree((char *)e, sizeof(Exp)); - xfree((char *)llist, sizeof(Dict_node)); + if (e) free(e); + free(llist); llist = dn; } } @@ -191,23 +203,31 @@ scriword = argv[0]; wclass = argv[1]; + /* Now look up the expressions for each word */ + bs->exp = NULL; + db_lookup_exp(bs->dict, wclass, bs); + + /* Well, if we found a classname for a word, then there really, + * really should be able to find one or more corresponding disjuncts. + * However, it is possible to have corrupted databases which do not + * have any disjuncts for a word class. We silently ignore these. + * Although maybe we should throw an error here? + */ + if (NULL == bs->exp) return 0; + /* Put each word into a Dict_node. */ - dn = (Dict_node *) xalloc(sizeof(Dict_node)); + dn = malloc(sizeof(Dict_node)); + memset(dn, 0, sizeof(Dict_node)); dn->string = string_set_add(scriword, bs->dict->string_set); dn->right = bs->dn; + dn->exp = bs->exp; bs->dn = dn; - /* Now look up the expressions for each word */ - bs->exp = NULL; - db_lookup_exp(bs->dict, wclass, bs); - bs->dn->exp = bs->exp; - return 0; } - static void -db_lookup_common(Dictionary dict, const char *s, +db_lookup_common(Dictionary dict, const char *s, const char *equals, int (*cb)(void *, int, char **, char **), cbdata* bs) { @@ -216,7 +236,9 @@ /* The token to look up is called the 'morpheme'. 
*/ qry = dyn_str_new(); - dyn_strcat(qry, "SELECT subscript, classname FROM Morphemes WHERE morpheme = \'"); + dyn_strcat(qry, "SELECT subscript, classname FROM Morphemes WHERE morpheme "); + dyn_strcat(qry, equals); + dyn_strcat(qry, " \'"); dyn_strcat(qry, s); dyn_strcat(qry, "\';"); @@ -229,7 +251,7 @@ cbdata bs; bs.dict = dict; bs.found = false; - db_lookup_common(dict, s, exists_cb, &bs); + db_lookup_common(dict, s, "=", exists_cb, &bs); return bs.found; } @@ -238,7 +260,7 @@ cbdata bs; bs.dict = dict; bs.dn = NULL; - db_lookup_common(dict, s, morph_cb, &bs); + db_lookup_common(dict, s, "=", morph_cb, &bs); if (3 < verbosity) { if (bs.dn) @@ -254,8 +276,35 @@ return bs.dn; } +/** + * This is used to support wild-card lookup from the command-line + * client. That is, a user can type in `!!foo*` and look up all + * words that begin with the three letters `foo`. It works ... + * but it only works if the dictionary also has UNKNOWN-WORD defined! + */ +static Dict_node * db_lookup_wild(Dictionary dict, const char *s) +{ + cbdata bs; + bs.dict = dict; + bs.dn = NULL; + db_lookup_common(dict, s, "GLOB", morph_cb, &bs); + if (3 < verbosity) + { + if (bs.dn) + { + printf("Found expression for glob %s: ", s); + print_expression(bs.dn->exp); + } + else + { + printf("No expression for glob %s\n", s); + } + } + return bs.dn; +} + /* ========================================================= */ -/* Dictionary creation, setup, open proceedures */ +/* Dictionary creation, setup, open procedures */ bool check_db(const char *lang) { @@ -280,11 +329,9 @@ /* SQLite has a habit of leaving zero-length DB's lying around */ fd = fileno(fh); fstat(fd, &buf); + fclose(fh); if (0 == buf.st_size) - { - fclose(fh); return NULL; - } /* Found a file, of non-zero length. See if that works. 
*/ if (sqlite3_open(fullname, &db)) @@ -311,9 +358,8 @@ char *dbname; const char * t; Dictionary dict; - Dict_node *dict_node; - dict = (Dictionary) xalloc(sizeof(struct Dictionary_s)); + dict = (Dictionary) malloc(sizeof(struct Dictionary_s)); memset(dict, 0, sizeof(struct Dictionary_s)); /* Language and file-name stuff */ @@ -341,22 +387,22 @@ dict->db_handle = object_open(dict->name, db_open, NULL); dict->lookup_list = db_lookup_list; + dict->lookup_wild = db_lookup_wild; dict->free_lookup = db_free_llist; dict->lookup = db_lookup; dict->close = db_close; - /* Misc remaining common (generic) dict setup work */ - dict->left_wall_defined = boolean_dictionary_lookup(dict, LEFT_WALL_WORD); - dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD); - - dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD); - dict->use_unknown_word = true; - - dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD); - if (dict_node != NULL) - dict->unlimited_connector_set = connector_set_create(dict_node->exp); + /* Setup the affix table */ + dict->affix_table = (Dictionary) malloc(sizeof(struct Dictionary_s)); + memset(dict->affix_table, 0, sizeof(struct Dictionary_s)); + dict->affix_table->string_set = string_set_create(); + + afclass_init(dict->affix_table); + afdict_init(dict); + + dictionary_setup_locale(dict); - free_lookup_list(dict, dict_node); + dictionary_setup_defines(dict); return dict; } diff -Nru link-grammar-5.3.16/link-grammar/dict-sql/read-sql.h link-grammar-5.5.0/link-grammar/dict-sql/read-sql.h --- link-grammar-5.3.16/link-grammar/dict-sql/read-sql.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-sql/read-sql.h 2017-07-16 04:55:06.000000000 +0000 @@ -1,11 +1,10 @@ - /* * read-sql.h * * Read in dictionary from an SQL DB. * Keeping it simple for just right now, and using SQLite. 
* - * The goal of reading the dictioary from SQL is to enable some + * The goal of reading the dictionary from SQL is to enable some * other process (machine-learning algo) to dynamically update * the dictionary. * diff -Nru link-grammar-5.3.16/link-grammar/dict-structures.h link-grammar-5.5.0/link-grammar/dict-structures.h --- link-grammar-5.3.16/link-grammar/dict-structures.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/dict-structures.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,80 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _LG_DICT_STRUCTURES_H_ -#define _LG_DICT_STRUCTURES_H_ - -#include -#include - -LINK_BEGIN_DECLS - -/* Forward decls */ -typedef struct Dict_node_struct Dict_node; -typedef struct Exp_struct Exp; -typedef struct E_list_struct E_list; -typedef struct Word_file_struct Word_file; - -/** - * Types of Exp_struct structures - */ -typedef enum -{ - OR_type = 1, - AND_type, - CONNECTOR_type -} Exp_type; - -/** - * The E_list and Exp structures defined below comprise the expression - * trees that are stored in the dictionary. The expression has a type - * (AND, OR or TERMINAL). If it is not a terminal it has a list - * (an E_list) of children. - */ -struct Exp_struct -{ - Exp * next; /* Used only for mem management,for freeing */ - Exp_type type; /* One of three types: AND, OR, or connector. 
*/ - char dir; /* '-' means to the left, '+' means to right (for connector) */ - bool multi; /* TRUE if a multi-connector (for connector) */ - union { - E_list * l; /* Only needed for non-terminals */ - const char * string; /* Only needed if it's a connector */ - } u; - double cost; /* The cost of using this expression. - Only used for non-terminals */ -}; - -struct E_list_struct -{ - E_list * next; - Exp * e; -}; - - -/** - * The dictionary is stored as a binary tree comprised of the following - * nodes. A list of these (via right pointers) is used to return - * the result of a dictionary lookup. - */ -struct Dict_node_struct -{ - const char * string; /* The word itself */ - Word_file * file; /* The file the word came from (NULL if dict file) */ - Exp * exp; - Dict_node *left, *right; -}; - -LINK_END_DECLS - -#endif /* _LG_DICT_STRUCTURES_H_ */ diff -Nru link-grammar-5.3.16/link-grammar/disjuncts.c link-grammar-5.5.0/link-grammar/disjuncts.c --- link-grammar-5.3.16/link-grammar/disjuncts.c 2016-11-17 18:36:07.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/disjuncts.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,91 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2008, 2009, 2014 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ -/* - * disjuncts.c - * - * Miscellaneous utilities for returning the list of disjuncts that - * were actually used in a given parse of a sentence. - */ - -#include -#include -#include "api-structures.h" -#include "disjuncts.h" -#include "structures.h" - -/** - * Print connector list to string. 
- * This reverses the order of the connectors in the connector list, - * so that the resulting list is in the same order as it would appear - * in the dictionary. The character 'dir' is appended to each connector. - */ -static char * reversed_conlist_str(Connector* c, char dir, char* buf, size_t sz) -{ - char* p; - size_t len = 0; - - if (NULL == c) return buf; - p = reversed_conlist_str(c->next, dir, buf, sz); - - sz -= (p-buf); - - if (c->multi) - p[len++] = '@'; - - len += lg_strlcpy(p+len, c->string, sz-len); - if (3 < sz-len) - { - p[len++] = dir; - p[len++] = ' '; - p[len] = 0x0; - } - return p+len; -} - -/** - * Print disjunct to string. The resulting list is in the same order - * as it would appear in the dictionary. - */ -static void disjunct_str(Disjunct* dj, char* buf, size_t sz) -{ - char* p; - if (NULL == dj) { *buf = 0; return; } - p = reversed_conlist_str(dj->left, '-', buf, sz); - reversed_conlist_str(dj->right, '+', p, sz - (p-buf)); -} - -/** - * lg_compute_disjunct_strings -- Given sentence, compute disjuncts. - * - * This routine will compute the string representation of the disjunct - * used for each word in parsing the given sentence. A string - * representation of the disjunct is needed for most of the corpus - * statistics functions: this string, together with the subscripted - * word, is used as a key to index the statistics information in the - * database. 
- */ -void lg_compute_disjunct_strings(Linkage lkg) -{ - char djstr[MAX_TOKEN_LENGTH*20]; /* no word will have more than 20 links */ - size_t nwords = lkg->num_words; - - if (lkg->disjunct_list_str) return; - lkg->disjunct_list_str = (char **) malloc(nwords * sizeof(char *)); - memset(lkg->disjunct_list_str, 0, nwords * sizeof(char *)); - - for (WordIdx w=0; w< nwords; w++) - { - Disjunct* dj = lkg->chosen_disjuncts[w]; - disjunct_str(dj, djstr, sizeof(djstr)); - - lkg->disjunct_list_str[w] = strdup(djstr); - } -} diff -Nru link-grammar-5.3.16/link-grammar/disjuncts.h link-grammar-5.5.0/link-grammar/disjuncts.h --- link-grammar-5.3.16/link-grammar/disjuncts.h 2014-12-15 06:07:23.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/disjuncts.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2008, 2009 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#include "api-types.h" -#include "link-includes.h" - -void lg_compute_disjunct_strings(Linkage); diff -Nru link-grammar-5.3.16/link-grammar/disjunct-utils.c link-grammar-5.5.0/link-grammar/disjunct-utils.c --- link-grammar-5.3.16/link-grammar/disjunct-utils.c 2017-02-10 20:16:17.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/disjunct-utils.c 2018-04-22 14:57:33.000000000 +0000 @@ -10,15 +10,15 @@ /* */ /*************************************************************************/ -#include #include + +#include "api-structures.h" // Sentence +#include "connectors.h" #include "disjunct-utils.h" -#include "externs.h" -#include "string-set.h" -#include "structures.h" +#include "print/print-util.h" #include "utilities.h" -#include "wordgraph.h" -#include "word-utils.h" +#include "tokenize/tok-structures.h" // XXX TODO provide gword access methods! +#include "tokenize/word-structures.h" /* Disjunct utilities ... */ @@ -37,6 +37,22 @@ } } +void free_sentence_disjuncts(Sentence sent) +{ + if (NULL != sent->disjuncts_connectors_memblock) + { + free(sent->disjuncts_connectors_memblock); + sent->disjuncts_connectors_memblock = NULL; + } + else + { + for (WordIdx i = 0; i < sent->length; i++) + { + free_disjuncts(sent->word[i].d); + } + } +} + /** * Destructively catenates the two disjunct lists d1 followed by d2. * Doesn't change the contents of the disjuncts. @@ -85,24 +101,22 @@ unsigned int i; i = 0; for (e = d->left ; e != NULL; e = e->next) { - i += string_hash(e->string); + i += e->desc->str_hash; } for (e = d->right ; e != NULL; e = e->next) { - i += string_hash(e->string); + i += e->desc->str_hash; } - i += string_hash(d->string); + i += string_hash(d->word_string); i += (i>>10); return (i & (dt->dup_table_size-1)); } /** - * The connectors must be exactly equal. A similar function - * is connectors_equal_AND(), but that ignores priorities, - * this does not. 
+ * The connectors must be exactly equal. */ static bool connectors_equal_prune(Connector *c1, Connector *c2) { - return string_set_cmp(c1->string, c2->string) && (c1->multi == c2->multi); + return c1->desc == c2->desc && (c1->multi == c2->multi); } /** returns TRUE if the disjuncts are exactly the same */ @@ -131,8 +145,8 @@ /* Save CPU time by comparing this last, since this will * almost always be true. Rarely, the strings are not from * the same string_set and hence the 2-step comparison. */ - if (d1->string == d2->string) return true; - return (strcmp(d1->string, d2->string) == 0); + if (d1->word_string == d2->word_string) return true; + return (strcmp(d1->word_string, d2->word_string) == 0); } /** @@ -148,7 +162,7 @@ for (t = origc; t != NULL; t = t->next) { - newc = connector_new(); + newc = connector_new(NULL, NULL); *newc = *t; prevc->next = newc; @@ -173,7 +187,7 @@ for (t = origd; t != NULL; t = t->next) { newd = (Disjunct *)xalloc(sizeof(Disjunct)); - newd->string = t->string; + newd->word_string = t->word_string; newd->cost = t->cost; newd->left = connectors_dup(t->left); newd->right = connectors_dup(t->right); @@ -186,6 +200,58 @@ return head.next; } +static Connector *pack_connectors_dup(Connector *origc, Connector **cblock) +{ + Connector head; + Connector *prevc = &head; + Connector *newc = &head; + Connector *t; + Connector *lcblock = *cblock; /* Optimization. */ + + for (t = origc; t != NULL; t = t->next) + { + newc = lcblock++; + *newc = *t; + + prevc->next = newc; + prevc = newc; + } + newc->next = NULL; + + *cblock = lcblock; + return head.next; +} + +/** + * Duplicate the given disjunct chain. + * If the argument is NULL, return NULL. + */ +Disjunct *pack_disjuncts_dup(Disjunct *origd, Disjunct **dblock, Connector **cblock) +{ + Disjunct head; + Disjunct *prevd = &head; + Disjunct *newd = &head; + Disjunct *t; + Disjunct *ldblock = *dblock; /* Optimization. 
*/ + + for (t = origd; t != NULL; t = t->next) + { + newd = ldblock++; + newd->word_string = t->word_string; + newd->cost = t->cost; + + newd->left = pack_connectors_dup(t->left, cblock); + newd->right = pack_connectors_dup(t->right, cblock); + newd->originating_gword = t->originating_gword; + prevd->next = newd; + prevd = newd; + } + newd->next = NULL; + + *dblock = ldblock; + return head.next; +} + static disjunct_dup_table * disjunct_dup_table_new(size_t sz) { size_t i; @@ -217,7 +283,7 @@ /** * Return a new gword_set element, initialized from the given element. - * @old_e Existing element. + * @param old_e Existing element. */ static gword_set *gword_set_element_new(gword_set *old_e) { @@ -251,8 +317,8 @@ * This function is used for adding the gword pointers of an eliminated * disjunct to the ones of the kept disjuncts, with no duplicates. * - * @kept gword_set of the kept disjunct. - * @eliminated gword_set of the eliminated disjunct. + * @param kept gword_set of the kept disjunct. + * @param eliminated gword_set of the eliminated disjunct. * @return Use copy-on-write semantics - the gword_set of the kept disjunct * just gets returned if there is nothing to add to it. Else - a new gword * set is returned. @@ -348,37 +414,54 @@ * Be sure to free the string upon return. 
*/ -static char * prt_con(Connector *c, char * p, char dir, size_t * bufsz) +static void prt_con(Connector *c, dyn_str * p, char dir) { - size_t n; - - if (NULL == c) return p; - p = prt_con (c->next, p, dir, bufsz); + if (NULL == c) return; + prt_con (c->next, p, dir); if (c->multi) { - n = snprintf(p, *bufsz, "@%s%c ", c->string, dir); - *bufsz -= n; + append_string(p, "@%s%c ", connector_string(c), dir); } else { - n = snprintf(p, *bufsz, "%s%c ", c->string, dir); - *bufsz -= n; + append_string(p, "%s%c ", connector_string(c), dir); } - return p+n; } char * print_one_disjunct(Disjunct *dj) { - char buff[MAX_LINE]; - char * p = buff; - size_t bufsz = MAX_LINE; - - p = prt_con(dj->left, p, '-', &bufsz); - p = prt_con(dj->right, p, '+', &bufsz); - buff[MAX_LINE-1] = 0; + dyn_str *p = dyn_str_new(); + + prt_con(dj->left, p, '-'); + prt_con(dj->right, p, '+'); + + return dyn_str_take(p); +} + +/* ============================================================= */ + +/** + * returns the number of connectors in the left lists of the disjuncts. + */ +int left_connector_count(Disjunct * d) +{ + Connector *c; + int i=0; + for (;d!=NULL; d=d->next) { + for (c = d->left; c!=NULL; c = c->next) i++; + } + return i; +} - return strdup(buff); +int right_connector_count(Disjunct * d) +{ + Connector *c; + int i=0; + for (;d!=NULL; d=d->next) { + for (c = d->right; c!=NULL; c = c->next) i++; + } + return i; } /* ============================================================= */ diff -Nru link-grammar-5.3.16/link-grammar/disjunct-utils.h link-grammar-5.5.0/link-grammar/disjunct-utils.h --- link-grammar-5.3.16/link-grammar/disjunct-utils.h 2017-02-03 00:13:22.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/disjunct-utils.h 2018-04-22 14:57:33.000000000 +0000 @@ -4,7 +4,7 @@ /* All rights reserved */ /* */ /* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this softwares. 
*/ +/* license set forth in the LICENSE file included with this softwares. */ /* This license allows free redistribution and use in source and binary */ /* forms, with or without modification, subject to certain conditions. */ /* */ @@ -13,16 +13,43 @@ #ifndef _LINK_GRAMMAR_DISJUNCT_UTILS_H_ #define _LINK_GRAMMAR_DISJUNCT_UTILS_H_ +#include + #include "api-types.h" -#include "structures.h" +#include "api-structures.h" // Sentence + +// Can undefine VERIFY_MATCH_LIST when done debugging... +#define VERIFY_MATCH_LIST + +struct Disjunct_struct +{ + Disjunct *next; + Connector *left, *right; + double cost; + bool marked; /* unmarked disjuncts get deleted */ + + /* match_left, right used only during parsing, for the match list. */ + bool match_left, match_right; + +#ifdef VERIFY_MATCH_LIST + int match_id; /* verify the match list integrity */ +#endif + gword_set *originating_gword; /* Set of originating gwords */ + const char * word_string; /* subscripted dictionary word */ +}; /* Disjunct utilities ... 
*/ void free_disjuncts(Disjunct *); +void free_sentence_disjuncts(Sentence); unsigned int count_disjuncts(Disjunct *); Disjunct * catenate_disjuncts(Disjunct *, Disjunct *); Disjunct * eliminate_duplicate_disjuncts(Disjunct * ); char * print_one_disjunct(Disjunct *); void word_record_in_disjunct(const Gword *, Disjunct *); Disjunct * disjuncts_dup(Disjunct *origd); +int left_connector_count(Disjunct *); +int right_connector_count(Disjunct *); + +Disjunct *pack_disjuncts_dup(Disjunct *, Disjunct **, Connector **); #endif /* _LINK_GRAMMAR_DISJUNCT_UTILS_H_ */ diff -Nru link-grammar-5.3.16/link-grammar/error.c link-grammar-5.5.0/link-grammar/error.c --- link-grammar-5.3.16/link-grammar/error.c 2017-01-27 22:01:05.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/error.c 2018-02-22 01:37:52.000000000 +0000 @@ -17,9 +17,9 @@ #include #include "error.h" -#include "structures.h" -#include "api-structures.h" -#include "print-util.h" +#include "api-structures.h" // For Sentence_s +#include "print/print.h" // For print_sentence_context() +#include "print/print-util.h" // For append_string() static void default_error_handler(lg_errinfo *, void *); static TLS struct @@ -170,9 +170,7 @@ */ char *lg_error_formatmsg(lg_errinfo *lge) { - char *formated_error_message; - - String *s = string_new(); + dyn_str *s = dyn_str_new(); /* Prepend libname to messages with higher severity than Debug. */ if (lge->severity < lg_Debug) @@ -183,10 +181,23 @@ append_string(s, "%s", lge->text); - formated_error_message = string_copy(s); - string_delete(s); + return dyn_str_take(s); +} + +static TLS dyn_str *outbuf = NULL; - return formated_error_message; +/** + * Flush a partial error message if exists. + * Return true iff a message has been actually flushed. + * + * (Just using prt_error("\n") also flushes a buffered partial error + * message, but if there is no such message an empty message is generated). 
+ */ +bool lg_error_flush(void) +{ + if (outbuf == NULL) return false; + prt_error("\n"); + return true; } /* ================================================================== */ @@ -198,11 +209,8 @@ { FILE *outfile = stdout; - if (((NULL == data) && (lge->severity <= lg_Debug)) || - ((NULL != data) && (lge->severity <= *(lg_error_severity *)data) && - (lg_None != lge->severity))) if (((NULL == data) && (lge->severity < lg_Debug)) || - ((NULL != data) && (lge->severity < *(lg_error_severity *)data) && + ((NULL != data) && (lge->severity < *(lg_error_severity *)(int *)data) && (lg_None != lge->severity))) { fflush(stdout); /* Make sure that stdout has been written out first. */ @@ -239,69 +247,12 @@ return strdup(sevlabel); } -static void print_sentence_context(String *outbuf, const err_ctxt *ec) -{ - size_t i, j; - const char **a, **b; - -#if 0 - /* Previous code. Documenting its problem: - * In the current library version (using Wordgraph) it may print a - * nonsense sequence of morphemes if the words have been split to - * morphemes in various ways, because the "alternatives" array doesn't - * hold real alternatives any more (see example in the comments of - * print_sentence_word_alternatives()). - * - * We could print the first path in the Wordgraph, analogous to what we - * did here, but (same problem as printing alternatives[0] only) it may - * not contain all the words, including those that failed (because they - * are in another path). */ - - fprintf(stderr, "\tFailing sentence was:\n\t"); - for (i=0; isent->length; i++) - { - fprintf(stderr, "%s ", ec->sent->word[i].alternatives[0]); - } -#else - /* The solution is just to print all the sentence tokenized subwords in - * their order in the sentence, without duplications. 
*/ - - append_string(outbuf, - "\tFailing sentence contains the following words/morphemes:\n\t"); - for (i=0; isent->length; i++) - { - for (a = ec->sent->word[i].alternatives; NULL != *a; a++) - { - bool next_word = false; - - for (j=0; jsent->length; j++) - { - for (b = ec->sent->word[j].alternatives; NULL != *b; b++) - { - /* print only the first occurrence. */ - if (0 == strcmp(*a, *b)) - { - next_word = true; - if (a != b) break; - append_string(outbuf, "%s ", *a); - break; - } - } - if (next_word) break; - } - } - } - append_string(outbuf, "\n"); -#endif -} - static void verr_msg(err_ctxt *ec, lg_error_severity sev, const char *fmt, va_list args) GNUC_PRINTF(3,0); static void verr_msg(err_ctxt *ec, lg_error_severity sev, const char *fmt, va_list args) { - static TLS String *outbuf; - if (NULL == outbuf) outbuf = string_new(); + if (NULL == outbuf) outbuf = dyn_str_new(); /* * If the message is a complete one, it ends with a newline. Else the @@ -328,11 +279,11 @@ if (partline) return; if ((NULL != ec) && (NULL != ec->sent)) - print_sentence_context(outbuf, ec); + print_sentence_context(ec->sent, outbuf); lg_errinfo current_error; /* current_error.ec = *ec; */ - const char *error_text = string_value(outbuf); + const char *error_text = outbuf->str; lg_error_severity msg_sev = message_error_severity(error_text); if (lg_None != msg_sev) { @@ -355,7 +306,7 @@ free((void *)current_error.severity_label); } - string_delete(outbuf); + dyn_str_delete(outbuf); outbuf = NULL; } @@ -378,9 +329,9 @@ * See \link verr_msg \endlink for how the severity is handled * if it is not specified. * - * @fmt printf()-like format. - * @... printf()-like arguments. - * @retrun Always 0, not to be used. This is needed so prt_error() + * @param fmt printf()-like format. + * @param ... printf()-like arguments. + * @return Always 0, not to be used. This is needed so prt_error() * can be used in complex macros that have to use the comma operator. */ int prt_error(const char *fmt, ...) 
@@ -413,6 +364,7 @@ while (NULL != (feature = va_arg(given_features, char *))) { + if ('\0' == feature[0]) continue; size_t len = strlen(feature); char *buff = alloca(len + 2 + 1); /* leading comma + comma/colon + NUL */ @@ -430,9 +382,17 @@ strcpy(buff+1, feature); strcat(buff, ","); - if (NULL != strstr(list, buff)) return ","; + if (NULL != strstr(list, buff)) + { + va_end(given_features); + return ","; + } buff[len+1] = ':'; /* check for "feature:param" */ - if (NULL != strstr(list, buff)) return strstr(list, buff) + len + 1; + if (NULL != strstr(list, buff)) + { + va_end(given_features); + return strstr(list, buff) + len + 1; + } } va_end(given_features); diff -Nru link-grammar-5.3.16/link-grammar/error.h link-grammar-5.5.0/link-grammar/error.h --- link-grammar-5.3.16/link-grammar/error.h 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/error.h 2018-04-23 19:49:07.000000000 +0000 @@ -5,7 +5,7 @@ /* All rights reserved */ /* */ /* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ +/* license set forth in the LICENSE file included with this software. */ /* This license allows free redistribution and use in source and binary */ /* forms, with or without modification, subject to certain conditions. */ /* */ @@ -14,6 +14,7 @@ #define _LINK_GRAMMAR_ERROR_H_ #include "link-includes.h" +#include "externs.h" // Needed for verbosity /* User verbosity levels are 1-4, to be used for user info/debug. * For now hard-coded numbers are still used instead of D_USER_BASIC/TIMES. */ @@ -22,6 +23,8 @@ #define D_USER_FILES 3 /* Display data file search and locale setup. */ // #define D_USER_X 4 /* Not in use yet. */ #define D_USER_MAX 4 /* Maximum user verbosity level. */ +#define D_DICT 10 /* Base of dictionary debug levels. */ +#define D_SPEC 100 /* Base of special stand-alone levels. 
*/ typedef struct { @@ -33,17 +36,23 @@ const char *feature_enabled(const char *, ...); /** - * Print a debug message at verbosity >= level. - * Preceding the level number by a + (+level) adds printing of the - * function name. - * Level numbers 2 to D_USER_MAX are not printed on verbosity>D_USER_MAX, - * because they are designed only for extended user information. - * The !debug variable can be set to a comma-separated list of functions - * in order to restrict the debug messages to these functions only. + * Print a debug message according to their level. + * Print the messages at levels <= the specified verbosity, with the + * following restrictions: + * - Level numbers 2 to D_USER_MAX are not printed on verbosity>D_USER_MAX, + * because they are designed only for extended user information. + * - When verbosity > D_SPEC, print messages only when level==verbosity. + * - The !debug variable can be set to a comma-separated list of functions + * or source filenames in order to restrict the debug messages to these + * functions or filenames only. + * + * Invoking lgdebug() with a level number preceded by a + (+level) adds + * printing of the function name. */ #define lgdebug(level, ...) \ -(((verbosity>=(level)) && (((level)<=1) || \ - !(((level)<=D_USER_MAX) && (verbosity>D_USER_MAX))) && \ + (( \ + (((D_SPEC>=verbosity) && (verbosity>=(level))) || (verbosity==(level))) && \ + (((level)<=1) || !(((level)<=D_USER_MAX) && (verbosity>D_USER_MAX))) && \ (('\0' == debug[0]) || \ feature_enabled(debug, __func__, __FILE__, NULL))) ? \ ( \ @@ -73,14 +82,17 @@ * and not err_msg(), in order to not specify the message severity. * Also note there is no trailing newline in that case. These things * ensured the message severity will be taken from a following message - * which includes a newline. So verbosity_level()V) can be used for any + * which includes a newline. So verbosity_level(V) can be used for any * desired message severity. 
+ * The optional argument is used for additional names that can be used + * in the "debug" option (in addition to the current function and file names). */ -#define verbosity_level(level) \ -(((verbosity>=(level)) && (((level)<=1) || \ - !(((level)<=D_USER_MAX) && (verbosity>D_USER_MAX))) && \ +#define verbosity_level(level, ...) \ + (( \ + (((D_SPEC>=verbosity) && (verbosity>=(level))) || (verbosity==(level))) && \ + (((level)<=1) || !(((level)<=D_USER_MAX) && (verbosity>D_USER_MAX))) && \ (('\0' == debug[0]) || \ - feature_enabled(debug, __func__, __FILE__, NULL))) \ + feature_enabled(debug, __func__, __FILE__, (__VA_ARGS__ ""), NULL))) \ ? ((STRINGIFY(level)[0] == '+' ? prt_error("%s: ", __func__) : 0), true) \ : false) diff -Nru link-grammar-5.3.16/link-grammar/expand.c link-grammar-5.5.0/link-grammar/expand.c --- link-grammar-5.3.16/link-grammar/expand.c 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/expand.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,66 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2009 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ -/* - * expand.c - * - * Enlarge the range of possible disjunct to consider while parsing. 
- */ - -#include "api-structures.h" -#include "expand.h" -#include "externs.h" -#include "disjunct-utils.h" -#include "word-utils.h" -#include "corpus/cluster.h" - -/* ========================================================= */ - -static Disjunct * build_expansion_disjuncts(Cluster *clu, X_node *x) -{ - Disjunct *dj; - dj = lg_cluster_get_disjuncts(clu, x->string); - if (dj && (verbosity > 0)) prt_error("Expanded %s \n", x->string); - return dj; -} - -/** - * Increase the number of disjuncts associated to each word in the - * sentence by working with word-clusters. Return true if the number - * of disjuncts were expanded, else return false. - */ -bool lg_expand_disjunct_list(Sentence sent) -{ - size_t w; - - Cluster *clu = lg_cluster_new(); - - bool expanded = false; - for (w = 0; w < sent->length; w++) - { - X_node * x; - Disjunct * d = sent->word[w].d; - for (x = sent->word[w].x; x != NULL; x = x->next) - { - Disjunct *dx = build_expansion_disjuncts(clu, x); - if (dx) - { - unsigned int cnt = count_disjuncts(d); - d = catenate_disjuncts(dx, d); - d = eliminate_duplicate_disjuncts(d); - if (cnt < count_disjuncts(d)) expanded = true; - } - } - sent->word[w].d = d; - } - lg_cluster_delete(clu); - - return expanded; -} diff -Nru link-grammar-5.3.16/link-grammar/expand.h link-grammar-5.5.0/link-grammar/expand.h --- link-grammar-5.3.16/link-grammar/expand.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/expand.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,14 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2009 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this softwares. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -/* Defined in link-includes.h */ -/* int lg_expand_disjunct_list(Sentence sent); */ - diff -Nru link-grammar-5.3.16/link-grammar/externs.h link-grammar-5.5.0/link-grammar/externs.h --- link-grammar-5.3.16/link-grammar/externs.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/externs.h 2017-05-28 11:14:43.000000000 +0000 @@ -12,7 +12,7 @@ #ifndef _EXTERNS_H #define _EXTERNS_H -/* verbosity global is held in utilities.c */ +/* verbosity global is held in api.c */ extern int verbosity; /* the verbosity level for error messages */ extern char * debug; /* comma-separated functions/files to debug */ extern char * test; /* comma-separated features to test */ diff -Nru link-grammar-5.3.16/link-grammar/extract-links.c link-grammar-5.5.0/link-grammar/extract-links.c --- link-grammar-5.3.16/link-grammar/extract-links.c 2017-01-27 22:07:11.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/extract-links.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,666 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2010, 2014 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include /* For UINT_MAX */ - -#include "count.h" -#include "extract-links.h" -#include "fast-match.h" -#include "linkage.h" -#include "word-utils.h" - -/** - * The first thing we do is we build a data structure to represent the - * result of the entire parse search. 
There will be a set of nodes - * built for each call to the count() function that returned a non-zero - * value, AND which is part of a valid linkage. Each of these nodes - * represents a valid continuation, and contains pointers to two other - * sets (one for the left continuation and one for the right - * continuation). - */ - -static void free_set(Parse_set *s) -{ - Parse_choice *p, *xp; - if (s == NULL) return; - for (p=s->first; p != NULL; p = xp) - { - xp = p->next; - xfree((void *)p, sizeof(*p)); - } -} - -static Parse_choice * -make_choice(Parse_set *lset, Connector * llc, Connector * lrc, - Parse_set *rset, Connector * rlc, Connector * rrc, - Disjunct *ld, Disjunct *md, Disjunct *rd) -{ - Parse_choice *pc; - pc = (Parse_choice *) xalloc(sizeof(*pc)); - pc->next = NULL; - pc->set[0] = lset; - pc->link[0].link_name = NULL; - pc->link[0].lw = lset->lw; - pc->link[0].rw = lset->rw; - pc->link[0].lc = llc; - pc->link[0].rc = lrc; - pc->set[1] = rset; - pc->link[1].link_name = NULL; - pc->link[1].lw = rset->lw; - pc->link[1].rw = rset->rw; - pc->link[1].lc = rlc; - pc->link[1].rc = rrc; - pc->ld = ld; - pc->md = md; - pc->rd = rd; - return pc; -} - -/** - * Put this parse_choice into a given set. The tail pointer is always - * left pointing to the end of the list. - */ -static void put_choice_in_set(Parse_set *s, Parse_choice *pc) -{ - if (s->first == NULL) - { - s->first = pc; - } - else - { - s->tail->next = pc; - } - s->tail = pc; - pc->next = NULL; -} - -static void record_choice( - Parse_set *lset, Connector * llc, Connector * lrc, - Parse_set *rset, Connector * rlc, Connector * rrc, - Disjunct *ld, Disjunct *md, Disjunct *rd, Parse_set *s) -{ - put_choice_in_set(s, make_choice(lset, llc, lrc, - rset, rlc, rrc, - ld, md, rd)); -} - -/** - * Allocate the parse info struct - * - * A piecewise exponential function determines the size of the hash - * table. Probably should make use of the actual number of disjuncts, - * rather than just the number of words. 
- */ -Parse_info parse_info_new(int nwords) -{ - int log2_table_size; - Parse_info pi; - - pi = (Parse_info) xalloc(sizeof(struct Parse_info_struct)); - memset(pi, 0, sizeof(struct Parse_info_struct)); - pi->N_words = nwords; - - /* Alloc the x_table */ - if (nwords >= 10) { - log2_table_size = 14; - } else if (nwords >= 4) { - log2_table_size = nwords; - } else { - log2_table_size = 4; - } - pi->log2_x_table_size = log2_table_size; - pi->x_table_size = (1 << log2_table_size); - - /*printf("Allocating x_table of size %d\n", x_table_size);*/ - pi->x_table = (X_table_connector**) xalloc(pi->x_table_size * sizeof(X_table_connector*)); - memset(pi->x_table, 0, pi->x_table_size * sizeof(X_table_connector*)); - - return pi; -} - -/** - * This is the function that should be used to free the set structure. Since - * it's a dag, a recursive free function won't work. Every time we create - * a set element, we put it in the hash table, so this is OK. - */ -void free_parse_info(Parse_info pi) -{ - unsigned int i; - X_table_connector *t, *x; - if (!pi) return; - - for (i=0; ix_table_size; i++) - { - for (t = pi->x_table[i]; t!= NULL; t=x) - { - x = t->next; - free_set(&t->set); - xfree((void *) t, sizeof(X_table_connector)); - } - } - pi->parse_set = NULL; - - /*printf("Freeing x_table of size %d\n", x_table_size);*/ - xfree((void *) pi->x_table, pi->x_table_size * sizeof(X_table_connector*)); - pi->x_table_size = 0; - pi->x_table = NULL; - - xfree((void *) pi, sizeof(struct Parse_info_struct)); -} - -/** - * Returns the pointer to this info, NULL if not there. 
- */ -static X_table_connector * x_table_pointer(int lw, int rw, - Connector *le, Connector *re, - unsigned int null_count, Parse_info pi) -{ - X_table_connector *t; - t = pi->x_table[pair_hash(pi->x_table_size, lw, rw, le, re, null_count)]; - for (; t != NULL; t = t->next) { - if ((t->set.lw == lw) && (t->set.rw == rw) && - (t->set.le == le) && (t->set.re == re) && - (t->set.null_count == null_count)) return t; - } - return NULL; -} - -/** - * Stores the value in the x_table. Assumes it's not already there. - */ -static X_table_connector * x_table_store(int lw, int rw, - Connector *le, Connector *re, - unsigned int null_count, Parse_info pi) -{ - X_table_connector *t, *n; - unsigned int h; - - n = (X_table_connector *) xalloc(sizeof(X_table_connector)); - n->set.lw = lw; - n->set.rw = rw; - n->set.null_count = null_count; - n->set.le = le; - n->set.re = re; - n->set.count = 0; - n->set.first = NULL; - n->set.tail = NULL; - - h = pair_hash(pi->x_table_size, lw, rw, le, re, null_count); - t = pi->x_table[h]; - n->next = t; - pi->x_table[h] = n; - return n; -} - -/** Create a bogus parse set that only holds lw, rw. */ -static Parse_set* dummy_set(int lw, int rw, - unsigned int null_count, Parse_info pi) -{ - X_table_connector *dummy; - dummy = x_table_pointer(lw, rw, NULL, NULL, null_count, pi); - if (dummy) return &dummy->set; - - dummy = x_table_store(lw, rw, NULL, NULL, null_count, pi); - dummy->set.count = 1; - return &dummy->set; -} - -#ifdef FINISH_THIS_IDEA_MAYBE_LATER -static int cost_compare(const void *a, const void *b) -{ - const Match_node* const * ma = a; - const Match_node* const * mb = b; - if ((*ma)->d->cost < (*mb)->d->cost) return -1; - if ((*ma)->d->cost > (*mb)->d->cost) return 1; - return 0; -} - -/** - * Sort the matchlist into ascending disjunct cost. The goal here - * is to issue the lowest-cost disjuncts first, so that the parse - * set ends up quasi-sorted. 
This is not enough to get us a totally - * sorted parse set, but it does guarantee that at least the very - * first parse really will be the lowest cost. - */ -static Match_node* sort_matchlist(Match_node* mlist) -{ - Match_node* mx; - Match_node** marr; - size_t len = 1; - size_t i; - - for (mx = mlist; mx->next != NULL; mx = mx->next) len++; - if (1 == len) return mlist; - - /* Avoid blowing out the stack. Its hopeless. */ - if (100000 < len) return mlist; - - marr = alloca(len * sizeof(Match_node*)); - i = 0; - for (mx = mlist; mx != NULL; mx = mx->next) marr[i++] = mx; - - qsort((void *) marr, len, sizeof(Match_node*), cost_compare); - for (i=0; inext = marr[i+1]; - marr[len-1]->next = NULL; - return marr[0]; -} -#endif /* FINISH_THIS_IDEA_MAYBE_LATER */ - -/** - * returns NULL if there are no ways to parse, or returns a pointer - * to a set structure representing all the ways to parse. - * - * This code is similar to do_count() in count.c -- for a good reason: - * the do_count() function did a full parse, but didn't actually - * allocate an memory structures to hold the parse. This also does - * a full parse, but it also allocates and fills out the various - * parse structures. - */ -static -Parse_set * mk_parse_set(Sentence sent, fast_matcher_t *mchxt, - count_context_t * ctxt, - Disjunct *ld, Disjunct *rd, int lw, int rw, - Connector *le, Connector *re, unsigned int null_count, - bool islands_ok, Parse_info pi) -{ - int start_word, end_word, w; - X_table_connector *xt; - Count_bin * count; - - assert(null_count < 0x7fff, "mk_parse_set() called with null_count < 0."); - - count = table_lookup(ctxt, lw, rw, le, re, null_count); - - /* If there's no counter, then there's no way to parse. */ - if (NULL == count) return NULL; - if (hist_total(count) == 0) return NULL; - - xt = x_table_pointer(lw, rw, le, re, null_count, pi); - - /* Perhaps we've already computed it; if so, return it. 
*/ - if (xt != NULL) return &xt->set; - - /* Start it out with the empty set of parse choices. */ - /* This entry must be updated before we return. */ - xt = x_table_store(lw, rw, le, re, null_count, pi); - - /* The count we previously computed; its non-zero. */ - xt->set.count = hist_total(count); - -#define NUM_PARSES 4 - // xt->set.cost_cutoff = hist_cost_cutoff(count, NUM_PARSES); - // xt->set.cut_count = hist_cut_total(count, NUM_PARSES); - -#define RECOUNT(X) /* Make it disappear... */ - RECOUNT({xt->set.recount = 1;}) - - /* If the two words are next to each other, the count == 1 */ - if (lw + 1 == rw) return &xt->set; - - /* The left and right connectors are null, but the two words are - * NOT next to each-other. */ - if ((le == NULL) && (re == NULL)) - { - Parse_set* pset; - Parse_set* dummy; - Disjunct* dis; - - if (!islands_ok && (lw != -1)) return &xt->set; - if (null_count == 0) return &xt->set; - - RECOUNT({xt->set.recount = 0;}) - - w = lw + 1; - for (dis = sent->word[w].d; dis != NULL; dis = dis->next) - { - if (dis->left == NULL) - { - pset = mk_parse_set(sent, mchxt, ctxt, - dis, NULL, w, rw, dis->right, NULL, - null_count-1, islands_ok, pi); - if (pset == NULL) continue; - dummy = dummy_set(lw, w, null_count-1, pi); - record_choice(dummy, NULL, NULL, - pset, NULL, NULL, - NULL, NULL, NULL, &xt->set); - RECOUNT({xt->set.recount += pset->recount;}) - } - } - pset = mk_parse_set(sent, mchxt, ctxt, - NULL, NULL, w, rw, NULL, NULL, - null_count-1, islands_ok, pi); - if (pset != NULL) - { - dummy = dummy_set(lw, w, null_count-1, pi); - record_choice(dummy, NULL, NULL, - pset, NULL, NULL, - NULL, NULL, NULL, &xt->set); - RECOUNT({xt->set.recount += pset->recount;}) - } - return &xt->set; - } - - if (le == NULL) - { - start_word = lw + 1; - } - else - { - start_word = le->nearest_word; - } - - if (re == NULL) - { - end_word = rw; - } - else - { - end_word = re->nearest_word + 1; - } - - /* This condition can never be true here. 
It is included so GCC - * will be able to optimize the loop over "null_count". Without - * this check, GCC thinks this loop may be an infinite loop and - * it may omit some optimizations. */ - if (UINT_MAX == null_count) return NULL; - - RECOUNT({xt->set.recount = 0;}) - for (w = start_word; w < end_word; w++) - { - size_t mlb, mle; - mle = mlb = form_match_list(mchxt, w, le, lw, re, rw); - // if (mlist) mlist = sort_matchlist(mlist); - for (; get_match_list_element(mchxt, mle) != NULL; mle++) - { - unsigned int lnull_count, rnull_count; - Disjunct *d = get_match_list_element(mchxt, mle); - bool Lmatch = d->match_left; - bool Rmatch = d->match_right; - - for (lnull_count = 0; lnull_count <= null_count; lnull_count++) - { - int i, j; - Parse_set *ls[4], *rs[4]; - - /* Here, lnull_count and rnull_count are the null_counts - * we're assigning to those parts respectively. */ - rnull_count = null_count - lnull_count; - - /* Now, we determine if (based on table only) we can see that - the current range is not parsable. 
*/ - - for (i=0; i<4; i++) { ls[i] = rs[i] = NULL; } - if (Lmatch) - { - ls[0] = mk_parse_set(sent, mchxt, ctxt, - ld, d, lw, w, le->next, d->left->next, - lnull_count, islands_ok, pi); - - if (le->multi) - ls[1] = mk_parse_set(sent, mchxt, ctxt, - ld, d, lw, w, le, d->left->next, - lnull_count, islands_ok, pi); - - if (d->left->multi) - ls[2] = mk_parse_set(sent, mchxt, ctxt, - ld, d, lw, w, le->next, d->left, - lnull_count, islands_ok, pi); - - if (le->multi && d->left->multi) - ls[3] = mk_parse_set(sent, mchxt, ctxt, - ld, d, lw, w, le, d->left, - lnull_count, islands_ok, pi); - } - - if (Rmatch) - { - rs[0] = mk_parse_set(sent, mchxt, ctxt, - d, rd, w, rw, d->right->next, re->next, - rnull_count, islands_ok, pi); - - if (d->right->multi) - rs[1] = mk_parse_set(sent, mchxt, ctxt, - d, rd, w, rw, d->right, re->next, - rnull_count, islands_ok, pi); - - if (re->multi) - rs[2] = mk_parse_set(sent, mchxt, ctxt, - d, rd, w, rw, d->right->next, re, - rnull_count, islands_ok, pi); - - if (d->right->multi && re->multi) - rs[3] = mk_parse_set(sent, mchxt, ctxt, - d, rd, w, rw, d->right, re, - rnull_count, islands_ok, pi); - } - - for (i=0; i<4; i++) - { - /* This ordering is probably not consistent with that - * needed to use list_links. (??) 
*/ - if (ls[i] == NULL) continue; - for (j=0; j<4; j++) - { - if (rs[j] == NULL) continue; - record_choice(ls[i], le, d->left, - rs[j], d->right, re, - ld, d, rd, &xt->set); - RECOUNT({xt->set.recount += ls[i]->recount * rs[j]->recount;}) - } - } - - if (ls[0] != NULL || ls[1] != NULL || ls[2] != NULL || ls[3] != NULL) - { - /* Evaluate using the left match, but not the right */ - Parse_set* rset = mk_parse_set(sent, mchxt, ctxt, - d, rd, w, rw, d->right, re, - rnull_count, islands_ok, pi); - if (rset != NULL) - { - for (i=0; i<4; i++) - { - if (ls[i] == NULL) continue; - /* this ordering is probably not consistent with - * that needed to use list_links */ - record_choice(ls[i], le, d->left, - rset, NULL /* d->right */, - re, /* the NULL indicates no link*/ - ld, d, rd, &xt->set); - RECOUNT({xt->set.recount += ls[i]->recount * rset->recount;}) - } - } - } - if ((le == NULL) && (rs[0] != NULL || - rs[1] != NULL || rs[2] != NULL || rs[3] != NULL)) - { - /* Evaluate using the right match, but not the left */ - Parse_set* lset = mk_parse_set(sent, mchxt, ctxt, - ld, d, lw, w, le, d->left, - lnull_count, islands_ok, pi); - - if (lset != NULL) - { - for (j=0; j<4; j++) - { - if (rs[j] == NULL) continue; - /* this ordering is probably not consistent with - * that needed to use list_links */ - record_choice(lset, NULL /* le */, - d->left, /* NULL indicates no link */ - rs[j], d->right, re, - ld, d, rd, &xt->set); - RECOUNT({xt->set.recount += lset->recount * rs[j]->recount;}) - } - } - } - } - } - pop_match_list(mchxt, mlb); - } - return &xt->set; -} - -/** - * Return TRUE if and only if an overflow in the number of parses - * occurred. Use a 64-bit int for counting. 
- */ -static bool set_node_overflowed(Parse_set *set) -{ - Parse_choice *pc; - s64 n = 0; - if (set == NULL || set->first == NULL) return false; - - for (pc = set->first; pc != NULL; pc = pc->next) - { - n += pc->set[0]->count * pc->set[1]->count; - if (PARSE_NUM_OVERFLOW < n) return true; - } - return false; -} - -static bool set_overflowed(Parse_info pi) -{ - unsigned int i; - - assert(pi->x_table != NULL, "called set_overflowed with x_table==NULL"); - for (i=0; ix_table_size; i++) - { - X_table_connector *t; - for (t = pi->x_table[i]; t != NULL; t = t->next) - { - if (set_node_overflowed(&t->set)) return true; - } - } - return false; -} - -/** - * This is the top level call that computes the whole parse_set. It - * points whole_set at the result. It creates the necessary hash - * table (x_table) which will be freed at the same time the - * whole_set is freed. - * - * This assumes that do_parse() has been run, and that the count_context - * is filled with the values thus computed. This function is structured - * much like do_parse(), which wraps the main workhorse do_count(). - * - * If the number of linkages gets huge, then the counts can overflow. - * We check if this has happened when verifying the parse set. - * This routine returns TRUE iff an overflow occurred. - */ - -bool build_parse_set(Sentence sent, fast_matcher_t *mchxt, - count_context_t *ctxt, - unsigned int null_count, Parse_Options opts) -{ - sent->parse_info->parse_set = - mk_parse_set(sent, mchxt, ctxt, - NULL, NULL, -1, sent->length, NULL, NULL, null_count+1, - opts->islands_ok, sent->parse_info); - - - return set_overflowed(sent->parse_info); -} - -// Cannot be static, also called by SAT-solver. 
-void check_link_size(Linkage lkg) -{ - if (lkg->lasz <= lkg->num_links) - { - lkg->lasz = 2 * lkg->lasz + 10; - lkg->link_array = realloc(lkg->link_array, lkg->lasz * sizeof(Link)); - } -} - -static void issue_link(Linkage lkg, Disjunct * ld, Disjunct * rd, Link * link) -{ - check_link_size(lkg); - lkg->link_array[lkg->num_links] = *link; - lkg->num_links++; - - lkg->chosen_disjuncts[link->lw] = ld; - lkg->chosen_disjuncts[link->rw] = rd; -} - -static void issue_links_for_choice(Linkage lkg, Parse_choice *pc) -{ - if (pc->link[0].lc != NULL) { /* there is a link to generate */ - issue_link(lkg, pc->ld, pc->md, &pc->link[0]); - } - if (pc->link[1].lc != NULL) { - issue_link(lkg, pc->md, pc->rd, &pc->link[1]); - } -} - -static void list_links(Linkage lkg, const Parse_set * set, int index) -{ - Parse_choice *pc; - s64 n; - - if (set == NULL || set->first == NULL) return; - for (pc = set->first; pc != NULL; pc = pc->next) { - n = pc->set[0]->count * pc->set[1]->count; - if (index < n) break; - index -= n; - } - assert(pc != NULL, "walked off the end in list_links"); - issue_links_for_choice(lkg, pc); - list_links(lkg, pc->set[0], index % pc->set[0]->count); - list_links(lkg, pc->set[1], index / pc->set[0]->count); -} - -static void list_random_links(Linkage lkg, Parse_info pi, const Parse_set * set) -{ - Parse_choice *pc; - int num_pc, new_index; - - if (set == NULL || set->first == NULL) return; - num_pc = 0; - for (pc = set->first; pc != NULL; pc = pc->next) { - num_pc++; - } - - new_index = rand_r(&pi->rand_state) % num_pc; - - num_pc = 0; - for (pc = set->first; pc != NULL; pc = pc->next) { - if (new_index == num_pc) break; - num_pc++; - } - - assert(pc != NULL, "Couldn't get a random parse choice"); - issue_links_for_choice(lkg, pc); - list_random_links(lkg, pi, pc->set[0]); - list_random_links(lkg, pi, pc->set[1]); -} - -/** - * Generate the list of all links of the index'th parsing of the - * sentence. 
For this to work, you must have already called parse, and - * already built the whole_set. - */ -void extract_links(Linkage lkg, Parse_info pi) -{ - int index = lkg->lifo.index; - if (index < 0) - { - bool repeatable = false; - if (0 == pi->rand_state) repeatable = true; - if (repeatable) pi->rand_state = index; - list_random_links(lkg, pi, pi->parse_set); - if (repeatable) pi->rand_state = 0; - } - else { - list_links(lkg, pi->parse_set, index); - } -} diff -Nru link-grammar-5.3.16/link-grammar/extract-links.h link-grammar-5.5.0/link-grammar/extract-links.h --- link-grammar-5.3.16/link-grammar/extract-links.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/extract-links.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,23 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this softwares. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _EXTRACT_LINKS_H -#define _EXTRACT_LINKS_H - -#include "api-structures.h" -#include "link-includes.h" - -Parse_info parse_info_new(int nwords); -void free_parse_info(Parse_info); -bool build_parse_set(Sentence, fast_matcher_t*, count_context_t*, unsigned int null_count, Parse_Options); -void extract_links(Linkage, Parse_info); -#endif /* _EXTRACT_LINKS_H */ diff -Nru link-grammar-5.3.16/link-grammar/fast-match.c link-grammar-5.5.0/link-grammar/fast-match.c --- link-grammar-5.3.16/link-grammar/fast-match.c 2017-02-10 20:16:17.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/fast-match.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,647 +0,0 @@ -/**************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2014 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/**************************************************************************/ - -#include "api-structures.h" -#include "externs.h" -#include "fast-match.h" -#include "string-set.h" -#include "wordgraph.h" -#include "word-utils.h" - -/** - * The entire goal of this file is provide a fast lookup of all of the - * disjuncts on a given word that might be able to connect to a given - * connector on the left or the right. The main entry point is - * form_match_list(), which performs this lookup. - * - * The lookup is fast, because it uses a precomputed lookup table to - * find the match candidates. 
The lookup table is stocked by looking - * at all disjuncts on all words, and sorting them into bins organized - * by connectors they could potentially connect to. The lookup table - * is created by calling the alloc_fast_matcher() function. - * - * free_fast_matcher() is used to free the matcher. - * form_match_list() manages its memory as a "stack" - match-lists are - * pushed on this stack. The said stack size gets over 2048 entries only - * for long and/or complex sentences. - * pop_match_list() releases the memory that form_match_list() returned - * by unwinding this stack. - */ - -#define MATCH_LIST_SIZE_INIT 4096 /* the initial size of the match-list stack */ -#define MATCH_LIST_SIZE_INC 2 /* match-list stack increase size factor */ - -/** - * Returns the number of disjuncts in the list that have non-null - * left connector lists. - */ -static int left_disjunct_list_length(const Disjunct * d) -{ - int i; - for (i=0; d!=NULL; d=d->next) { - if (d->left != NULL) i++; - } - return i; -} - -static int right_disjunct_list_length(const Disjunct * d) -{ - int i; - for (i=0; d!=NULL; d=d->next) { - if (d->right != NULL) i++; - } - return i; -} - -/** - * Push a match-list element into the match-list array. 
- */ -static void push_match_list_element(fast_matcher_t *ctxt, Disjunct *d) -{ - if (ctxt->match_list_end >= ctxt->match_list_size) - { - ctxt->match_list_size *= MATCH_LIST_SIZE_INC; - ctxt->match_list = realloc(ctxt->match_list, - ctxt->match_list_size * sizeof(*ctxt->match_list)); - } - - ctxt->match_list[ctxt->match_list_end++] = d; -} - -static void free_match_list(Match_node * t) -{ - Match_node *xt; - for (; t!=NULL; t=xt) { - xt = t->next; - xfree((char *)t, sizeof(Match_node)); - } -} - -/** - * Free all of the hash tables and Match_nodes - */ -void free_fast_matcher(fast_matcher_t *mchxt) -{ - size_t w; - unsigned int i; - - if (NULL == mchxt) return; - for (w = 0; w < mchxt->size; w++) - { - for (i = 0; i < mchxt->l_table_size[w]; i++) - { - free_match_list(mchxt->l_table[w][i]); - } - xfree((char *)mchxt->l_table[w], mchxt->l_table_size[w] * sizeof (Match_node *)); - for (i = 0; i < mchxt->r_table_size[w]; i++) - { - free_match_list(mchxt->r_table[w][i]); - } - xfree((char *)mchxt->r_table[w], mchxt->r_table_size[w] * sizeof (Match_node *)); - } - - free(mchxt->match_list); - lgdebug(6, "Sentence size %zu, match_list_size %zu\n", - mchxt->size, mchxt->match_list_size); - - xfree(mchxt->l_table_size, mchxt->size * sizeof(unsigned int)); - xfree(mchxt->l_table, mchxt->size * sizeof(Match_node **)); - xfree(mchxt, sizeof(fast_matcher_t)); -} - -/** - * Adds the match node m to the sorted list of match nodes l. - * The parameter dir determines the order of the sorting to be used. - * Makes the list sorted from smallest to largest. 
- */ -static Match_node * add_to_right_table_list(Match_node * m, Match_node * l) -{ - Match_node *p, *prev; - - if (l == NULL) return m; - - /* Insert m at head of list */ - if ((m->d->right->nearest_word) <= (l->d->right->nearest_word)) - { - m->next = l; - return m; - } - - /* Walk list to insertion point */ - prev = l; - p = prev->next; - while (p != NULL && ((m->d->right->nearest_word) > (p->d->right->nearest_word))) - { - prev = p; - p = p->next; - } - - m->next = p; - prev->next = m; - - return l; /* return pointer to original head */ -} - -/** - * Adds the match node m to the sorted list of match nodes l. - * The parameter dir determines the order of the sorting to be used. - * Makes the list sorted from largest to smallest - */ -static Match_node * add_to_left_table_list(Match_node * m, Match_node * l) -{ - Match_node *p, *prev; - - if (l == NULL) return m; - - /* Insert m at head of list */ - if ((m->d->left->nearest_word) >= (l->d->left->nearest_word)) - { - m->next = l; - return m; - } - - /* Walk list to insertion point */ - prev = l; - p = prev->next; - while (p != NULL && ((m->d->left->nearest_word) < (p->d->left->nearest_word))) - { - prev = p; - p = p->next; - } - - m->next = p; - prev->next = m; - - return l; /* return pointer to original head */ -} - -/** - * Compare only the uppercase part of two connectors. - * Return true if they are the same, else false. - * FIXME: Use connector enumeration. 
- */ -static bool con_uc_eq(const Connector *c1, const Connector *c2) -{ - if (string_set_cmp(c1->string, c2->string)) return true; - if (c1->hash != c2->hash) return false; - if (c1->uc_length != c2->uc_length) return false; - - /* We arrive here for less than 50% of the cases for "en" and - * less then 20% of the cases for "ru", and, in practice, the - * two strings are always equal, because there is almost never - * a hash collision that would lead to a miscompare, because - * we are hashing, at most, a few dozen connectors into a - * 16-bit hash space (65536 slots). - */ - const char *uc1 = &c1->string[c1->uc_start]; - const char *uc2 = &c2->string[c2->uc_start]; - if (0 == strncmp(uc1, uc2, c1->uc_length)) return true; - - return false; -} - -static Match_node **get_match_table_entry(unsigned int size, Match_node **t, - Connector * c, int dir) -{ - unsigned int h, s; - - s = h = connector_hash(c) & (size-1); - - if (dir == 1) { - while (NULL != t[h]) - { - if (con_uc_eq(t[h]->d->right, c)) break; - - /* Increment and try again. Every hash bucket MUST have - * a unique upper-case part, since later on, we only - * compare the lower-case parts, assuming upper-case - * parts are already equal. So just look for the next - * unused hash bucket. - */ - h = (h + 1) & (size-1); - if (NULL == t[h]) break; - if (h == s) return NULL; - } - } - else - { - while (NULL != t[h]) - { - if (con_uc_eq(t[h]->d->left, c)) break; - h = (h + 1) & (size-1); - if (NULL == t[h]) break; - if (h == s) return NULL; - } - } - - return &t[h]; -} - -/** - * The disjunct d (whose left or right pointer points to c) is put - * into the appropriate hash table - * dir = 1, we're putting this into a right table. - * dir = -1, we're putting this into a left table. 
- */ -static void put_into_match_table(unsigned int size, Match_node ** t, - Disjunct * d, Connector * c, int dir ) -{ - Match_node *m, **xl; - - m = (Match_node *) xalloc (sizeof(Match_node)); - m->next = NULL; - m->d = d; - - xl = get_match_table_entry(size, t, c, dir); - assert(NULL != xl, "get_match_table_entry: Overflow"); - if (dir == 1) { - *xl = add_to_right_table_list(m, *xl); - } - else - { - *xl = add_to_left_table_list(m, *xl); - } -} - -fast_matcher_t* alloc_fast_matcher(const Sentence sent) -{ - unsigned int size; - size_t w; - int len; - Match_node ** t; - Disjunct * d; - fast_matcher_t *ctxt; - - ctxt = (fast_matcher_t *) xalloc(sizeof(fast_matcher_t)); - ctxt->size = sent->length; - ctxt->l_table_size = xalloc(2 * sent->length * sizeof(unsigned int)); - ctxt->r_table_size = ctxt->l_table_size + sent->length; - ctxt->l_table = xalloc(2 * sent->length * sizeof(Match_node **)); - ctxt->r_table = ctxt->l_table + sent->length; - memset(ctxt->l_table, 0, 2 * sent->length * sizeof(Match_node **)); - - ctxt->match_list_size = MATCH_LIST_SIZE_INIT; - ctxt->match_list = xalloc(ctxt->match_list_size * sizeof(*ctxt->match_list)); - ctxt->match_list_end = 0; - - for (w=0; wlength; w++) - { - len = left_disjunct_list_length(sent->word[w].d); - size = next_power_of_two_up(len); - ctxt->l_table_size[w] = size; - t = ctxt->l_table[w] = (Match_node **) xalloc(size * sizeof(Match_node *)); - memset(t, 0, size * sizeof(Match_node *)); - - for (d = sent->word[w].d; d != NULL; d = d->next) - { - if (d->left != NULL) - { - put_into_match_table(size, t, d, d->left, -1); - } - } - - len = right_disjunct_list_length(sent->word[w].d); - size = next_power_of_two_up(len); - ctxt->r_table_size[w] = size; - t = ctxt->r_table[w] = (Match_node **) xalloc(size * sizeof(Match_node *)); - memset(t, 0, size * sizeof(Match_node *)); - - for (d = sent->word[w].d; d != NULL; d = d->next) - { - if (d->right != NULL) - { - put_into_match_table(size, t, d, d->right, 1); - } - } - } - - 
return ctxt; -} - -#if 0 -/** - * Print statistics on various connector matching aspects. - * A summary can be found by the shell commands: - * link-parser < file.batch | grep match_stats: | sort | uniq -c - */ -static void match_stats(Connector *c1, Connector *c2) -{ - if (NULL == c1) printf("match_stats: cache\n"); - if (NULL == c2) return; - if ((1 == c1->uc_start) && (1 == c2->uc_start) && - (c1->string[0] == c2->string[0])) - { - printf("match_stats: h/d mismatch\n"); - } - - if (0 == c1->lc_start) printf("match_stats: no lc (c1)\n"); - if (0 == c2->lc_start) printf("match_stats: no lc (c2)\n"); - - if (string_set_cmp(c1->string, c2->string)) printf("match_stats: same\n"); - - const char *a = &c1->string[c1->lc_start]; - const char *b = &c2->string[c2->lc_start]; - do - { - if (*a != *b && (*a != '*') && (*b != '*')) printf("match_stats: lc false\n"); - a++; - b++; - } while (*a != '\0' && *b != '\0'); - printf("match_stats: lc true\n"); -} -#else -#define match_stats(a, b) -#endif - -#ifdef DEBUG -#undef N -#define N(c) (c?c->string:"") - -/** - * Print the match list, including connector match indications. - * Usage: link-parser -verbosity=9 -debug=print_match_list - * Output format: - * MATCH_NODE list_id: lw>lc [=] leftright [=] rcmatch_list[mlb]; - - for (; NULL != *m; m++) - { - Disjunct *d = *m; - - printf("MATCH_NODE %5d: %02d>%-9s %c %9s<%02d>%-9s %c %9s<%02d\n", - id, lw , N(lc), d->match_left ? '=': ' ', - N(d->left), w, N(d->right), - d->match_right? '=' : ' ', N(rc), rw); - } -} -#else -#define print_match_list(...) -#endif - -/** - * Compare only the lower-case parts of two connectors. When this - * function is called, it is assumed that the upper-case parts are - * equal, and thus do not need to be checked again. - * - * We know that the uc parts of the connectors are the same, - * because we fetch the matching lists according to the uc part or the - * connectors to be matched. So the uc parts are not checked here. 
The - * head/dependent indicators are in the caller function, and only when - * connectors match here, to save CPU when the connectors don't match - * otherwise. This is because h/d mismatch is rare. - * FIXME: Use connector enumeration. - */ -static bool match_lower_case(Connector *c1, Connector *c2) -{ - match_stats(c1, c2); - - /* If the connectors are identical, they match. */ - if (string_set_cmp(c1->string, c2->string)) return true; - - /* If any of the connectors doesn't have a lc part, they match */ - if ((0 == c2->lc_start) || (0 == c1->lc_start)) return true; - - /* Compare the lc parts according to the connector matching rules. */ - const char *a = &c1->string[c1->lc_start]; - const char *b = &c2->string[c2->lc_start]; - do - { - if (*a != *b && (*a != '*') && (*b != '*')) return false; - a++; - b++; - } while (*a != '\0' && *b != '\0'); - - return true; -} - -/** - * Return false if the connectors cannot match due to identical - * head/dependent parts. Else return true. - */ -static bool match_hd(Connector *c1, Connector *c2) -{ - if ((1 == c1->uc_start) && (1 == c2->uc_start) && - (c1->string[0] == c2->string[0])) - { - return false; - } - return true; -} - -typedef struct -{ - const char *string; - bool match; -} match_cache; - -/** - * Match the lower-case parts of connectors, and the head-dependent, - * using a cache of the most recent compare. Due to the way disjuncts - * are written, we are often asked to compare to the same connector - * 3 or 4 times in a row. So if we already did that compare, just use - * the cached result. (i.e. the caching here is almost trivial, but it - * works well). - */ -static bool do_match_with_cache(Connector *a, Connector *b, match_cache *c_con) -{ - /* The following uses a string-set compare - string_set_cmp() cannot - * be used here because c_con->string may be NULL. */ - match_stats(c_con->string == a->string ? 
NULL : a, NULL); -#ifdef HAVE_MAYBE_UNINITIALIZED -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif /* HAVE_MAYBE_UNINITIALIZED */ - /* The string field is initialized to NULL, and this is enough because - * the connector string cannot be NULL, as it actually fetched a - * non-empty match list. */ - if (c_con->string == a->string) return c_con->match; -#ifdef HAVE_MAYBE_UNINITIALIZED -#pragma GCC diagnostic pop -#endif /* HAVE_MAYBE_UNINITIALIZED */ - - /* No cache exists. Check if the connectors match and cache the result. */ - c_con->match = match_lower_case(a, b) && match_hd(a, b); - c_con->string = a->string; - - return c_con->match; -} - -typedef struct -{ - const Gword *gword; - bool same_alternative; -} gword_cache; - -/** - * Return true iff c1 and c2 are from the same alternative. - * An optimization for English checks if one of the connectors belong - * to an original sentence word (c2 is checked first for an inline - * optimization opportunity). - * If a wordgraph word of the checked connector is the same - * as of the previously checked one, use the cached result. - * (The first wordgraph word is used for cache validity indication, - * but there is only one most of the times anyway.) - */ -#define ALT_CONNECTION_POSSIBLE -#define OPTIMIZE_EN -static bool alt_connection_possible(Connector *c1, Connector *c2, - gword_cache *c_con) -{ -#ifdef ALT_CONNECTION_POSSIBLE - bool same_alternative = false; - -#ifdef OPTIMIZE_EN - /* Try a shortcut first. */ - if ((c2->originating_gword->o_gword->hier_depth == 0) || - (c1->originating_gword->o_gword->hier_depth == 0)) - { - return true; - } -#endif /* OPTIMIZE_EN */ - - if (c1->originating_gword->o_gword == c_con->gword) - return c_con->same_alternative; - - /* Each of the loops is of one iteration most of the times. 
*/ - for (const gword_set *ga = c1->originating_gword; NULL != ga; ga = ga->next) { - for (const gword_set *gb = c2->originating_gword; NULL != gb; gb = gb->next) { - if (in_same_alternative(ga->o_gword, gb->o_gword)) { - same_alternative = true; - break; - } - } - if (same_alternative) break; - } - - c_con->same_alternative = same_alternative; - c_con->gword = c1->originating_gword->o_gword; - - - return same_alternative; -#else - return true; -#endif /* ALT_CONNECTION_POSSIBLE */ -} - -/** - * Forms and returns a list of disjuncts coming from word w, that - * actually matches lc or rc or both. The lw and rw are the words from - * which lc and rc came respectively. - * - * The list is returned in a linked list of Match_nodes. This list - * contains no duplicates, because when processing the ml list, only - * elements whose match_left is true are included, and such elements are - * not included again when processing the mr list. - * - * Note that if both lc and rc match the corresponding connectors of w, - gc.word = NULL; - * match_left is set to true when the ml list is processed and the - * disjunct is then added to the result list, and match_right of the - * same disjunct is set to true when the mr list is processed, and this - * disjunct is not added again. - */ -size_t -form_match_list(fast_matcher_t *ctxt, int w, - Connector *lc, int lw, - Connector *rc, int rw) -{ - Match_node *mx, *mr_end, **mxp; - size_t front = ctxt->match_list_end; - Match_node *ml = NULL, *mr = NULL; - match_cache mc; - gword_cache gc; - - gc.same_alternative = NULL; - -#ifdef VERIFY_MATCH_LIST - static int id = 0; - int lid = ++id; /* A local copy, for multi-threading support. */ -#endif - - /* Get the lists of candidate matching disjuncts of word w for lc and - * rc. Consider each of these lists only if the length_limit of lc - * rc and also w, is not greater then the distance between their word - * and the word w. 
*/ - if ((lc != NULL) && ((w - lw) <= lc->length_limit)) - { - mxp = get_match_table_entry(ctxt->l_table_size[w], ctxt->l_table[w], lc, -1); - if (NULL != mxp) ml = *mxp; - } - if ((rc != NULL) && ((rw - w) <= rc->length_limit)) - { - mxp = get_match_table_entry(ctxt->r_table_size[w], ctxt->r_table[w], rc, 1); - if (NULL != mxp) mr = *mxp; - } - - for (mx = mr; mx != NULL; mx = mx->next) - { - if (mx->d->right->nearest_word > rw) break; - mx->d->match_left = false; - } - mr_end = mx; - - /* Construct the list of things that could match the left. */ - mc.string = NULL; - gc.gword = NULL; - for (mx = ml; mx != NULL; mx = mx->next) - { - if (mx->d->left->nearest_word < lw) break; - if ((w - lw) > mx->d->left->length_limit) continue; - - mx->d->match_left = do_match_with_cache(mx->d->left, lc, &mc) && - alt_connection_possible(mx->d->left, lc, &gc); - if (!mx->d->match_left) continue; - mx->d->match_right = false; - -#ifdef VERIFY_MATCH_LIST - mx->d->match_id = lid; -#endif - push_match_list_element(ctxt, mx->d); - } - - /* Append the list of things that could match the right. - * Note that it is important to set here match_right correctly even - * if we are going to skip this element here because its match_left - * is true, since then it means it is already included in the match - * list. 
*/ - mc.string = NULL; - gc.gword = NULL; - for (mx = mr; mx != mr_end; mx = mx->next) - { - if ((rw - w) > mx->d->right->length_limit) continue; - - mx->d->match_right = do_match_with_cache(mx->d->right, rc, &mc) && - alt_connection_possible(mx->d->right, rc, &gc); - if (!mx->d->match_right || mx->d->match_left) continue; - -#ifdef VERIFY_MATCH_LIST - mx->d->match_id = lid; -#endif - push_match_list_element(ctxt, mx->d); - } - - push_match_list_element(ctxt, NULL); - print_match_list(ctxt, lid, front, w, lc, lw, rc, rw); - return front; -} diff -Nru link-grammar-5.3.16/link-grammar/fast-match.h link-grammar-5.5.0/link-grammar/fast-match.h --- link-grammar-5.3.16/link-grammar/fast-match.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/fast-match.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,57 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _FAST_MATCH_H_ -#define _FAST_MATCH_H_ - -#include "link-includes.h" -#include "structures.h" - -struct fast_matcher_s -{ - size_t size; - unsigned int *l_table_size; /* the sizes of the hash tables */ - unsigned int *r_table_size; - - /* the beginnings of the hash tables */ - Match_node *** l_table; - Match_node *** r_table; - - /* I'll pedantically maintain my own array of these cells */ - Disjunct ** match_list; /* match-list stack */ - size_t match_list_end; /* index to the match-list stack end */ - size_t match_list_size; /* number of allocated elements */ -}; - -/* See the source file for documentation. */ -fast_matcher_t* alloc_fast_matcher(const Sentence); -void free_fast_matcher(fast_matcher_t*); - -size_t form_match_list(fast_matcher_t *, int, Connector *, int, Connector *, int); - -/** - * Return the match-list element at the given index. - */ -static inline Disjunct *get_match_list_element(fast_matcher_t *ctxt, size_t mli) -{ - return ctxt->match_list[mli]; -} - -/** - * Pop up the match-list stack - */ -static inline void pop_match_list(fast_matcher_t *ctxt, size_t match_list_last) -{ - ctxt->match_list_end = match_list_last; -} - -#endif diff -Nru link-grammar-5.3.16/link-grammar/histogram.c link-grammar-5.5.0/link-grammar/histogram.c --- link-grammar-5.3.16/link-grammar/histogram.c 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/histogram.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,181 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2015 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. 
*/ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include -#include "histogram.h" - -#ifdef PERFORM_COUNT_HISTOGRAMMING -/* A histogram distribution of the parse counts. */ - -Count_bin hist_zero(void) -{ - static Count_bin zero - = {0, 0, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, 0}; - return zero; -} - -Count_bin hist_one(void) -{ - static Count_bin one - = {0, 1, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, 0}; - return one; -} - -#define BIN_WIDTH 0.334 - -/** - * Accumulate counts in 'a', adding them to sum. - * The histogram is shifted by the amount 'cost'. - * That is, the bins are shifted over by the interger part of the cost - * (scaled to the bin-width). - */ -void hist_accum(Count_bin* sum, double cost, const Count_bin* a) -{ - unsigned int i; - unsigned int start; - - // Skip, if nothing to accumulate. - if (0 == a->total) return; - sum->total += a->total; - - // The cost tells us how much to offset the histogram in a, - // before accumulating it. 'base' is the bin number of the first - // non-empty bin. - start = (unsigned int) floor (cost / BIN_WIDTH); - if (0 == sum->bin[0]) - { - sum->base = start; - start = 0; - } - - for (i = start; i < NUM_BINS; i++) - { - sum->bin[i] += a->bin[i-start]; - } - for (i = NUM_BINS-start; i < NUM_BINS; i++) - { - sum->overrun += a->bin[i]; - } - sum->overrun += a->overrun; -} - -/** Same as above */ -void hist_accumv(Count_bin* sum, double cost, const Count_bin a) -{ - hist_accum(sum, cost, &a); -} - -/** - * Create a product of two histogrammed counts. - * Observe that doing so requires a kind-of cross-product to - * be performed, thus, a nested double sum. - */ -void hist_prod(Count_bin* prod, const Count_bin* a, const Count_bin* b) -{ - unsigned int i, k; - - // Skip, if the product is zero. 
- if (0 == a->total || 0 == b->total) return; - prod->total = a->total * b->total; - -// #define SLOW_BUT_SIMPLE 1 -#ifdef SLOW_BUT_SIMPLE - /* The below implements the straight-forward concept of the product. - * Its not quite optimal, because the intialization loop, and the - * if check can be eliminated by re-writing j = k-i. - */ - for (i = 0; i < NUM_BINS; i++) prod->bin[i] = 0; - prod->overrun = 0; - for (i = 0; i < NUM_BINS; i++) - { - for (j = 0; j < NUM_BINS; j++) - { - if (i+j < NUM_BINS) - prod->bin[i+j] += a->bin[i] * b->bin[j]; - else - prod->overrun += a->bin[i] * b->bin[j]; - } - - prod->overrun += a->bin[i] * b->overrun; - prod->overrun += a->overrun * b->bin[i]; - } - prod->overrun += a->overrun * b->overrun; -#else - /* The below does exactly the same thing as the above, but - * ever so slightly more quickly. Some pointless checks get - * eliminated. - */ - prod->overrun = 0; - for (k = 0; k < NUM_BINS; k++) - { - prod->bin[k] = 0; - for (i = 0; i <= k; i++) - { - prod->bin[k] += a->bin[i] * b->bin[k-i]; - } - prod->overrun += a->bin[k] * b->overrun; - prod->overrun += a->overrun * b->bin[k]; - } - for (k = NUM_BINS; k < 2 * NUM_BINS - 1; k++) - { - for (i = k - NUM_BINS + 1; i < NUM_BINS; i++) - { - prod->overrun += a->bin[i] * b->bin[k-i]; - } - } - prod->overrun += a->overrun * b->overrun; -#endif -} - -/** - * Multiply two historams 'a' and 'b', and accumulate them into 'acc'. - * The accumulated historgram is first shifted by 'cost'. 
- */ -void hist_muladd(Count_bin* acc, const Count_bin* a, double cost, const Count_bin* b) -{ - Count_bin tmp = hist_zero(); - hist_prod(&tmp, a, b); - hist_accum(acc, cost, &tmp); -} - -void hist_muladdv(Count_bin* acc, const Count_bin* a, double cost, const Count_bin b) -{ - hist_muladd(acc, a, cost, &b); -} - -double hist_cost_cutoff(Count_bin* hist, int count) -{ - int i; - s64 cnt = 0; - - for (i=0; ibin[i]; - if (count <= cnt) - return ((double) i + hist->base) * BIN_WIDTH; - } - return 1.0e38; -} - -s64 hist_cut_total(Count_bin* hist, int min_total) -{ - int i; - s64 cnt = 0; - - for (i=0; ibin[i]; - if (min_total <= cnt) return cnt; - } - return hist->total; -} - -#endif /* PERFORM_COUNT_HISTOGRAMMING */ diff -Nru link-grammar-5.3.16/link-grammar/histogram.h link-grammar-5.5.0/link-grammar/histogram.h --- link-grammar-5.3.16/link-grammar/histogram.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/histogram.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,94 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2015 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _HISTOGRAM_H_ -#define _HISTOGRAM_H_ - -#ifndef _MSC_VER -typedef long long s64; /* signed 64-bit integer, even on 32-bit cpus */ -#define PARSE_NUM_OVERFLOW (1LL<<24) -#else -/* Microsoft Visual C Version 6 doesn't support long long. 
*/ -typedef signed __int64 s64; /* signed 64-bit integer, even on 32-bit cpus */ -#define PARSE_NUM_OVERFLOW (((s64)1)<<24) -#endif - - -/* - * Count Histogramming is currently not required for anything, and the - * code runs about 6% faster when it is disabled. - * -#define PERFORM_COUNT_HISTOGRAMMING 1 - */ -#ifdef PERFORM_COUNT_HISTOGRAMMING - -/** - * A histogram distribution of the parse counts. - * The histogram is with respect to the cost of the parse. Thus, each - * bin of the historgram contains a count of the number of parses - * acheivable with that cost. Rather than setting the baseline cost - * at zero, it is dynamically scaled, so that 'base' is the number of - * the first bin with a non-zero count in it. If there are counts that - * don't fit into the available bins, then they are accumulated into - * the overrun bin. It is always the case that - * total == sum_i bin[i] + overrun - */ -#define NUM_BINS 12 -struct Count_bin_s -{ - short base; - s64 total; - s64 bin[NUM_BINS]; - s64 overrun; -}; - -typedef struct Count_bin_s Count_bin; - -Count_bin hist_zero(void); -Count_bin hist_one(void); - -void hist_accum(Count_bin* sum, double, const Count_bin*); -void hist_accumv(Count_bin* sum, double, const Count_bin); -void hist_prod(Count_bin* prod, const Count_bin*, const Count_bin*); -void hist_muladd(Count_bin* prod, const Count_bin*, double, const Count_bin*); -void hist_muladdv(Count_bin* prod, const Count_bin*, double, const Count_bin); - -static inline s64 hist_total(Count_bin* tot) { return tot->total; } -s64 hist_cut_total(Count_bin* tot, int min_total); - -double hist_cost_cutoff(Count_bin*, int count); - -#else - -typedef s64 Count_bin; - -static inline Count_bin hist_zero(void) { return 0; } -static inline Count_bin hist_one(void) { return 1; } - -static inline void hist_accum(Count_bin* sum, double cost, Count_bin* a) - { *sum += *a; } -static inline void hist_accumv(Count_bin* sum, double cost, Count_bin a) - { *sum += a; } -static inline void 
hist_prod(Count_bin* prod, Count_bin* a, Count_bin* b) - { *prod = (*a) * (*b); } -static inline void hist_muladd(Count_bin* prod, Count_bin* a, double cost, Count_bin* b) - { *prod += (*a) * (*b); } -static inline void hist_muladdv(Count_bin* prod, Count_bin* a, double cost, Count_bin b) - { *prod += (*a) * b; } - -static inline s64 hist_total(Count_bin* tot) { return *tot; } -static inline s64 hist_cut_total(Count_bin* tot, int min_total) { return *tot; } - -static inline double hist_cost_cutoff(Count_bin* tot, int count) { return 1.0e38; } - -#endif /* PERFORM_COUNT_HISTOGRAMMING */ - -#endif /* _HISTOGRAM_H_ */ diff -Nru link-grammar-5.3.16/link-grammar/idiom.c link-grammar-5.5.0/link-grammar/idiom.c --- link-grammar-5.3.16/link-grammar/idiom.c 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/idiom.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,362 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include "api-structures.h" -#include "api-types.h" -#include "dict-api.h" -#include "dict-common.h" -#include "error.h" -#include "idiom.h" -#include "string-set.h" -#include "structures.h" - -/** - * Find if a string signifies an idiom. - * Returns true if the string contains an underbar character. - * The check of s[0] prevents inclusion of "_". In that case no check for - * length=1 is done because it is not going to be a valid idiom anyway. 
- * - */ -bool contains_underbar(const char * s) -{ - if (s[0] == '_') return false; - while (*s != '\0') { - if (*s == '_') return true; - s++; - } - return false; -} - -/** - * Returns false if it is not a correctly formed idiom string. - * Such a string is correct if it: - * () contains no SUBSCRIPT_MARK - * () non-empty strings separated by _ - */ -static bool is_idiom_string(const char * s) -{ - size_t len; - const char * t; - - for (t = s; *t != '\0'; t++) - { - if (*t == SUBSCRIPT_MARK) return false; - } - - len = strlen(s); - if ((s[0] == '_') || (s[len-1] == '_')) - { - return false; - } - - for (t = s; *t != '\0'; t++) - { - if ((*t == '_') && (*(t+1) == '_')) return false; - } - return true; -} - -/** - * Return true if the string s is a sequence of digits. - */ -static bool is_number(const char *s) -{ - while(*s != '\0') { - if (!isdigit(*s)) return false; - s++; - } - return true; -} - -/** - * If the string contains a SUBSCIPT_MARK, and ends in ".Ix" where - * x is a number, return x. Return -1 if not of this form. - */ -static int numberfy(const char * s) -{ - for (; (*s != '\0') && (*s != SUBSCRIPT_MARK); s++) - ; - if (*s++ != SUBSCRIPT_MARK) return -1; - if (*s++ != 'I') return -1; - if (!is_number(s)) return -1; - return atoi(s); -} - -/** - * Look for words that end in ".Ix" where x is a number. - * Return the largest x found. - */ -static int max_postfix_found(Dict_node * d) -{ - int i, j; - i = 0; - while(d != NULL) { - j = numberfy(d->string); - if (j > i) i = j; - d = d->right; - } - return i; -} - -/** - * build_idiom_word_name() -- return idiomized name of given string. - * - * Allocates string space and returns a pointer to it. - * In this string is placed the idiomized name of the given string s. - * This is the same as s, but with a postfix of ".Ix", where x is an - * appropriate number. x is the minimum number that distinguishes - * this word from others in the dictionary. 
- */ -static const char * build_idiom_word_name(Dictionary dict, const char * s) -{ - char buff[2*MAX_WORD]; - size_t bufsz = 2*MAX_WORD; - char *x; - int count; - - Dict_node *dn = dictionary_lookup_list(dict, s); - count = max_postfix_found(dn) + 1; - free_lookup_list(dict, dn); - - x = buff; - while((*s != '\0') && (*s != SUBSCRIPT_MARK) && (0 < bufsz)) - { - *x = *s; - x++; - s++; - bufsz--; - } - snprintf(x, bufsz, "%cI%d", SUBSCRIPT_MARK, count); - - return string_set_add(buff, dict->string_set); -} - -/** - * Tear the idiom string apart. - * Put the parts into a list of Dict_nodes (connected by their right pointers) - * Sets the string fields of these Dict_nodes pointing to the - * fragments of the string s. Later these will be replaced by - * correct names (with .Ix suffixes). - * The list is reversed from the way they occur in the string. - * A pointer to this list is returned. - */ -static Dict_node * make_idiom_Dict_nodes(Dictionary dict, const char * string) -{ - Dict_node * dn, * dn_new; - char * t, *s, *p; - bool more; - unsigned int sz; - dn = NULL; - - sz = strlen(string)+1; - p = s = (char *) xalloc(sz); - strcpy(s, string); - - while (*s != '\0') { - t = s; - while ((*s != '\0') && (*s != '_')) s++; - if (*s == '_') { - more = true; - *s = '\0'; - } else { - more = false; - } - dn_new = (Dict_node *) xalloc(sizeof (Dict_node)); - dn_new->right = dn; - dn = dn_new; - dn->string = string_set_add(t, dict->string_set); - dn->file = NULL; - if (more) s++; - } - - xfree(p, sz); - return dn; -} - -static char current_name[] = "AAAAAAAA"; -#define CN_size (sizeof(current_name)-1) - -static void increment_current_name(void) -{ - int i = CN_size-1; - - do - { - current_name[i]++; - if (current_name[i] <= 'Z') return; - current_name[i] = 'A'; - } while (i-- > 0); - assert(0, "increment_current_name: Overflow"); -} - -/** - * Generate a new connector name obtained from the current_name. - * allocate string space for it. 
- * @return a pointer to connector name. - */ -static const char * generate_id_connector(Dictionary dict) -{ - char buff[2*MAX_WORD]; - unsigned int i; - char * t; - - for (i=0; current_name[i] == 'A'; i++) - ; - /* i is now the number of characters of current_name to skip */ - t = buff; - - /* All idiom connector names start with the two letters "ID" */ - *t++ = 'I'; - *t++ = 'D'; - for (; i < CN_size; i++ ) - { - *t++ = current_name[i] ; - } - *t++ = '\0'; - return string_set_add(buff, dict->string_set); -} - -/** - * Takes as input a pointer to a Dict_node. - * The string of this Dict_node is an idiom string. - * This string is torn apart, and its components are inserted into the - * dictionary as special idiom words (ending in .I*, where * is a number). - * The expression of this Dict_node (its node field) has already been - * read and constructed. This will be used to construct the special idiom - * expressions. - * The given dict node is freed. The string is also freed. - */ -void insert_idiom(Dictionary dict, Dict_node * dn) -{ - Exp * nc, * no, * n1; - E_list *ell, *elr; - const char * s; - Dict_node * dn_list, * xdn, * start_dn_list; - - no = dn->exp; - s = dn->string; - - if (!is_idiom_string(s)) - { - prt_error("Warning: Word \"%s\" on line %d " - "is not a correctly formed idiom string.\n" - "\tThis word will be ignored\n", - s, dict->line_number); - - xfree((char *)dn, sizeof (Dict_node)); - return; - } - - dn_list = start_dn_list = make_idiom_Dict_nodes(dict, s); - - xfree((char *)dn, sizeof (Dict_node)); - dn = NULL; - - assert(dn_list->right != NULL, "Idiom string with only one connector"); - - /* first make the nodes for the base word of the idiom (last word) */ - /* note that the last word of the idiom is first in our list */ - - /* ----- this code just sets up the node fields of the dn_list ----*/ - nc = Exp_create(&dict->exp_list); - nc->u.string = generate_id_connector(dict); - nc->dir = '-'; - nc->multi = false; - nc->type = CONNECTOR_type; - 
nc->cost = 0; - - n1 = Exp_create(&dict->exp_list); - n1->u.l = ell = (E_list *) xalloc(sizeof(E_list)); - ell->next = elr = (E_list *) xalloc(sizeof(E_list)); - elr->next = NULL; - ell->e = nc; - elr->e = no; - n1->type = AND_type; - n1->cost = 0; - - dn_list->exp = n1; - - dn_list = dn_list->right; - - while(dn_list->right != NULL) - { - /* generate the expression for a middle idiom word */ - - n1 = Exp_create(&dict->exp_list); - n1->u.string = NULL; - n1->type = AND_type; - n1->cost = 0; - n1->u.l = ell = (E_list *) xalloc(sizeof(E_list)); - ell->next = elr = (E_list *) xalloc(sizeof(E_list)); - elr->next = NULL; - - nc = Exp_create(&dict->exp_list); - nc->u.string = generate_id_connector(dict); - nc->dir = '+'; - nc->multi = false; - nc->type = CONNECTOR_type; - nc->cost = 0; - elr->e = nc; - - increment_current_name(); - - nc = Exp_create(&dict->exp_list); - nc->u.string = generate_id_connector(dict); - nc->dir = '-'; - nc->multi = false; - nc->type = CONNECTOR_type; - nc->cost = 0; - - ell->e = nc; - - dn_list->exp = n1; - - dn_list = dn_list->right; - } - /* now generate the last one */ - - nc = Exp_create(&dict->exp_list); - nc->u.string = generate_id_connector(dict); - nc->dir = '+'; - nc->multi = false; - nc->type = CONNECTOR_type; - nc->cost = 0; - - dn_list->exp = nc; - - increment_current_name(); - - /* ---- end of the code alluded to above ---- */ - - /* now its time to insert them into the dictionary */ - - dn_list = start_dn_list; - - while (dn_list != NULL) - { - xdn = dn_list->right; - dn_list->left = dn_list->right = NULL; - dn_list->string = build_idiom_word_name(dict, dn_list->string); - dict->root = insert_dict(dict, dict->root, dn_list); - dict->num_entries++; - dn_list = xdn; - } - /* xfree((char *)s, s_length+1); strings are handled by string_set */ -} - -/** - * returns true if this is a word ending in ".Ix", where x is a number. 
- */ -bool is_idiom_word(const char * s) -{ - return (numberfy(s) != -1) ; -} diff -Nru link-grammar-5.3.16/link-grammar/idiom.h link-grammar-5.5.0/link-grammar/idiom.h --- link-grammar-5.3.16/link-grammar/idiom.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/idiom.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -/********************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/********************************************************************************/ - -#include "dict-structures.h" -#include "link-includes.h" - -void insert_idiom(Dictionary dict, Dict_node *); -bool contains_underbar(const char *); -bool is_idiom_word(const char *); diff -Nru link-grammar-5.3.16/link-grammar/lg_assert.h link-grammar-5.5.0/link-grammar/lg_assert.h --- link-grammar-5.3.16/link-grammar/lg_assert.h 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/lg_assert.h 2018-01-29 21:20:45.000000000 +0000 @@ -14,15 +14,25 @@ #define FILELINE __FILE__ ":" STRINGIFY(__LINE__) #ifdef _WIN32 -#define DEBUG_TRAP (*((volatile int*) 0x0) = 42) + #define DEBUG_TRAP (*((volatile int*) 0x0) = 42) +#elif defined GNUC || defined __clang_analyzer__ + #define DEBUG_TRAP __builtin_trap() #else -#define DEBUG_TRAP __builtin_trap() + #define DEBUG_TRAP ((void(*)(void))0)() #endif +/* FIXME: + * 1. If the error_handler is not NULL, use prt_error() too (after calling + * fprintf()), in order to allow an error_handler to log, produce trace, or + * show the assert() message in a (possibly pop-up) window. + * 2. 
Don't use DEBUG_TRAP (or exit) directly, but instead call a function + * pointer like lg_exit(code) to allow the LG library to be embedded in an + * application like an editor. If not set, the default will still be + * DEBUG_TRAP. */ #define assert(ex, ...) { \ if (!(ex)) { \ - prt_error("Fatal error: \nAssertion (" #ex ") failed at " FILELINE ": " __VA_ARGS__); \ - prt_error("\n"); \ + fprintf(stderr, "Fatal error: \nAssertion (" #ex ") failed at " FILELINE ": " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ DEBUG_TRAP; /* leave stack trace in debugger */ \ } \ } diff -Nru link-grammar-5.3.16/link-grammar/linkage/analyze-linkage.c link-grammar-5.5.0/link-grammar/linkage/analyze-linkage.c --- link-grammar-5.3.16/link-grammar/linkage/analyze-linkage.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/linkage/analyze-linkage.c 2018-02-15 03:03:12.000000000 +0000 @@ -0,0 +1,86 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2012, 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include +#include + +#include "analyze-linkage.h" +#include "connectors.h" // Needed for connector_string +#include "linkage.h" +#include "string-set.h" + +/* Links are *always* less than 10 chars long. For now... */ +#define MAX_LINK_NAME_LENGTH 10 + +/** + * This returns a string that is the the GCD of the two given strings. + * If the GCD is equal to one of them, a pointer to it is returned. 
+ * Otherwise a new string for the GCD is put in the string set. + */ +static const char * intersect_strings(String_set *sset, const char * s, const char * t) +{ + int i, j, d; + const char *w, *s0; + char u0[MAX_LINK_NAME_LENGTH]; + char *u; + + /* The head indicator is lower-case h, the dependent indicator is + * lower-case d. If they are present, skip them. The intersection + * cannot include them. */ + if (islower((int) *s)) s++; + if (islower((int) *t)) t++; + + if (strcmp(s,t) == 0) return s; /* would work without this */ + i = strlen(s); + j = strlen(t); + if (j > i) { + w = s; s = t; t = w; + } + /* s is now the longer (at least not the shorter) string */ + u = u0; + d = 0; + s0 = s; + while (*t != '\0') { + if ((*s == *t) || (*t == '*')) { + *u = *s; + } else { + assert(*s == '*', "Invalid intersection!"); + d++; + *u = *t; + } + s++; t++; u++; + } + if (d == 0) { + return s0; + } else { + strcpy(u, s); /* get the remainder of s */ + return string_set_add(u0, sset); + } +} + +/** + * The name of the link is set to be the GCD of the names of + * its two endpoints. Must be called after each extract_links(), + * etc. since that call issues a brand-new set of links into + * parse_info. 
+ */ +void compute_link_names(Linkage lkg, String_set *sset) +{ + size_t i; + for (i = 0; i < lkg->num_links; i++) + { + lkg->link_array[i].link_name = intersect_strings(sset, + connector_string(lkg->link_array[i].lc), + connector_string(lkg->link_array[i].rc)); + } +} diff -Nru link-grammar-5.3.16/link-grammar/linkage/analyze-linkage.h link-grammar-5.5.0/link-grammar/linkage/analyze-linkage.h --- link-grammar-5.3.16/link-grammar/linkage/analyze-linkage.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/linkage/analyze-linkage.h 2017-05-29 08:07:13.000000000 +0000 @@ -0,0 +1,20 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#ifndef _ANALYZE_LINKAGE_H +#define _ANALYZE_LINKAGE_H + +#include "api-types.h" +#include "link-includes.h" + +void compute_link_names(Linkage, String_set *); +#endif /* _ANALYZE_LINKAGE_H */ diff -Nru link-grammar-5.3.16/link-grammar/linkage/freeli.c link-grammar-5.5.0/link-grammar/linkage/freeli.c --- link-grammar-5.3.16/link-grammar/linkage/freeli.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/linkage/freeli.c 2018-01-22 20:54:12.000000000 +0000 @@ -0,0 +1,87 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright 2008, 2009, 2013, 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include "api-structures.h" +#include "linkage.h" +#include "post-process/post-process.h" // for linkage_free_pp_info +#include "utilities.h" + +void free_linkage(Linkage linkage) +{ + exfree((void *) linkage->word, sizeof(const char *) * linkage->num_words); + exfree(linkage->chosen_disjuncts, linkage->num_words * sizeof(Disjunct *)); + free(linkage->link_array); + + /* Q: Why isn't this in a string set ?? A: Because there is no + * string-set handy when we compute this. 
*/ + if (linkage->disjunct_list_str) + { + size_t j; + for (j=0; jnum_words; j++) + { + if (linkage->disjunct_list_str[j]) + free(linkage->disjunct_list_str[j]); + } + free(linkage->disjunct_list_str); + } +#ifdef USE_CORPUS + lg_sense_delete(linkage); +#endif + + linkage_free_pp_domains(linkage); + + /* XXX FIXME */ + free(linkage->wg_path); + free(linkage->wg_path_display); +} + +void free_linkages(Sentence sent) +{ + size_t in; + Linkage lkgs = sent->lnkages; + if (!lkgs) return; + + for (in=0; innum_linkages_alloced; in++) + { + free_linkage(&lkgs[in]); + } + + free(lkgs); + sent->num_linkages_alloced = 0; + sent->num_linkages_found = 0; + sent->num_linkages_post_processed = 0; + sent->num_valid_linkages = 0; + sent->lnkages = NULL; +} + +/* Partial, but not full initialization of the linkage struct ... */ +void partial_init_linkage(Sentence sent, Linkage lkg, unsigned int N_words) +{ + lkg->num_links = 0; + lkg->lasz = 2 * N_words; + lkg->link_array = (Link *) malloc(lkg->lasz * sizeof(Link)); + memset(lkg->link_array, 0, lkg->lasz * sizeof(Link)); + + lkg->num_words = N_words; + lkg->cdsz = N_words; + lkg->chosen_disjuncts = (Disjunct **) exalloc(lkg->cdsz * sizeof(Disjunct *)); + memset(lkg->chosen_disjuncts, 0, N_words * sizeof(Disjunct *)); + + lkg->disjunct_list_str = NULL; +#ifdef USE_CORPUS + lkg->sense_list = NULL; +#endif + + lkg->pp_domains = NULL; + lkg->sent = sent; +} diff -Nru link-grammar-5.3.16/link-grammar/linkage/linkage.c link-grammar-5.5.0/link-grammar/linkage/linkage.c --- link-grammar-5.3.16/link-grammar/linkage/linkage.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/linkage/linkage.c 2018-04-25 17:04:50.000000000 +0000 @@ -0,0 +1,876 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright 2008, 2009, 2013, 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar 
parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include +#include + +#include "api-structures.h" +#include "connectors.h" +#include "dict-common/dict-affix.h" // for INFIX_MARK from dict. +#include "dict-common/dict-defines.h" // for SUBSCRIPT_MARK +#include "dict-common/idiom.h" +#include "disjunct-utils.h" +#include "link-includes.h" +#include "linkage.h" +#include "lisjuncts.h" +#include "sat-solver/sat-encoder.h" +#include "string-set.h" +#include "tokenize/wordgraph.h" +#include "tokenize/tok-structures.h" // XXX TODO provide gword access methods! +#include "tokenize/word-structures.h" // For Word_struct + +#define INFIX_MARK_L 1 /* INFIX_MARK is 1 character */ +#define STEM_MARK_L 1 /* stem mark is 1 character */ + +/* Marks around a null word. */ +#define NULLWORD_START '[' +#define NULLWORD_END ']' + +/** + * Append an unmarked (i.e. without INFIXMARK) morpheme to join_buff. + * join_buff is a zeroed-out buffer which has enough room for morpheme to be + * added + terminating NUL. + * Note that MT_PREFIX or MT_SUFFIX can be without an INFIX_MARK, in case + * INFIX_MARK is not defined. XXX: What about MT_MIDDLE? (not in use yet). + * + * FIXME Combining contracted words is not handled yet, because combining + * morphemes which have non-LL links to other words is not yet implemented. 
+ */ +static void add_morpheme_unmarked(Sentence sent, char *join_buff, + const char *wm, Morpheme_type mt) +{ + const char infix_mark = INFIX_MARK(sent->dict->affix_table); + const char *sm = strrchr(wm, SUBSCRIPT_MARK); + + if (NULL == sm) sm = (char *)wm + strlen(wm); + + if ((MT_PREFIX == mt) && (infix_mark == sm[-INFIX_MARK_L])) + strncat(join_buff, wm, sm-wm-INFIX_MARK_L); + else if ((MT_SUFFIX == mt) && (infix_mark == wm[0])) + strncat(join_buff, INFIX_MARK_L+wm, sm-wm-INFIX_MARK_L); + else if ((MT_MIDDLE == mt)) + strncat(join_buff, INFIX_MARK_L+wm, sm-wm-2*INFIX_MARK_L); + else + strncat(join_buff, wm, sm-wm); +} + +static const char *join_null_word(Sentence sent, Gword **wgp, size_t count) +{ + size_t i; + char *join_buff; + const char *s; + size_t join_len = 0; + + for (i = 0; i < count; i++) + join_len += strlen(wgp[i]->subword); + + join_buff = alloca(join_len+1); + memset(join_buff, '\0', join_len+1); + + for (i = 0; i < count; i++) + add_morpheme_unmarked(sent, join_buff, wgp[i]->subword, + wgp[i]->morpheme_type); + + s = string_set_add(join_buff, sent->string_set); + + return s; +} + +/** + * Add a null word node that represents two or more null morphemes. + * Used for "unifying" null morphemes that are part of a single subword, + * when only some of its morphemes (2 or more) don't have a linkage. + * The words "start" to "end" (including) are unified by the new node. 
+ */ +static Gword *wordgraph_null_join(Sentence sent, Gword **start, Gword **end) +{ + Gword *new_word; + Gword **w; + char *usubword; + size_t join_len = 0; + + for (w = start; w <= end; w++) join_len += strlen((*w)->subword); + usubword = calloc(join_len+1, 1); /* zeroed out */ + + for (w = start; w <= end; w++) + add_morpheme_unmarked(sent, usubword, (*w)->subword, (*w)->morpheme_type); + + new_word = gword_new(sent, usubword); + free(usubword); + new_word->status |= WS_PL; + new_word->label = "NJ"; + new_word->null_subwords = NULL; + new_word->start = (*start)->start; + new_word->end = (*end)->end; + + /* Link the null_subwords links of the added unifying node to the null + * subwords it unified. */ + for (w = start; w <= end; w++) + gwordlist_append(&new_word->null_subwords, (Gword *)(*w)); + /* Removing const qualifier, but gwordlist_append doesn't change w->... . */ + + return new_word; +} + +/** + * The functions defined in this file are primarily a part of the user API + * for working with linkages. + */ + +#define SUBSCRIPT_SEP SUBSCRIPT_DOT /* multiple-subscript separator */ + +#define PREFIX_SUPPRESS ("PL") /* prefix links start with this */ +#define PREFIX_SUPPRESS_L 2 /* length of above */ +#define SUFFIX_SUPPRESS ("LL") /* suffix links start with this */ +#define SUFFIX_SUPPRESS_L 2 /* length of above */ + +#define HIDE_MORPHO (!display_morphology) +/* TODO? !display_guess_marks is not implemented. */ +#define DISPLAY_GUESS_MARKS true // (opts->display_guess_marks) + +/* FIXME: Define an affix class MORPHOLOGY_LINKS. */ +static inline bool is_morphology_link(const char *link_name) +{ + if (NULL == link_name) return false; + return (0 == strncmp(link_name, SUFFIX_SUPPRESS, SUFFIX_SUPPRESS_L)) || + (0 == strncmp(link_name, PREFIX_SUPPRESS, PREFIX_SUPPRESS_L)); +} + +/* + * Remap the link array according to discarded links and words. + * + * The remap[] elements indicate the new WordIdx of the word. + * A value which is -1 indicates a discarded word. 
+ * A NULL link_name indicates a discarded link. + */ +static void remap_linkages(Linkage lkg, const int *remap) +{ + LinkIdx i, j; + + for (i = 0, j = 0; i < lkg->num_links; i++) + { + Link *old_lnk = &lkg->link_array[i]; + + if (NULL != old_lnk->link_name && /* discarded link */ + (-1 != remap[old_lnk->rw]) && (-1 != remap[old_lnk->lw])) + { + Link *new_lnk = &lkg->link_array[j]; + Connector *ctmp; + + /* Copy the entire link contents, thunking the word numbers. + * Note that j is always <= i so this is always safe. */ + + new_lnk->lw = remap[old_lnk->lw]; + new_lnk->rw = remap[old_lnk->rw]; + + ctmp = new_lnk->lc; + new_lnk->lc = old_lnk->lc; + old_lnk->lc = ctmp; + + ctmp = new_lnk->rc; + new_lnk->rc = old_lnk->rc; + old_lnk->rc = ctmp; + + new_lnk->link_name = old_lnk->link_name; + + j++; + } + } + + lkg->num_links = j; + /* Unused memory not freed - all of it will be freed in free_linkages(). */ +} + +/** + * Remove unlinked optional words from a linkage. + * XXX Should we remove here also the dict-cap tokens? In any case, for now they + * are left for debug. + */ +#define D_REE 7 +void remove_empty_words(Linkage lkg) +{ + size_t i, j; + Disjunct **cdj = lkg->chosen_disjuncts; + int *remap = alloca(lkg->num_words * sizeof(*remap)); + Gword **wgp = lkg->wg_path; + + for (i = 0, j = 0; i < lkg->num_words; i++) + { + /* Discard optional words that are not real null-words. Note that + * if optional words don't have non-optional words after them, + * wg_path doesn't include them, and hence *wgp is NULL then. */ + if ((NULL == *wgp) || ((*wgp)->sent_wordidx != i)) + { + assert((NULL == cdj[i]) && lkg->sent->word[i].optional); + remap[i] = -1; + continue; + } + + Disjunct *cdtmp = cdj[j]; + cdj[j] = cdj[i]; + cdj[i] = cdtmp; /* The SAT parser frees chosen_disjuncts elements. */ + remap[i] = j; + j++; + wgp++; + } + if (lkg->num_words != j) + { + /* Unused memory not freed - all of it will be freed in free_linkages(). 
*/ + lkg->num_words = j; + remap_linkages(lkg, remap); /* Update lkg->link_array and lkg->num_links. */ + } +} +#undef D_REE + +/** + * This takes the Wordgraph path array and uses it to + * compute the chosen_words array. "I.xx" suffixes are eliminated. + * + * chosen_words + * A pointer to an array of pointers to strings. These are the words to be + * displayed when printing the solution, the links, etc. Computed as a + * function of chosen_disjuncts[] by compute_chosen_words(). This differs + * from sentence.word[].alternatives because it contains the subscripts. It + * differs from chosen_disjunct[].string in that the idiom symbols have been + * removed. Furthermore, several chosen_disjuncts[].string elements may be + * combined into one chosen_words[] element if opts->display_morphology==0 or + * that they were linkage null-words that are morphemes of the same original + * word (i.e. subwords of an unsplit_word which are marked as morphemes). + * + * wg_path + * A pointer to a NULL-terminated array of pointers to Wordgraph words. + * It corresponds 1-1 to the chosen_disjuncts array in Linkage structure. + * A new one is constructed below to correspond 1-1 to chosen_words. + * + * FIXME Sometimes the word strings are taken from chosen_disjuncts, + * and sometimes from wordgraph subwords.
+ */ +#define D_CCW 8 +void compute_chosen_words(Sentence sent, Linkage linkage, Parse_Options opts) +{ + WordIdx i; /* index of chosen_words */ + WordIdx j; + Disjunct **cdjp = linkage->chosen_disjuncts; + const char **chosen_words = alloca(linkage->num_words * sizeof(*chosen_words)); + int *remap = alloca(linkage->num_words * sizeof(*remap)); + bool *show_word = alloca(linkage->num_words * sizeof(*show_word)); + bool display_morphology = opts->display_morphology; + + Gword **lwg_path = linkage->wg_path; + Gword **n_lwg_path = NULL; /* new Wordgraph path, to match chosen_words */ + + Gword **nullblock_start = NULL; /* start of a null block, to be put in [] */ + size_t nbsize = 0; /* number of word in a null block */ + Gword *sentence_word; + + memset(show_word, 0, linkage->num_words * sizeof(*show_word)); + + if (verbosity_level(D_CCW)) + print_lwg_path(lwg_path, "Linkage"); + + for (i = 0; i < linkage->num_words; i++) + { + Disjunct *cdj = cdjp[i]; + Gword *w; /* current word */ + const Gword *nw; /* next word (NULL if none) */ + Gword **wgp; /* wordgraph_path traversing pointer */ + + const char *t = NULL; /* current word string */ + bool at_nullblock_end; /* current word is at end of a nullblock */ + bool join_alt = false; /* morpheme-join this alternative */ + char *s; + size_t l; + size_t m; + + lgdebug(D_CCW, "Loop start, word%zu: cdj %s, path %s\n", + i, cdj ? cdj->word_string : "NULL", + lwg_path[i] ? lwg_path[i]->subword : "NULL"); + + w = lwg_path[i]; + nw = lwg_path[i+1]; + wgp = &lwg_path[i]; + sentence_word = wg_get_sentence_word(sent, w); + + /* FIXME If the original word was capitalized in a capitalizable + * position, the displayed null word may be its downcase version. 
*/ + + if (NULL == cdj) /* a null word (the chosen disjunct was NULL) */ + { + chosen_words[i] = NULL; + nbsize++; + if (NULL == nullblock_start) /* it starts a new null block */ + nullblock_start = wgp; + + at_nullblock_end = (NULL == nw) || + (wg_get_sentence_word(sent, nw->unsplit_word) != sentence_word); + + /* Accumulate null words in this alternative */ + if (!at_nullblock_end && (NULL == cdjp[i+1]) && + ((w->morpheme_type == MT_PUNC) == (nw->morpheme_type == MT_PUNC))) + { + lgdebug(D_CCW, "Skipping word%zu cdjp=NULL#%zu, path %s\n", + i, nbsize, w->subword); + chosen_words[i] = NULL; + continue; + } + + if (NULL != nullblock_start) + { + /* If we are here, this null word is an end of a null block */ + lgdebug(+D_CCW, "Handling %zu null words at %zu: ", nbsize, i); + + if (1 == nbsize) + { + /* Case 1: A single null subword. */ + lgdebug(D_CCW, "A single null subword.\n"); + t = join_null_word(sent, wgp, nbsize); + + gwordlist_append(&n_lwg_path, w); + } + else + { + lgdebug(D_CCW, "Combining null subwords"); + /* Use alternative_id to check for start of alternative. */ + if (((*nullblock_start)->alternative_id == *nullblock_start) + && at_nullblock_end) + { + /* Case 2: A null unsplit_word (all-nulls alternative).*/ + lgdebug(D_CCW, " (null alternative)\n"); + t = sentence_word->subword; + + gwordlist_append(&n_lwg_path, sentence_word); + } + else + { + /* Case 3: Join together >=2 null morphemes. */ + Gword *wgnull; + + lgdebug(D_CCW, " (null partial word)\n"); + wgnull = wordgraph_null_join(sent, wgp-nbsize+1, wgp); + gwordlist_append(&n_lwg_path, wgnull); + t = wgnull->subword; + } + } + + nullblock_start = NULL; + nbsize = 0; + show_word[i] = true; + + if (MT_WALL != w->morpheme_type) + { + /* Put brackets around the null word. 
*/ + l = strlen(t) + 2; + s = (char *) alloca(l+1); + s[0] = NULLWORD_START; + strcpy(&s[1], t); + s[l-1] = NULLWORD_END; + s[l] = '\0'; + t = string_set_add(s, sent->string_set); + lgdebug(D_CCW, " %s\n", t); + /* Null words have no links, so take care not to drop them. */ + } + } + } + else + { + /* This word has a linkage. */ + + /* TODO: Suppress "virtual-morphemes", currently the dictcap ones. */ + char *sm; + + t = cdj->word_string; + /* Print the subscript, as in "dog.n" as opposed to "dog". */ + + if (0) + { + /* TODO */ + } + else + { + /* Get rid of those ugly ".Ixx" */ + if (is_idiom_word(t)) + { + s = strdupa(t); + sm = strrchr(s, SUBSCRIPT_MARK); /* Possible double subscript. */ + UNREACHABLE(NULL == sm); /* We know it has a subscript. */ + *sm = '\0'; + t = string_set_add(s, sent->string_set); + } + else if (HIDE_MORPHO) + { + /* Concatenate the word morphemes together into one word. + * Concatenate their subscripts into one subscript. + * Use subscript separator SUBSCRIPT_SEP. + * XXX Check whether we can encounter an idiom word here. + * FIXME Combining contracted words is not handled yet, because + * combining morphemes which have non-LL links to other words is + * not yet implemented. + * FIXME Move to a separate function. */ + Gword **wgaltp; + size_t join_len = 0; + size_t mcnt = 0; + + /* If the alternative contains morpheme subwords, mark it + * for joining... */ + + const Gword *unsplit_word = w->unsplit_word; + for (wgaltp = wgp, j = i; NULL != *wgaltp; wgaltp++, j++) + { + + if ((*wgaltp)->unsplit_word != unsplit_word) break; + if (MT_INFRASTRUCTURE == + (*wgaltp)->unsplit_word->morpheme_type) break; + + mcnt++; + + if (NULL == cdjp[j]) + { + /* ... but not if it contains a null word */ + join_alt = false; + break; + } + join_len += strlen(cdjp[j]->word_string) + 1; + if ((*wgaltp)->morpheme_type & IS_REG_MORPHEME) + join_alt = true; + } + + if (join_alt) + { + /* Join it in two steps: 1. Base words. 2. Subscripts. + * FIXME? 
Can be done in one step (more efficient but maybe + * less clear). + * Put SUBSCRIPT_SEP between the subscripts. + * XXX No 1-1 correspondence between the hidden base words + * and the subscripts after the join, in case there are base + * words with and without subscripts. */ + + const char subscript_sep_str[] = { SUBSCRIPT_SEP, '\0'}; + char *join = calloc(join_len + 1, 1); /* zeroed out */ + + join[0] = '\0'; + + /* 1. Join base words. (Could just use the unsplit_word.) */ + for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++) + { + add_morpheme_unmarked(sent, join, cdjp[i+m]->word_string, + (*wgaltp)->morpheme_type); + } + + strcat(join, subscript_mark_str()); /* tentative */ + + /* 2. Join subscripts. */ + for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++) + { + /* Cannot NULLify the word - we may have links to it. */ + if (m != mcnt-1) chosen_words[i+m] = ""; + + sm = strchr(cdjp[i+m]->word_string, SUBSCRIPT_MARK); + + if (NULL != sm) + { + /* Supposing stem subscript is .=x (x optional) */ + if (MT_STEM == (*wgaltp)->morpheme_type) + { + sm += 1 + STEM_MARK_L; /* sm+strlen(".=") */ + if ('\0' == *sm) sm = NULL; +#if 0 + if ((cnt-1) == m) + { + /* Support a prefix-stem combination. In that case + * we have just nullified the combined word, so we + * need to move it to the position of the prefix. + * FIXME: May still not be good enough. */ + move_combined_word = i+m-1; + + /* And the later chosen_word assignment should be: + * chosen_words[-1 != move_combined_word ? 
+ * move_combined_word : i] = t; + */ + } + else + { + move_combined_word = -1; + } +#endif + } + } + if (NULL != sm) + { + strcat(join, sm+1); + strcat(join, subscript_sep_str); + } + } + + /* Remove an extra mark, if any */ + join_len = strlen(join); + if ((SUBSCRIPT_SEP == join[join_len-1]) || + (SUBSCRIPT_MARK == join[join_len-1])) + join[join_len-1] = '\0'; + + gwordlist_append(&n_lwg_path, sentence_word); + t = string_set_add(join, sent->string_set); + free(join); + + i += mcnt-1; + } + } + } + + if (!join_alt) gwordlist_append(&n_lwg_path, *wgp); + + /* + * Add guess marks in [] square brackets, if needed, at the + * end of the base word. Convert the badly-printing + * SUBSCRIPT_MARK (hex 03 or ^C) into a period. + */ + if (t) + { + + s = strdupa(t); + sm = strrchr(s, SUBSCRIPT_MARK); + if (sm) *sm = SUBSCRIPT_DOT; + + if ((!(w->status & WS_GUESS) && (w->status & WS_INDICT)) + || !DISPLAY_GUESS_MARKS) + { + t = string_set_add(s, sent->string_set); + } + else + { + const char *regex_name = w->regex_name; + /* 4 = 1(null) + 1(guess_mark) + 2 (sizeof "[]") */ + int baselen = NULL == sm ? strlen(t) : (size_t)(sm-s); + char guess_mark = 0; + + switch (w->status & WS_GUESS) + { + case WS_SPELL: + guess_mark = GM_SPELL; + break; + case WS_RUNON: + guess_mark = GM_RUNON; + break; + case WS_REGEX: + guess_mark = GM_REGEX; + break; + case 0: + guess_mark = GM_UNKNOWN; + break; + default: + assert(0, "Missing 'case: %2x'", w->status & WS_GUESS); + } + + /* In the case of display_morphology==0, the guess indication of + * the last subword is used as the guess indication of the whole + * word. + * FIXME? The guess indications of other subwords are ignored in + * this mode. This implies that if a first or middle subword has + * a guess indication but the last subword doesn't have, no guess + * indication would be shown at all. 
*/ + + if ((NULL == regex_name) || HIDE_MORPHO) regex_name = ""; + s = alloca(strlen(t) + strlen(regex_name) + 4); + strncpy(s, t, baselen); + s[baselen] = '['; + s[baselen + 1] = guess_mark; + strcpy(s + baselen + 2, regex_name); + strcat(s, "]"); + if (NULL != sm) strcat(s, sm); + t = string_set_add(s, sent->string_set); + } + } + } + + assert(t != NULL, "Word %zu: NULL", i); + chosen_words[i] = t; + } + + /* Conditional test removal of quotation marks and the "capdict" tokens, + * to facilitate using diff on sentence batch runs. */ + if (test_enabled("removeZZZ")) + { + for (i=0, j=0; inum_links; i++) + { + Link *lnk = &(linkage->link_array[i]); + + if (0 == strcmp("ZZZ", lnk->link_name)) + chosen_words[lnk->rw] = NULL; + } + } + + /* If morphology printing is being suppressed, then all links + * connecting morphemes will be discarded. */ + if (HIDE_MORPHO) + { + /* Discard morphology links. */ + for (i=0; inum_links; i++) + { + Link * lnk = &linkage->link_array[i]; + + if (is_morphology_link(lnk->link_name)) + { + /* Mark link for discarding. */ + lnk->link_name = NULL; + } + else + { + /* Mark word for not discarding. */ + show_word[lnk->rw] = true; + show_word[lnk->lw] = true; + } + } + } + + /* We alloc a little more than needed, but so what... */ + linkage->word = (const char **) exalloc(linkage->num_words*sizeof(char *)); + + /* Copy over the chosen words, dropping the discarded words. + * However, don't discard existing words (chosen_words[i][0]). + * Note that if a word only has morphology links and is not combined with + * another word, then it will get displayed with no links at all (e.g. 
+ * when explicitly specifying root and suffix for debug: root.= =suf */ + for (i=0, j=0; inum_words; ++i) + { + if (chosen_words[i] && + (chosen_words[i][0] || (!HIDE_MORPHO || show_word[i]))) + { + const char *cwtmp = linkage->word[j]; + linkage->word[j] = chosen_words[i]; + chosen_words[i] = cwtmp; + remap[i] = j; + j++; + } + else + { + remap[i] = -1; + } + } + linkage->num_words = j; + + remap_linkages(linkage, remap); /* Update linkage->link_array / num_links. */ + + linkage->wg_path_display = n_lwg_path; + + if (verbosity_level(D_CCW)) + print_lwg_path(n_lwg_path, "Display"); +} +#undef D_CCW + +Linkage linkage_create(LinkageIdx k, Sentence sent, Parse_Options opts) +{ + Linkage linkage; + + if (opts->use_sat_solver) + { + linkage = sat_create_linkage(k, sent, opts); + if (!linkage) return NULL; + } + else + { + /* Cannot create a Linkage for a discarded linkage. */ + if (sent->num_linkages_post_processed <= k) return NULL; + linkage = &sent->lnkages[k]; + } + + /* Perform remaining initialization we haven't done yet...*/ + compute_chosen_words(sent, linkage, opts); + + linkage->is_sent_long = (linkage->num_words >= opts->twopass_length); + + return linkage; +} + +void linkage_delete(Linkage linkage) +{ + /* Currently a no-op */ +} + +size_t linkage_get_num_words(const Linkage linkage) +{ + if (!linkage) return 0; + return linkage->num_words; +} + +size_t linkage_get_num_links(const Linkage linkage) +{ + if (!linkage) return 0; + return linkage->num_links; +} + +static inline bool verify_link_index(const Linkage linkage, LinkIdx index) +{ + if (!linkage) return false; + if (index >= linkage->num_links) return false; + return true; +} + +int linkage_get_link_length(const Linkage linkage, LinkIdx index) +{ + Link *link; + if (!verify_link_index(linkage, index)) return -1; + link = &(linkage->link_array[index]); + return link->rw - link->lw; +} + +WordIdx linkage_get_link_lword(const Linkage linkage, LinkIdx index) +{ + if (!verify_link_index(linkage, index)) 
return SIZE_MAX; + return linkage->link_array[index].lw; +} + +WordIdx linkage_get_link_rword(const Linkage linkage, LinkIdx index) +{ + if (!verify_link_index(linkage, index)) return SIZE_MAX; + return linkage->link_array[index].rw; +} + +const char * linkage_get_link_label(const Linkage linkage, LinkIdx index) +{ + if (!verify_link_index(linkage, index)) return NULL; + return linkage->link_array[index].link_name; +} + +const char * linkage_get_link_llabel(const Linkage linkage, LinkIdx index) +{ + if (!verify_link_index(linkage, index)) return NULL; + return connector_string(linkage->link_array[index].lc); +} + +const char * linkage_get_link_rlabel(const Linkage linkage, LinkIdx index) +{ + if (!verify_link_index(linkage, index)) return NULL; + return connector_string(linkage->link_array[index].rc); +} + +const char ** linkage_get_words(const Linkage linkage) +{ + return linkage->word; +} + +const char * linkage_get_disjunct_str(const Linkage linkage, WordIdx w) +{ + Disjunct *dj; + + if (NULL == linkage) return ""; + if (NULL == linkage->disjunct_list_str) + { + lg_compute_disjunct_strings(linkage); + } + + if (linkage->num_words <= w) return NULL; /* bounds-check */ + + /* dj will be null if the word wasn't used in the parse. */ + dj = linkage->chosen_disjuncts[w]; + if (NULL == dj) return ""; + + return linkage->disjunct_list_str[w]; +} + +double linkage_get_disjunct_cost(const Linkage linkage, WordIdx w) +{ + Disjunct *dj; + + if (linkage->num_words <= w) return 0.0; /* bounds-check */ + + dj = linkage->chosen_disjuncts[w]; + + /* dj may be null, if the word didn't participate in the parse. */ + if (dj) return dj->cost; + return 0.0; +} + +double linkage_get_disjunct_corpus_score(const Linkage linkage, WordIdx w) +{ + Disjunct *dj; + + if (linkage->num_words <= w) return 99.999; /* bounds-check */ + dj = linkage->chosen_disjuncts[w]; + + /* dj may be null, if the word didn't participate in the parse. 
*/ + if (NULL == dj) return 99.999; + + return lg_corpus_disjunct_score(linkage, w); +} + +const char * linkage_get_word(const Linkage linkage, WordIdx w) +{ + if (!linkage) return NULL; + if (linkage->num_words <= w) return NULL; /* bounds-check */ + return linkage->word[w]; +} + +int linkage_unused_word_cost(const Linkage linkage) +{ + /* The sat solver (currently) fails to fill in info */ + if (!linkage) return 0; + return linkage->lifo.unused_word_cost; +} + +double linkage_disjunct_cost(const Linkage linkage) +{ + /* The sat solver (currently) fails to fill in info */ + if (!linkage) return 0.0; + return linkage->lifo.disjunct_cost; +} + +int linkage_link_cost(const Linkage linkage) +{ + /* The sat solver (currently) fails to fill in info */ + if (!linkage) return 0; + return linkage->lifo.link_cost; +} + +double linkage_corpus_cost(const Linkage linkage) +{ + /* The sat solver (currently) fails to fill in info */ + if (!linkage) return 0.0; + return linkage->lifo.corpus_cost; +} + +/* =========== Get word sentence positions ============================== */ + +size_t linkage_get_word_byte_start(const Linkage linkage, WordIdx w) +{ + if (linkage->num_words <= w) return 0; /* bounds-check */ + return linkage->wg_path_display[w]->start - linkage->sent->orig_sentence; +} + +size_t linkage_get_word_byte_end(const Linkage linkage, WordIdx w) +{ + if (linkage->num_words <= w) return 0; /* bounds-check */ + return linkage->wg_path_display[w]->end - linkage->sent->orig_sentence; +} + +/* The character position is computed in a straightforward way, which may + * not be efficient if more than one position is needed. If needed, it can + * be changed to use caching of already-calculated positions. 
*/ + +size_t linkage_get_word_char_start(const Linkage linkage, WordIdx w) +{ + if (linkage->num_words <= w) return 0; /* bounds-check */ + int pos = linkage->wg_path_display[w]->start - linkage->sent->orig_sentence; + char *sentchunk = alloca(pos+1); + strncpy(sentchunk, linkage->sent->orig_sentence, pos); + sentchunk[pos] = '\0'; + return utf8_strlen(sentchunk); +} + +size_t linkage_get_word_char_end(const Linkage linkage, WordIdx w) +{ + if (linkage->num_words <= w) return 0; /* bounds-check */ + int pos = linkage->wg_path_display[w]->end - linkage->sent->orig_sentence; + char *sentchunk = alloca(pos+1); + strncpy(sentchunk, linkage->sent->orig_sentence, pos); + sentchunk[pos] = '\0'; + return utf8_strlen(sentchunk); +} diff -Nru link-grammar-5.3.16/link-grammar/linkage/linkage.h link-grammar-5.5.0/link-grammar/linkage/linkage.h --- link-grammar-5.3.16/link-grammar/linkage/linkage.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/linkage/linkage.h 2018-01-22 18:48:21.000000000 +0000 @@ -0,0 +1,96 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _LINKAGE_H +#define _LINKAGE_H + +#include +#include "api-types.h" +#include "link-includes.h" // Needed for typedef WordIdx + +/** + * This summarizes the linkage status. 
+ */ +struct Linkage_info_struct +{ + int index; /* Index into the parse_set */ + bool discarded; + short N_violations; + short unused_word_cost; + short link_cost; + + double disjunct_cost; + double corpus_cost; + const char *pp_violation_msg; +}; + +/** + * num_links: + * The number of links in the current linkage. Computed by + * extract_linkage(). + * + * chosen_disjuncts[] + * This is an array pointers to disjuncts, one for each word, that is + * computed by extract_links(). It represents the chosen disjuncts + * for the current linkage. It is used to compute the cost of the + * linkage, and also by compute_chosen_words() to compute the + * chosen_words[]. + * + * link_array[] + * This is an array of links. These links define the current linkage. + * It is computed by extract_links(). It is used by analyze_linkage(). + */ +struct Linkage_s +{ + WordIdx num_words; /* Number of (tokenized) words */ + bool is_sent_long; /* num_words >= twopass_length */ + const char * * word; /* Array of word spellings */ + + size_t num_links; /* Number of links in array */ + Link * link_array; /* Array of links */ + size_t lasz; /* Alloc'ed length of link_array */ + + Disjunct ** chosen_disjuncts; /* Disjuncts used, one per word */ + size_t cdsz; /* Alloc'ed length of chosen_disjuncts */ + char ** disjunct_list_str; /* Stringified version of above */ +#ifdef USE_CORPUS + Sense ** sense_list; /* Word senses, inferred from disjuncts */ +#endif + + Gword **wg_path; /* Linkage Wordgraph path */ + Gword **wg_path_display; /* Wordgraph path after morpheme combining */ + + Linkage_info lifo; /* Parse_set index and cost information */ + PP_domains * pp_domains; /* PP domain info, one for each link */ + + Sentence sent; /* Used for common linkage data */ +}; + +struct Link_s +{ + size_t lw; /* Offset into Linkage->word NOT Sentence->word */ + size_t rw; /* Offset into Linkage->word NOT Sentence->word */ + Connector * lc; + Connector * rc; + const char * link_name; /* Spelling of full 
link name */ +}; + +void compute_chosen_words(Sentence, Linkage, Parse_Options); + +void partial_init_linkage(Sentence, Linkage, unsigned int N_words); +void check_link_size(Linkage); +void remove_empty_words(Linkage); +void free_linkage(Linkage); +void free_linkages(Sentence); + +#endif /* _LINKAGE_H */ diff -Nru link-grammar-5.3.16/link-grammar/linkage/lisjuncts.c link-grammar-5.5.0/link-grammar/linkage/lisjuncts.c --- link-grammar-5.3.16/link-grammar/linkage/lisjuncts.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/linkage/lisjuncts.c 2018-02-15 03:03:12.000000000 +0000 @@ -0,0 +1,97 @@ +/*************************************************************************/ +/* Copyright (c) 2008, 2009, 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ +/* + * lisjuncts.c + * + * Miscellaneous utilities for returning the list of disjuncts that + * were actually used in a given parse of a sentence. + */ + +#include +#include +#include "api-structures.h" +#include "connectors.h" +#include "disjunct-utils.h" +#include "linkage.h" +#include "lisjuncts.h" + +/* Links are *always* less than 10 chars long . For now. The estimate + * below is somewhat dangerous .... could be fixed. */ +#define MAX_LINK_NAME_LENGTH 10 + +/** + * Print connector list to string. + * This reverses the order of the connectors in the connector list, + * so that the resulting list is in the same order as it would appear + * in the dictionary. The character 'dir' is appended to each connector. 
+ */ +static char * reversed_conlist_str(Connector* c, char dir, char* buf, size_t sz) +{ + char* p; + size_t len = 0; + + if (NULL == c) return buf; + p = reversed_conlist_str(c->next, dir, buf, sz); + + sz -= (p-buf); + + if (c->multi) + p[len++] = '@'; + + len += lg_strlcpy(p+len, connector_string(c), sz-len); + if (3 < sz-len) + { + p[len++] = dir; + p[len++] = ' '; + p[len] = 0x0; + } + return p+len; +} + +/** + * Print disjunct to string. The resulting list is in the same order + * as it would appear in the dictionary. + */ +static void disjunct_str(Disjunct* dj, char* buf, size_t sz) +{ + char* p; + if (NULL == dj) { *buf = 0; return; } + p = reversed_conlist_str(dj->left, '-', buf, sz); + reversed_conlist_str(dj->right, '+', p, sz - (p-buf)); +} + +/** + * lg_compute_disjunct_strings -- Given sentence, compute disjuncts. + * + * This routine will compute the string representation of the disjunct + * used for each word in parsing the given sentence. A string + * representation of the disjunct is needed for most of the corpus + * statistics functions: this string, together with the subscripted + * word, is used as a key to index the statistics information in the + * database. 
+ */ +void lg_compute_disjunct_strings(Linkage lkg) +{ + char djstr[MAX_LINK_NAME_LENGTH*20]; /* no word will have more than 20 links */ + size_t nwords = lkg->num_words; + + if (lkg->disjunct_list_str) return; + lkg->disjunct_list_str = (char **) malloc(nwords * sizeof(char *)); + memset(lkg->disjunct_list_str, 0, nwords * sizeof(char *)); + + for (WordIdx w=0; w< nwords; w++) + { + Disjunct* dj = lkg->chosen_disjuncts[w]; + disjunct_str(dj, djstr, sizeof(djstr)); + + lkg->disjunct_list_str[w] = strdup(djstr); + } +} diff -Nru link-grammar-5.3.16/link-grammar/linkage/lisjuncts.h link-grammar-5.5.0/link-grammar/linkage/lisjuncts.h --- link-grammar-5.3.16/link-grammar/linkage/lisjuncts.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/linkage/lisjuncts.h 2017-05-29 08:07:13.000000000 +0000 @@ -0,0 +1,14 @@ +/*************************************************************************/ +/* Copyright (c) 2008, 2009 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#include "link-includes.h" + +void lg_compute_disjunct_strings(Linkage); diff -Nru link-grammar-5.3.16/link-grammar/linkage/sane.c link-grammar-5.5.0/link-grammar/linkage/sane.c --- link-grammar-5.3.16/link-grammar/linkage/sane.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/linkage/sane.c 2018-01-29 21:20:45.000000000 +0000 @@ -0,0 +1,563 @@ +/*************************************************************************/ +/* Copyright 2013, 2014 Linas Vepstas */ +/* Copyright 2014 Amir Plivatsky */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include "api-structures.h" // for Sentence_s +#include "api-types.h" +#include "dict-common/regex-morph.h" // for match_regex +#include "connectors.h" // for MAX_SENTENCE +#include "disjunct-utils.h" // for Disjunct_struct +#include "lg_assert.h" +#include "linkage.h" +#include "sane.h" +#include "tokenize/tok-structures.h" // Needed for Wordgraph_pathpos_s +#include "tokenize/word-structures.h" // for Word_struct +#include "tokenize/wordgraph.h" +#include "utilities.h" + +/** + * Construct word paths (one or more) through the Wordgraph. + * + * Add 'current_word" to the potential path. + * Add "p" to the path queue, which defines the start of the next potential + * paths to be checked. + * + * Each path is up to the current word (not including). It doesn't actually + * construct a full path if there are null words - they break it. The final path + * is constructed when the Wordgraph termination word is encountered. 
+ * + * Note: The final path doesn't match the linkage word indexing if the linkage + * contains empty words, at least until empty words are eliminated from the + * linkage (in compute_chosen_words()). Further processing of the path is done + * there in case morphology splits are to be hidden or there are morphemes with + * null linkage. + */ +#define D_WPA 7 +static void wordgraph_path_append(Wordgraph_pathpos **nwp, const Gword **path, + Gword *current_word, /* add to the path */ + Gword *p) /* add to the path queue */ +{ + size_t n = wordgraph_pathpos_len(*nwp); + + assert(NULL != p, "Tried to add a NULL word to the word queue"); + if (current_word == p) + { + lgdebug(D_WPA, "Adding the same word '%s' again\n", p->subword); + //print_lwg_path((Gword **)path, "After adding the same word"); + } + + /* Check if the path queue already contains the word to be added to it. */ + const Wordgraph_pathpos *wpt = NULL; + + if (NULL != *nwp) + { + for (wpt = *nwp; NULL != wpt->word; wpt++) + { + if (p == wpt->word) + { + lgdebug(D_WPA, "Word %s (after %zu) exists (after %zu)\n", + p->subword, + wpt->path[gwordlist_len(wpt->path)-1]->sent_wordidx, + path[gwordlist_len(path)-1]->sent_wordidx); + /* If we are here, there are 2 or more paths leading to this word + * (p) that end with the same number of consecutive null words that + * consist an entire alternative. These null words represent + * different ways to split the subword upward in the hierarchy. + * For a nicer result we choose the shorter path. */ + if (wpt->path[gwordlist_len(wpt->path)-1]->sent_wordidx <= + path[gwordlist_len(path)-1]->sent_wordidx) + { + lgdebug(D_WPA, "Shorter path already queued\n"); + return; /* The shorter path is already in the queue. */ + } + lgdebug(D_WPA, "Longer path is in the queue\n"); + //print_lwg_path((Gword **)wpt->path, "Freeing"); + free(wpt->path); /* To be replaced by a shorter path. 
*/ + break; + } + } + } + + if ((NULL == wpt) || (p != wpt->word)) + { + /* Not already in the path queue - add it. */ + *nwp = wordgraph_pathpos_resize(*nwp, n+1); + } + else + { + lgdebug(D_WPA, "Path position to be replaced (len %zu): %zu\n", n, + wpt - *nwp); + n = wpt - *nwp; /* Replace this path. */ + } + (*nwp)[n].word = p; + + if (MT_INFRASTRUCTURE == p->prev[0]->morpheme_type) + { + /* Previous word is the Wordgraph dummy word. Initialize the path. */ + (*nwp)[n].path = NULL; + } + else + { + /* Duplicate the path from the current one. */ + assert(NULL != path, "wordgraph_path_append(): Duplicating a null path"); + + size_t path_arr_size = (gwordlist_len(path)+1)*sizeof(*path); + + (*nwp)[n].path = malloc(path_arr_size); + memcpy((*nwp)[n].path, path, path_arr_size); + } + + /* If we queue the same word again, its path remains the same. + * Else append the current word to it. */ + if (p != current_word) + { + /* FIXME (cast) but anyway gwordlist_append() doesn't modify Gword. */ + gwordlist_append((Gword ***)&(*nwp)[n].path, current_word); + } +} + +/** + * Free the Wordgraph paths and the Wordgraph_pathpos array. + * In case of a match, the final path is still needed so this function is + * then invoked with free_final_path=false. + */ +static void wordgraph_path_free(Wordgraph_pathpos *wp, bool free_final_path) +{ + Wordgraph_pathpos *twp; + + if (NULL == wp) return; + for (twp = wp; NULL != twp->word; twp++) + { + if (free_final_path || (MT_INFRASTRUCTURE != twp->word->morpheme_type)) + free(twp->path); + } + free(wp); +} + +#define NO_WORD (MAX_SENTENCE+1) + +/** + * Return the number of islands in a linkage. + * First, each word appears in its own linked list. + * Then all the links in the linkage are traversed, and the lists pointed + * by each of them are combined. + * Finally, the words are traversed and the lists are followed and + * numbered. The WG path is used to skip optional words which are null. 
+ */ +static size_t num_islands(const Linkage lkg, const Gword **wg_path) +{ + struct word + { + int prev; + int next; + int inum; + }; + struct word *word = alloca(lkg->sent->length * sizeof(struct word)); + + /* Initially, each word is in its own island. */ + for (WordIdx w = 0; w < lkg->sent->length; w++) + { + word[w].prev = word[w].next = w; + } + + /* Unify the potential islands pointed by each link + * (if they are already unified, they remain so.) */ + for (LinkIdx li = 0; li < lkg->num_links; li++) + { + Link *l = &lkg->link_array[li]; + + WordIdx iw; + for (iw = word[l->lw].next; (iw != l->rw) && (iw != l->lw); iw = word[iw].next) + ; + + if (iw != l->rw) + { + int nextl = word[l->lw].next; + int prevr = word[l->rw].prev; + + word[l->lw].next = l->rw; + word[l->rw].prev = l->lw; + + word[prevr].next = nextl; + word[nextl].prev = prevr; + } + + if (verbosity_level(+8)) + { + for (WordIdx w = 0; w < lkg->sent->length; w++) + { + err_msg(lg_Debug, "%d<-%zu->%d ", word[w].prev, w, word[w].next); + } + err_msg(lg_Debug, "\n"); + } + } + + /* Count islands. */ + int inum = -1; + Disjunct **cdj = lkg->chosen_disjuncts; + + for (WordIdx w = 0; w < lkg->sent->length; w++) + { + /* Skip null words which are optional words. 
*/ + if ((NULL == *wg_path) || ((*wg_path)->sent_wordidx != w)) + { + assert(word[w].prev == word[w].next); + assert((NULL == cdj[w]) && lkg->sent->word[w].optional); + + word[w].prev = NO_WORD; + word[w].inum = -1; /* not belonging to any island */ + continue; + } + + wg_path++; + if (NO_WORD == word[w].prev) continue; + + inum++; + for (WordIdx iw = w; NO_WORD != word[iw].prev; iw = word[iw].next) + { + word[iw].prev = NO_WORD; + word[iw].inum = inum; + } + } + + if (verbosity_level(8)) + { + err_msg(lg_Debug, "Island count %d: ", inum); + for (WordIdx w = 0; w < lkg->sent->length; w++) + { + err_msg(lg_Debug, "%d ", word[w].inum); + } + err_msg(lg_Debug, "\n"); + } + + return inum; +} + +/* ============================================================== */ +/* A kind of morphism post-processing */ + +/* These letters create a string that should be matched by a + * SANEMORPHISM regex, given in the affix file. The empty word + * doesn't have a letter. E.g. for the Russian dictionary: "w|ts". + * It is converted here to: "^((w|ts)b)+$". + * It matches "wbtsbwbtsbwb" but not "wbtsbwsbtsb". + * FIXME? In this version of the function, 'b' is not yet supported, + * so "w|ts" is converted to "^(w|ts)+$" for now. + */ +#define AFFIXTYPE_PREFIX 'p' /* prefix */ +#define AFFIXTYPE_STEM 't' /* stem */ +#define AFFIXTYPE_SUFFIX 's' /* suffix */ +#define AFFIXTYPE_MIDDLE 'm' /* middle morpheme */ +#define AFFIXTYPE_WORD 'w' /* regular word */ +#ifdef WORD_BOUNDARIES +#define AFFIXTYPE_END 'b' /* end of input word */ +#endif + +/** + * This routine solves the problem of mis-linked alternatives, + * i.e a morpheme in one alternative that is linked to a morpheme in + * another alternative. This can happen due to the way in which word + * alternatives are implemented. + * + * It does so by checking that all the chosen disjuncts in a linkage + * (including null words) match, in the same order, a path in the + * Wordgraph. 
+ * + * An important side effect of this check is that if the linkage is + * good, its Wordgraph path is found. + * + * Optionally (if SANEMORPHISM regex is defined in the affix file), it + * also validates that the morpheme-type sequence is permitted for the + * language. This is a sanity check of the program and the dictionary. + * + * Return true if the linkage is good, else return false. + */ +#define D_SLM 8 +bool sane_linkage_morphism(Sentence sent, Linkage lkg, Parse_Options opts) +{ + Wordgraph_pathpos *wp_new = NULL; + Wordgraph_pathpos *wp_old = NULL; + Wordgraph_pathpos *wpp; + Gword **next; /* next Wordgraph words of the current word */ + size_t i; + size_t null_count_found = 0; + + bool match_found = true; /* if all the words are null - it's still a match */ + Gword **lwg_path; + + Dictionary afdict = sent->dict->affix_table; /* for SANEMORPHISM */ + char *const affix_types = alloca(sent->length*2 + 1); /* affix types */ + affix_types[0] = '\0'; + + lkg->wg_path = NULL; + + /* Populate the path word queue, initializing the path to NULL. */ + for (next = sent->wordgraph->next; *next; next++) + { + wordgraph_path_append(&wp_new, /*path*/NULL, /*add_word*/NULL, *next); + } + assert(NULL != wp_new, "Path word queue is empty"); + + for (i = 0; i < lkg->num_words; i++) + { + Disjunct *cdj; /* chosen disjunct */ + + lgdebug(D_SLM, "lkg=%p Word %zu: ", lkg, i); + + if (NULL == wp_new) + { + lgdebug(D_SLM, "- No more words in the wordgraph\n"); + match_found = false; + break; + } + + if (wp_old != wp_new) + { + wordgraph_path_free(wp_old, true); + wp_old = wp_new; + } + wp_new = NULL; + //wordgraph_pathpos_print(wp_old); + + cdj = lkg->chosen_disjuncts[i]; + /* Handle null words */ + if (NULL == cdj) + { + lgdebug(D_SLM, "- Null word"); + /* A null word matches any word in the Wordgraph - + * so, unconditionally proceed in all paths in parallel. 
*/ + match_found = false; + bool optional_word_found = false; + for (wpp = wp_old; NULL != wpp->word; wpp++) + { + if ((MT_INFRASTRUCTURE == wpp->word->morpheme_type) || + (wpp->word->sent_wordidx > i)) + { + assert(sent->word[i].optional, "wordindex=%zu", i); + lgdebug(D_SLM, " (Optional, index=%zu)\n", i); + // Retain the same word in the new path queue. + wordgraph_path_append(&wp_new, wpp->path, wpp->word, wpp->word); + match_found = true; + optional_word_found = true; + continue; /* Disregard this chosen disjunct. */ + } + + /* The null words cannot be marked here because wpp->path consists + * of pointers to the Wordgraph words, and these words are common to + * all the linkages, with potentially different null words in each + * of them. However, the position of the null words can be inferred + * from the null words in the word array of the Linkage structure. + */ + for (next = wpp->word->next; NULL != *next; next++) + { + if (MT_INFRASTRUCTURE != wpp->word->morpheme_type) + match_found = true; + wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next); + } + } + + if (!optional_word_found) + { + null_count_found++; + /* Note that if all the sentence words are null-words, its + * null_count is only sent->length-1 so this is not a mismatch. */ + if ((null_count_found > lkg->sent->null_count) && + (lkg->sent->null_count != sent->length-1)) + { + lgdebug(D_SLM, " (Extra, count > %zu)\n", lkg->sent->null_count); + match_found = false; + break; + } + lgdebug(D_SLM, "\n"); + } + + //if (NULL != wpp->word) break; /* Extra null count; XXX always false*/ + continue; + } + + if (!match_found) + { + const char *e = "Internal error: Too many words in the linkage"; + lgdebug(D_SLM, "- %s\n", e); + prt_error("Error: %s.\n", e); + break; + } + + if (verbosity_level(D_SLM)) prt_error("%s", cdj->word_string); + + match_found = false; + /* Proceed in all the paths in which the word is found. 
*/ + for (wpp = wp_old; NULL != wpp->word; wpp++) + { + for (gword_set *gl = cdj->originating_gword; NULL != gl; gl = gl->next) + { + if (gl->o_gword == wpp->word) + { + match_found = true; + for (next = wpp->word->next; NULL != *next; next++) + { + wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next); + } + break; + } + } + } + + if (!match_found) + { + /* FIXME? A message can be added here if there are too many words + * in the linkage (can happen only if there is an internal error). */ + lgdebug(D_SLM, "- No Wordgraph match\n"); + break; + } + lgdebug(D_SLM, "\n"); + } + + if (match_found) + { + match_found = false; + /* Validate that there are no missing words in the linkage. + * It is so, if the dummy termination word is found in the + * new pathpos queue. + */ + if (NULL != wp_new) + { + for (wpp = wp_new; NULL != wpp->word; wpp++) + { + if (MT_INFRASTRUCTURE == wpp->word->morpheme_type) { + match_found = true; + /* Exit the loop with with wpp of the termination word. */ + break; + } + } + } + if (!match_found) + lgdebug(D_SLM, "%p Missing word(s) at the end of the linkage.\n", lkg); + } + + /* Reject found null count that is not consistent with sent->null_count. + * Here islands_ok=1 is handled, and also a lower-than-expected null + * count when islands_ok=0. */ + if (match_found) + { + size_t count_found = + opts->islands_ok ? num_islands(lkg, wpp->path) : null_count_found; + + if ((count_found != lkg->sent->null_count) && + (lkg->sent->null_count != sent->length-1) && (count_found != sent->length)) + { + lgdebug(D_SLM, "Null count mismatch: Found %zu != null_count %zu\n", + count_found, lkg->sent->null_count); + match_found = false; + } + } + +#define DEBUG_morpheme_type 0 + /* Check the morpheme type combination. + * If null_count > 0, the morpheme type combination may be invalid + * due to null subwords, so skip this check. 
*/ + if (match_found && (0 == sent->null_count) && + (NULL != afdict) && (NULL != afdict->regex_root)) + { + const Gword **w; + char *affix_types_p = affix_types; + + /* Construct the affix_types string. */ +#if DEBUG_morpheme_type + print_lwg_path(wpp->path, "Linkage"); +#endif + i = 0; + for (w = wpp->path; *w; w++) + { + i++; + + PRAGMA_START(GCC diagnostic ignored "-Wswitch-enum") + switch ((*w)->morpheme_type) + { + default: + /* What to do with the rest? */ + case MT_WORD: + *affix_types_p = AFFIXTYPE_WORD; + break; + case MT_PREFIX: + *affix_types_p = AFFIXTYPE_PREFIX; + break; + case MT_STEM: + *affix_types_p = AFFIXTYPE_STEM; + break; + case MT_MIDDLE: + *affix_types_p = AFFIXTYPE_MIDDLE; + break; + case MT_SUFFIX: + *affix_types_p = AFFIXTYPE_SUFFIX; + break; + } + PRAGMA_END + +#if DEBUG_morpheme_type + lgdebug(D_SLM, "Word %zu: %s affixtype=%c\n", + i, (*w)->subword, *affix_types_p); +#endif + + affix_types_p++; + } + *affix_types_p = '\0'; + +#ifdef WORD_BOUNDARIES /* not yet implemented */ + { + const Gword *uw; + + /* If w is an "end subword", return its unsplit word, else NULL. */ + uw = word_boundary(w); /* word_boundary() unimplemented */ + + if (NULL != uw) + { + *affix_types_p++ = AFFIXTYPE_END; + lgdebug(D_SLM, "%p End of Gword %s\n", lkg, uw->subword); + } + } +#endif + + /* Check if affix_types is valid according to SANEMORPHISM. */ + if (('\0' != affix_types[0]) && + (NULL == match_regex(afdict->regex_root, affix_types))) + { + /* Morpheme type combination is invalid */ + match_found = false; + /* Notify to stdout, so it will be shown along with the result. + * XXX We should have a better way to notify. 
*/ + if (0 < opts->verbosity) + prt_error("Warning: Invalid morpheme type combination '%s'.\n" + "Run with !bad and !verbosity>"STRINGIFY(D_USER_MAX) + " to debug\n", affix_types); + } + } + + if (match_found) lwg_path = (Gword **)wpp->path; /* OK to modify */ + wordgraph_path_free(wp_old, true); + wordgraph_path_free(wp_new, !match_found); + + if (match_found) + { + if ('\0' != affix_types[0]) + { + lgdebug(D_SLM, "%p Morpheme type combination '%s'\n", lkg, affix_types); + } + lgdebug(+D_SLM-1, "%p SUCCEEDED\n", lkg); + lkg->wg_path = lwg_path; + return true; + } + + /* Oh no ... invalid morpheme combination! */ + lgdebug(+D_SLM-1, "%p FAILED\n", lkg); + return false; +} +#undef D_SLM diff -Nru link-grammar-5.3.16/link-grammar/linkage/sane.h link-grammar-5.5.0/link-grammar/linkage/sane.h --- link-grammar-5.3.16/link-grammar/linkage/sane.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/linkage/sane.h 2017-05-29 08:07:13.000000000 +0000 @@ -0,0 +1,4 @@ + +#include "link-includes.h" + +bool sane_linkage_morphism(Sentence, Linkage, Parse_Options); diff -Nru link-grammar-5.3.16/link-grammar/linkage/score.c link-grammar-5.5.0/link-grammar/linkage/score.c --- link-grammar-5.3.16/link-grammar/linkage/score.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/linkage/score.c 2017-05-29 08:07:13.000000000 +0000 @@ -0,0 +1,86 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2012, 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#include +#include "api-structures.h" // Needed for Parse_Options +#include "disjunct-utils.h" // Needed for Disjunct +#include "linkage.h" +#include "score.h" + +/** + * This function defines the cost of a link as a function of its length. + */ +static inline int cost_for_length(int length) +{ + return length-1; +} + +/** + * Computes the cost of the current parse of the current sentence, + * due to the length of the links. + */ +static size_t compute_link_cost(Linkage lkg) +{ + size_t lcost, i; + lcost = 0; + for (i = 0; i < lkg->num_links; i++) + { + lcost += cost_for_length(lkg->link_array[i].rw - lkg->link_array[i].lw); + } + return lcost; +} + +static int unused_word_cost(Linkage lkg) +{ + int lcost; + size_t i; + lcost = 0; + for (i = 0; i < lkg->num_words; i++) + lcost += (lkg->chosen_disjuncts[i] == NULL); + return lcost; +} + +/** + * Computes the cost of the current parse of the current sentence + * due to the cost of the chosen disjuncts. + */ +static double compute_disjunct_cost(Linkage lkg) +{ + size_t i; + double lcost; + lcost = 0.0; + for (i = 0; i < lkg->num_words; i++) + { + if (lkg->chosen_disjuncts[i] != NULL) + lcost += lkg->chosen_disjuncts[i]->cost; + } + return lcost; +} + +/** Assign parse score (cost) to linkage, used for parse ranking. 
*/ +void linkage_score(Linkage lkg, Parse_Options opts) +{ + lkg->lifo.unused_word_cost = unused_word_cost(lkg); + if (opts->use_sat_solver) + { + lkg->lifo.disjunct_cost = 0.0; + } + else + { + lkg->lifo.disjunct_cost = compute_disjunct_cost(lkg); + } + lkg->lifo.link_cost = compute_link_cost(lkg); + lkg->lifo.corpus_cost = -1.0; + + lg_corpus_score(lkg); +} diff -Nru link-grammar-5.3.16/link-grammar/linkage/score.h link-grammar-5.5.0/link-grammar/linkage/score.h --- link-grammar-5.3.16/link-grammar/linkage/score.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/linkage/score.h 2017-05-29 08:07:13.000000000 +0000 @@ -0,0 +1,21 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2012, 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#ifndef _SCORE_H +#define _SCORE_H + +#include "api-types.h" +#include "link-includes.h" + +void linkage_score(Linkage, Parse_Options); +#endif /* _SCORE_H */ diff -Nru link-grammar-5.3.16/link-grammar/linkage.c link-grammar-5.5.0/link-grammar/linkage.c --- link-grammar-5.3.16/link-grammar/linkage.c 2017-04-15 21:43:39.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/linkage.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,876 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright 2008, 2009, 2013, 2014 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include -#include - -#include "analyze-linkage.h" -#include "api-structures.h" -#include "dict-common.h" -#include "disjuncts.h" -#include "externs.h" -#include "extract-links.h" -#include "idiom.h" -#include "link-includes.h" -#include "linkage.h" -#include "post-process.h" -#include "print.h" -#include "print-util.h" -#include "sat-solver/sat-encoder.h" -#include "string-set.h" -#include "structures.h" -#include "wordgraph.h" -#include "word-utils.h" - -#define INFIX_MARK_L 1 /* INFIX_MARK is 1 character */ -#define STEM_MARK_L 1 /* stem mark is 1 character */ - -/* Marks around a null word. */ -#define NULLWORD_START '[' -#define NULLWORD_END ']' - -/** - * Append an unmarked (i.e. without INFIXMARK) morpheme to join_buff. 
- * join_buff is a zeroed-out buffer which has enough room for morpheme to be - * added + terminating NUL. - * Note that MT_PREFIX or MT_SUFFIX can be without an INFIX_MARK, in case - * INFIX_MARK is not defined. XXX: What about MT_MIDDLE? (not in use yet). - * - * FIXME Combining contracted words is not handled yet, because combining - * morphemes which have non-LL links to other words is not yet implemented. - */ -static void add_morpheme_unmarked(Sentence sent, char *join_buff, - const char *wm, Morpheme_type mt) -{ - const char infix_mark = INFIX_MARK(sent->dict->affix_table); - const char *sm = strrchr(wm, SUBSCRIPT_MARK); - - if (NULL == sm) sm = (char *)wm + strlen(wm); - - if ((MT_PREFIX == mt) && (infix_mark == sm[-INFIX_MARK_L])) - strncat(join_buff, wm, sm-wm-INFIX_MARK_L); - else if ((MT_SUFFIX == mt) && (infix_mark == wm[0])) - strncat(join_buff, INFIX_MARK_L+wm, sm-wm-INFIX_MARK_L); - else if ((MT_MIDDLE == mt)) - strncat(join_buff, INFIX_MARK_L+wm, sm-wm-2*INFIX_MARK_L); - else - strncat(join_buff, wm, sm-wm); -} - -static const char *join_null_word(Sentence sent, Gword **wgp, size_t count) -{ - size_t i; - char *join_buff; - const char *s; - size_t join_len = 0; - - for (i = 0; i < count; i++) - join_len += strlen(wgp[i]->subword); - - join_buff = alloca(join_len+1); - memset(join_buff, '\0', join_len+1); - - for (i = 0; i < count; i++) - add_morpheme_unmarked(sent, join_buff, wgp[i]->subword, - wgp[i]->morpheme_type); - - s = string_set_add(join_buff, sent->string_set); - - return s; -} - -/** - * Add a null word node that represents two or more null morphemes. - * Used for "unifying" null morphemes that are part of a single subword, - * when only some of its morphemes (2 or more) don't have a linkage. - * The words "start" to "end" (including) are unified by the new node. - * XXX Experimental. 
- */ -static Gword *wordgraph_null_join(Sentence sent, Gword **start, Gword **end) -{ - Gword *new_word; - Gword **w; - char *usubword; - size_t join_len = 0; - - for (w = start; w <= end; w++) join_len += strlen((*w)->subword); - usubword = calloc(join_len+1, 1); /* zeroed out */ - - for (w = start; w <= end; w++) - add_morpheme_unmarked(sent, usubword, (*w)->subword, (*w)->morpheme_type); - - new_word = gword_new(sent, usubword); - free(usubword); - new_word->status |= WS_PL; - new_word->label = "NJ"; - new_word->null_subwords = NULL; - - /* Link the null_subwords links of the added unifying node to the null - * subwords it unified. */ - for (w = start; w <= end; w++) - gwordlist_append(&new_word->null_subwords, (Gword *)(*w)); - /* Removing const qualifier, but gwordlist_append doesn't change w->... . */ - - return new_word; -} - -/** - * The functions defined in this file are primarily a part of the user API - * for working with linkages. - */ - -#define SUBSCRIPT_SEP SUBSCRIPT_DOT /* multiple-subscript separator */ - -#define PREFIX_SUPPRESS ("PL") /* prefix links start with this */ -#define PREFIX_SUPPRESS_L 2 /* length of above */ -#define SUFFIX_SUPPRESS ("LL") /* suffix links start with this */ -#define SUFFIX_SUPPRESS_L 2 /* length of above */ - -#define HIDE_MORPHO (!display_morphology) -/* TODO? !display_guess_marks is not implemented. */ -#define DISPLAY_GUESS_MARKS true // (opts->display_guess_marks) - -/* FIXME: Define an affix class MORPHOLOGY_LINKS. */ -static inline bool is_morphology_link(const char *link_name) -{ - if (NULL == link_name) return false; - return (0 == strncmp(link_name, SUFFIX_SUPPRESS, SUFFIX_SUPPRESS_L)) || - (0 == strncmp(link_name, PREFIX_SUPPRESS, PREFIX_SUPPRESS_L)); -} - -/* - * Remap the link array according to discarded links and words. - * - * The remap[] elements indicate the new WordIdx of the word. - * A value which is -1 indicates a discarded word. - * A NULL link_name indicates a discarded link. 
- */ -static void remap_linkages(Linkage lkg, const int *remap) -{ - LinkIdx i, j; - - for (i = 0, j = 0; i < lkg->num_links; i++) - { - Link *old_lnk = &lkg->link_array[i]; - - if (NULL != old_lnk->link_name && /* discarded link */ - (-1 != remap[old_lnk->rw]) && (-1 != remap[old_lnk->lw])) - { - Link *new_lnk = &lkg->link_array[j]; - Connector *ctmp; - - /* Copy the entire link contents, thunking the word numbers. - * Note that j is always <= i so this is always safe. */ - - new_lnk->lw = remap[old_lnk->lw]; - new_lnk->rw = remap[old_lnk->rw]; - - ctmp = new_lnk->lc; - new_lnk->lc = old_lnk->lc; - old_lnk->lc = ctmp; - - ctmp = new_lnk->rc; - new_lnk->rc = old_lnk->rc; - old_lnk->rc = ctmp; - - new_lnk->link_name = old_lnk->link_name; - - /* Remap the pp_info, too. */ - if (lkg->pp_info) - lkg->pp_info[j] = lkg->pp_info[i]; - - j++; - } - else - { - /* Whack this slot of pp_info. */ - if (lkg->pp_info) - exfree_domain_names(&lkg->pp_info[i]); - } - } - - lkg->num_links = j; - /* Unused memory not freed - all of it will be freed in free_linkages(). */ -} - -/** - * Remove unlinked optional words from a linkage. - * XXX Should we remove here also the dict-cap tokens? In any case, for now they - * are left for debug. - */ -#define D_REE 9 -void remove_empty_words(Linkage lkg) -{ - size_t i, j; - Disjunct **cdj = lkg->chosen_disjuncts; - int *remap = alloca(lkg->num_words * sizeof(*remap)); - - if (verbosity_level(+D_REE)) - { - err_msg(lg_Debug, "chosen_disjuncts before:\n\\"); - print_chosen_disjuncts_words(lkg); - } - - for (i = 0, j = 0; i < lkg->num_words; i++) - { - if ((NULL == cdj[i]) && lkg->sent->word[i].optional) - { - remap[i] = -1; - } - else - { - Disjunct *cdtmp = cdj[j]; - cdj[j] = cdj[i]; - cdj[i] = cdtmp; /* The SAT parser frees chosen_disjuncts elements. */ - remap[i] = j; - j++; - } - } - if (lkg->num_words != j) - { - /* Unused memory not freed - all of it will be freed in free_linkages(). 
*/ - lkg->num_words = j; - remap_linkages(lkg, remap); /* Update lkg->link_array and lkg->num_links. */ - } - - if (verbosity_level(+D_REE)) - { - err_msg(lg_Debug, "chosen_disjuncts after:\n\\"); - print_chosen_disjuncts_words(lkg); - } -} -#undef D_REE - -/** - * This takes the Wordgraph path array and uses it to - * compute the chosen_words array. "I.xx" suffixes are eliminated. - * - * chosen_words - * A pointer to an array of pointers to strings. These are the words to be - * displayed when printing the solution, the links, etc. Computed as a - * function of chosen_disjuncts[] by compute_chosen_words(). This differs - * from sentence.word[].alternatives because it contains the subscripts. It - * differs from chosen_disjunct[].string in that the idiom symbols have been - * removed. Furthermore, several chosen_disjuncts[].string elements may be - * combined into one chosen_words[] element if opts->display_morphology==0 or - * that they where linkage null-words that are morphemes of the same original - * word (i.e. subwords of an unsplit_word which are marked as morphemes). - * - * wg_path - * A pointer to a NULL-terminated array of pointers to Wordgraph words. - * It corresponds 1-1 to the chosen_disjuncts array in Linkage structure. - * A new one is constructed below to correspond 1-1 to chosen_words. - * - * FIXME Sometimes the word strings are taken from chosen_disjuncts, - * and sometimes from wordgraph subwords. 
- */ -#define D_CCW 8 -void compute_chosen_words(Sentence sent, Linkage linkage, Parse_Options opts) -{ - WordIdx i; /* index of chosen_words */ - WordIdx j; - Disjunct **cdjp = linkage->chosen_disjuncts; - const char **chosen_words = alloca(linkage->num_words * sizeof(*chosen_words)); - int *remap = alloca(linkage->num_words * sizeof(*remap)); - bool *show_word = alloca(linkage->num_words * sizeof(*show_word)); - bool display_morphology = opts->display_morphology; - - Gword **lwg_path = linkage->wg_path; - Gword **n_lwg_path = NULL; /* new Wordgraph path, to match chosen_words */ - -#if 0 /* FIXME? Not implemented. */ - size_t len_n_lwg_path = 0; - /* For opts->display_morphology==0: Mapping of the chosen_words indexing to - * original parse indexing. */ - size_t *wg_path_display = alloca(linkage->num_words * sizeof(*lwg_path_display)); -#endif - - Gword **nullblock_start = NULL; /* start of a null block, to be put in [] */ - size_t nbsize = 0; /* number of word in a null block */ - Gword *unsplit_word = NULL; - - memset(show_word, 0, linkage->num_words * sizeof(*show_word)); - - if (D_CCW <= opts->verbosity) - print_lwg_path(lwg_path); - - for (i = 0; i < linkage->num_words; i++) - { - Disjunct *cdj = cdjp[i]; - Gword *w; /* current word */ - const Gword *nw; /* next word (NULL if none) */ - Gword **wgp; /* wordgraph_path traversing pointer */ - - const char *t = NULL; /* current word string */ - bool nb_end; /* current word is at end of a nullblock */ - bool join_alt = false; /* morpheme-join this alternative */ - char *s; - size_t l; - size_t m; - - lgdebug(D_CCW, "Loop start, word%zu: cdj %s, path %s\n", - i, cdj ? cdj->string : "NULL", - lwg_path[i] ? lwg_path[i]->subword : "NULL"); - - w = lwg_path[i]; - nw = lwg_path[i+1]; - wgp = &lwg_path[i]; - unsplit_word = w->unsplit_word; - - /* FIXME If the original word was capitalized in a capitalizable - * position, the displayed null word may be its downcase version. 
*/ - - if (NULL == cdj) /* a null word (the chosen disjunct was NULL) */ - { - nbsize++; - if (NULL == nullblock_start) /* it starts a new null block */ - nullblock_start = wgp; - - nb_end = (NULL == nw) || (nw->unsplit_word != unsplit_word) || - (MT_INFRASTRUCTURE == w->unsplit_word->morpheme_type); - - /* Accumulate null words in this alternative */ - if (!nb_end && (NULL == cdjp[i+1])) - { - lgdebug(D_CCW, "Skipping word%zu cdjp=NULL#%zu, path %s\n", - i, nbsize, lwg_path[i]->subword); - chosen_words[i] = NULL; - continue; - } - - if (NULL != nullblock_start) - { - /* If we are here, this null word is an end of a null block */ - lgdebug(+D_CCW, "Handling %zu null words at %zu: ", nbsize, i); - - if (1 == nbsize) - { - /* Case 1: A single null subword. */ - lgdebug(D_CCW, "A single null subword.\n"); - t = join_null_word(sent, wgp, nbsize); - - gwordlist_append(&n_lwg_path, w); -#if 0 /* Not implemented */ - lwg_path_display[len_n_lwg_path++] = i; -#endif - } - else - { - lgdebug(D_CCW, "Combining null subwords"); - /* Use alternative_id to check for start of alternative. */ - if (((*nullblock_start)->alternative_id == *nullblock_start) - && nb_end) - { - /* Case 2: A null unsplit_word (all-nulls alternative).*/ - lgdebug(D_CCW, " (null alternative)\n"); - t = unsplit_word->subword; - - gwordlist_append(&n_lwg_path, unsplit_word); -#if 0 /* Not implemented */ - for (j = 0; j < nbsize; j++) - lwg_path_display[len_n_lwg_path++] = i-j+1; -#endif - } - else - { - /* Case 3: Join together >=2 null morphemes. */ - Gword *wgnull; - - lgdebug(D_CCW, " (null partial word)\n"); - wgnull = wordgraph_null_join(sent, wgp-nbsize+1, wgp); - gwordlist_append(&n_lwg_path, wgnull); - t = wgnull->subword; - } - } - - nullblock_start = NULL; - nbsize = 0; - show_word[i] = true; - - if (MT_WALL != w->morpheme_type) - { - /* Put brackets around the null word. 
*/ - l = strlen(t) + 2; - s = (char *) alloca(l+1); - s[0] = NULLWORD_START; - strcpy(&s[1], t); - s[l-1] = NULLWORD_END; - s[l] = '\0'; - t = string_set_add(s, sent->string_set); - lgdebug(D_CCW, " %s\n", t); - /* Null words have no links, so take care not to drop them. */ - } - } - } - else - { - /* This word has a linkage. */ - - /* TODO: Suppress "virtual-morphemes", currently the dictcap ones. */ - char *sm; - - t = cdj->string; - /* Print the subscript, as in "dog.n" as opposed to "dog". */ - - if (0) - { - /* TODO */ - } - else - { - /* Get rid of those ugly ".Ixx" */ - if (is_idiom_word(t)) - { - s = strdup(t); - sm = strrchr(s, SUBSCRIPT_MARK); - *sm = '\0'; - t = string_set_add(s, sent->string_set); - free(s); - } - else if (HIDE_MORPHO) - { - /* Concatenate the word morphemes together into one word. - * Concatenate their subscripts into one subscript. - * Use subscript separator SUBSCRIPT_SEP. - * XXX Check whether we can encounter an idiom word here. - * FIXME Combining contracted words is not handled yet, because - * combining morphemes which have non-LL links to other words is - * not yet implemented. - * FIXME Move to a separate function. */ - Gword **wgaltp; - size_t join_len = 0; - size_t mcnt = 0; - - /* If the alternative contains morpheme subwords, mark it - * for joining... */ - for (wgaltp = wgp, j = i; NULL != *wgaltp; wgaltp++, j++) - { - - if ((*wgaltp)->unsplit_word != unsplit_word) break; - if (MT_INFRASTRUCTURE == - (*wgaltp)->unsplit_word->morpheme_type) break; - - mcnt++; - - if (NULL == cdjp[j]) - { - /* ... but not if it contains a null word */ - join_alt = false; - break; - } - join_len += strlen(cdjp[j]->string); - if ((*wgaltp)->morpheme_type & IS_REG_MORPHEME) - join_alt = true; - } - - if (join_alt) - { - /* Join it in two steps: 1. Base words. 2. Subscripts. - * FIXME? Can be done in one step (more efficient but maybe - * less clear). - * Put SUBSCRIPT_SEP between the subscripts. 
- * XXX No 1-1 correspondence between the hidden base words - * and the subscripts after the join, in case there are base - * words with and without subscripts. */ - - const char subscript_sep_str[] = { SUBSCRIPT_SEP, '\0'}; - const char subscript_mark_str[] = { SUBSCRIPT_MARK, '\0'}; - char *join = calloc(join_len + 1, 1); /* zeroed out */ - - join[0] = '\0'; - - /* 1. Join base words. (Could just use the unsplit_word.) */ - for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++) - { - add_morpheme_unmarked(sent, join, cdjp[i+m]->string, - (*wgaltp)->morpheme_type); - } - - strcat(join, subscript_mark_str); /* tentative */ - - /* 2. Join subscripts. */ - for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++) - { - /* Cannot NULLify the word - we may have links to it. */ - if (m != mcnt-1) chosen_words[i+m] = ""; - - sm = strrchr(cdjp[i+m]->string, SUBSCRIPT_MARK); - - if (NULL != sm) - { - /* Supposing stem subscript is .=x (x optional) */ - if (MT_STEM == (*wgaltp)->morpheme_type) - { - sm += 1 + STEM_MARK_L; /* sm+strlen(".=") */ - if ('\0' == *sm) sm = NULL; -#if 0 - if ((cnt-1) == m) - { - /* Support a prefix-stem combination. In that case - * we have just nullified the combined word, so we - * need to move it to the position of the prefix. - * FIXME: May still not be good enough. */ - move_combined_word = i+m-1; - - /* And the later chosen_word assignment should be: - * chosen_words[-1 != move_combined_word ? 
- * move_combined_word : i] = t; - */ - } - else - { - move_combined_word = -1; - } -#endif - } - } - if (NULL != sm) - { - strcat(join, sm+1); - strcat(join, subscript_sep_str); - } - } - - /* Remove an extra mark, if any */ - join_len = strlen(join); - if ((SUBSCRIPT_SEP == join[join_len-1]) || - (SUBSCRIPT_MARK == join[join_len-1])) - join[join_len-1] = '\0'; - - gwordlist_append(&n_lwg_path, unsplit_word); - t = string_set_add(join, sent->string_set); - free(join); - - i += mcnt-1; - } - } - } - - if (!join_alt) gwordlist_append(&n_lwg_path, *wgp); - - /* - * Add guess marks in [] square brackets, if needed, at the - * end of the base word. Convert the badly-printing - * SUBSCRIPT_MARK (hex 03 or ^C) into a period. - */ - if (t) - { - const char *sm = strrchr(t, SUBSCRIPT_MARK); - - if ((!(w->status & WS_GUESS) && (w->status & WS_INDICT)) - || !DISPLAY_GUESS_MARKS) - { - s = alloca(strlen(t)+1); - strcpy(s, t); - if (sm) s[sm-t] = SUBSCRIPT_DOT; - t = string_set_add(s, sent->string_set); - } - else - { - size_t baselen; - const char *regex_name; - char guess_mark; - - switch (w->status & WS_GUESS) - { - case WS_SPELL: - guess_mark = GM_SPELL; - break; - case WS_RUNON: - guess_mark = GM_RUNON; - break; - case WS_REGEX: - guess_mark = GM_REGEX; - break; - case 0: - guess_mark = GM_UNKNOWN; - break; - default: - assert(0, "Missing 'case: %2x'", w->status & WS_GUESS); - } - - /* In the case of display_morphology==0, the guess indication of - * the last subword is used as the guess indication of the whole - * word. - * FIXME? The guess indications of other subwords are ignored in - * this mode. This implies that if a first or middle subword has - * a guess indication but the last subword doesn't have, no guess - * indication would be shown at all. */ - - regex_name = w->regex_name; - if ((NULL == regex_name) || HIDE_MORPHO) regex_name = ""; - /* 4 = 1(null) + 1(guess_mark) + 2 (sizeof "[]") */ - baselen = NULL == sm ? 
strlen(t) : (size_t)(sm-t); - s = alloca(strlen(t) + strlen(regex_name) + 4); - strncpy(s, t, baselen); - s[baselen] = '['; - s[baselen + 1] = guess_mark; - strcpy(s + baselen + 2, regex_name); - strcat(s, "]"); - if (NULL != sm) strcat(s, sm); - t = s; - sm = strrchr(t, SUBSCRIPT_MARK); - if (sm) s[sm-t] = SUBSCRIPT_DOT; - t = string_set_add(s, sent->string_set); - } - } - } - - assert(t != NULL, "Word %zu: NULL", i); - chosen_words[i] = t; - } - - /* Conditional test removal of quotation marks and the "capdict" tokens, - * to facilitate using diff on sentence batch runs. */ - if (test_enabled("removeZZZ")) - { - for (i=0, j=0; inum_links; i++) - { - Link *lnk = &(linkage->link_array[i]); - - if (0 == strcmp("ZZZ", lnk->link_name)) - chosen_words[lnk->rw] = NULL; - } - } - - /* If morphology printing is being suppressed, then all links - * connecting morphemes will be discarded. */ - if (HIDE_MORPHO) - { - /* Discard morphology links. */ - for (i=0; inum_links; i++) - { - Link * lnk = &linkage->link_array[i]; - - if (is_morphology_link(lnk->link_name)) - { - /* Mark link for discarding. */ - lnk->link_name = NULL; - } - else - { - /* Mark word for not discarding. */ - show_word[lnk->rw] = true; - show_word[lnk->lw] = true; - } - } - } - - /* We alloc a little more than needed, but so what... */ - linkage->word = (const char **) exalloc(linkage->num_words*sizeof(char *)); - - /* Copy over the chosen words, dropping the discarded words. - * However, don't discard existing words (chosen_words[i][0]). - * Note that if a word only has morphology links and is not combined with - * another word, then it will get displayed with no links at all (e.g. 
- * when explicitly specifying root and suffix for debug: root.= =suf */ - for (i=0, j=0; inum_words; ++i) - { - if (chosen_words[i] && - (chosen_words[i][0] || (!HIDE_MORPHO || show_word[i]))) - { - const char *cwtmp = linkage->word[j]; - linkage->word[j] = chosen_words[i]; - chosen_words[i] = cwtmp; - remap[i] = j; - j++; - } - else - { - remap[i] = -1; - } - } - linkage->num_words = j; - - remap_linkages(linkage, remap); /* Update linkage->link_array / num_links. */ - - linkage->wg_path_display = n_lwg_path; - - if (D_CCW <= opts->verbosity) - { - print_lwg_path(lwg_path); - print_lwg_path(n_lwg_path); - } -} -#undef D_CCW - -Linkage linkage_create(LinkageIdx k, Sentence sent, Parse_Options opts) -{ - Linkage linkage; - - if (opts->use_sat_solver) - { - linkage = sat_create_linkage(k, sent, opts); - if (!linkage) return NULL; - } - else - { - /* Cannot create a Linkage for a discarded linkage. */ - if (sent->num_linkages_post_processed <= k) return NULL; - linkage = &sent->lnkages[k]; - } - - /* Perform remaining initialization we haven't done yet...*/ - compute_chosen_words(sent, linkage, opts); - - linkage->is_sent_long = (linkage->num_words >= opts->twopass_length); - - return linkage; -} - -void linkage_delete(Linkage linkage) -{ - /* Currently a no-op */ -} - -size_t linkage_get_num_words(const Linkage linkage) -{ - if (!linkage) return 0; - return linkage->num_words; -} - -size_t linkage_get_num_links(const Linkage linkage) -{ - if (!linkage) return 0; - return linkage->num_links; -} - -static inline bool verify_link_index(const Linkage linkage, LinkIdx index) -{ - if (!linkage) return false; - if (index >= linkage->num_links) return false; - return true; -} - -int linkage_get_link_length(const Linkage linkage, LinkIdx index) -{ - Link *link; - if (!verify_link_index(linkage, index)) return -1; - link = &(linkage->link_array[index]); - return link->rw - link->lw; -} - -WordIdx linkage_get_link_lword(const Linkage linkage, LinkIdx index) -{ - if 
(!verify_link_index(linkage, index)) return SIZE_MAX; - return linkage->link_array[index].lw; -} - -WordIdx linkage_get_link_rword(const Linkage linkage, LinkIdx index) -{ - if (!verify_link_index(linkage, index)) return SIZE_MAX; - return linkage->link_array[index].rw; -} - -const char * linkage_get_link_label(const Linkage linkage, LinkIdx index) -{ - if (!verify_link_index(linkage, index)) return NULL; - return linkage->link_array[index].link_name; -} - -const char * linkage_get_link_llabel(const Linkage linkage, LinkIdx index) -{ - if (!verify_link_index(linkage, index)) return NULL; - return linkage->link_array[index].lc->string; -} - -const char * linkage_get_link_rlabel(const Linkage linkage, LinkIdx index) -{ - if (!verify_link_index(linkage, index)) return NULL; - return linkage->link_array[index].rc->string; -} - -const char ** linkage_get_words(const Linkage linkage) -{ - return linkage->word; -} - -const char * linkage_get_disjunct_str(const Linkage linkage, WordIdx w) -{ - Disjunct *dj; - - if (NULL == linkage) return ""; - if (NULL == linkage->disjunct_list_str) - { - lg_compute_disjunct_strings(linkage); - } - - if (linkage->num_words <= w) return NULL; /* bounds-check */ - - /* dj will be null if the word wasn't used in the parse. */ - dj = linkage->chosen_disjuncts[w]; - if (NULL == dj) return ""; - - return linkage->disjunct_list_str[w]; -} - -double linkage_get_disjunct_cost(const Linkage linkage, WordIdx w) -{ - Disjunct *dj; - - if (linkage->num_words <= w) return 0.0; /* bounds-check */ - - dj = linkage->chosen_disjuncts[w]; - - /* dj may be null, if the word didn't participate in the parse. */ - if (dj) return dj->cost; - return 0.0; -} - -double linkage_get_disjunct_corpus_score(const Linkage linkage, WordIdx w) -{ - Disjunct *dj; - - if (linkage->num_words <= w) return 99.999; /* bounds-check */ - dj = linkage->chosen_disjuncts[w]; - - /* dj may be null, if the word didn't participate in the parse. 
*/ - if (NULL == dj) return 99.999; - - return lg_corpus_disjunct_score(linkage, w); -} - -const char * linkage_get_word(const Linkage linkage, WordIdx w) -{ - if (!linkage) return NULL; - if (linkage->num_words <= w) return NULL; /* bounds-check */ - return linkage->word[w]; -} - -int linkage_unused_word_cost(const Linkage linkage) -{ - /* The sat solver (currently) fails to fill in info */ - if (!linkage) return 0; - return linkage->lifo.unused_word_cost; -} - -double linkage_disjunct_cost(const Linkage linkage) -{ - /* The sat solver (currently) fails to fill in info */ - if (!linkage) return 0.0; - return linkage->lifo.disjunct_cost; -} - -int linkage_link_cost(const Linkage linkage) -{ - /* The sat solver (currently) fails to fill in info */ - if (!linkage) return 0; - return linkage->lifo.link_cost; -} - -double linkage_corpus_cost(const Linkage linkage) -{ - /* The sat solver (currently) fails to fill in info */ - if (!linkage) return 0.0; - return linkage->lifo.corpus_cost; -} diff -Nru link-grammar-5.3.16/link-grammar/linkage.h link-grammar-5.5.0/link-grammar/linkage.h --- link-grammar-5.3.16/link-grammar/linkage.h 2017-01-27 22:07:11.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/linkage.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,9 +0,0 @@ -#ifndef _LINKAGE_H -#define _LINKAGE_H -void compute_chosen_words(Sentence, Linkage, Parse_Options); - -void partial_init_linkage(Sentence, Linkage, unsigned int N_words); -void check_link_size(Linkage); -void remove_empty_words(Linkage); -void free_linkage(Linkage); -#endif /* _LINKAGE_H */ diff -Nru link-grammar-5.3.16/link-grammar/link-features.h link-grammar-5.5.0/link-grammar/link-features.h --- link-grammar-5.3.16/link-grammar/link-features.h 2017-04-15 21:51:08.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/link-features.h 2018-04-29 21:40:31.000000000 +0000 @@ -29,9 +29,14 @@ #endif #define LINK_MAJOR_VERSION 5 -#define LINK_MINOR_VERSION 3 -#define LINK_MICRO_VERSION 16 +#define LINK_MINOR_VERSION 5 
+#define LINK_MICRO_VERSION 0 -#define LINK_VERSION_STRING "5.3.16" +#define LINK_VERSION_STRING "5.5.0" + +#define LG_HOST_OS lg_str(linux-gnu) +#define LG_CPPFLAGS "CPPFLAGS=" lg_str() +#define LG_CFLAGS "CFLAGS=" lg_str(-D_DEFAULT_SOURCE -std=c11 -D_BSD_SOURCE -D_SVID_SOURCE -D_GNU_SOURCE -D_ISOC11_SOURCE -g -O2 -O3) +#define LG_DEFS lg_str(-DPACKAGE_NAME="link-grammar" -DPACKAGE_TARNAME="link-grammar" -DPACKAGE_VERSION="5.5.0" -DPACKAGE_STRING="link-grammar 5.5.0" -DPACKAGE_BUGREPORT="link-grammar@googlegroups.com" -DPACKAGE_URL="" -DPACKAGE="link-grammar" -DVERSION="5.5.0" -DYYTEXT_POINTER=1 -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=".libs/" -DHAVE_STRNDUP=1 -DHAVE_STRTOK_R=1 -DHAVE_ALIGNED_ALLOC=1 -DHAVE_POSIX_MEMALIGN=1 -DHAVE_ALLOCA_H=1 -DHAVE_ALLOCA=1 -DHAVE_FORK=1 -DHAVE_VFORK=1 -DHAVE_WORKING_VFORK=1 -DHAVE_WORKING_FORK=1 -DHAVE_PRCTL=1 -D__STDC_FORMAT_MACROS=1 -D__STDC_LIMIT_MACROS=1 -DHAVE_LOCALE_T_IN_LOCALE_H=1 -DHAVE_STDATOMIC_H=1 -DTLS=__thread -DHAVE_LIBSTDC__=1 -DHAVE_MKLIT=1 -DUSE_SAT_SOLVER=1 -DUSE_WORDGRAPH_DISPLAY=1 -DHAVE_SQLITE=1 -DHAVE_LIBASPELL=1 -DHAVE_ASPELL=1 -DHAVE_EDITLINE=1 -DHAVE_WIDECHAR_EDITLINE=1 -DHAVE_REGEXEC=1 -DHAVE_MAYBE_UNINITIALIZED=1) #endif diff -Nru link-grammar-5.3.16/link-grammar/link-features.h.in link-grammar-5.5.0/link-grammar/link-features.h.in --- link-grammar-5.3.16/link-grammar/link-features.h.in 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/link-features.h.in 2018-04-22 14:57:33.000000000 +0000 @@ -34,4 +34,9 @@ #define LINK_VERSION_STRING "@LINK_MAJOR_VERSION@.@LINK_MINOR_VERSION@.@LINK_MICRO_VERSION@" +#define LG_HOST_OS lg_str(@HOST_OS@) +#define LG_CPPFLAGS "CPPFLAGS=" lg_str(@CPPFLAGS@) +#define LG_CFLAGS "CFLAGS=" lg_str(@CFLAGS@) +#define LG_DEFS lg_str(@LG_DEFS@) + #endif diff -Nru 
link-grammar-5.3.16/link-grammar/link-grammar.def link-grammar-5.5.0/link-grammar/link-grammar.def --- link-grammar-5.3.16/link-grammar/link-grammar.def 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/link-grammar.def 2018-04-25 17:04:50.000000000 +0000 @@ -1,7 +1,8 @@ linkgrammar_get_version +linkgrammar_get_configuration +linkgrammar_open_data_file linkgrammar_get_dict_version linkgrammar_get_dict_locale -dictionary_create_from_utf8 dictionary_create_lang dictionary_create_default_lang dictionary_get_lang @@ -72,6 +73,7 @@ sentence_num_violations sentence_disjunct_cost sentence_link_cost +sentence_display_wordgraph linkage_create linkage_delete linkage_get_num_words @@ -84,11 +86,15 @@ linkage_get_link_rlabel linkage_get_link_num_domains linkage_get_link_domain_names -linkage_get_words linkage_get_disjunct_str linkage_get_disjunct_cost linkage_get_disjunct_corpus_score +linkage_get_words linkage_get_word +linkage_get_word_byte_start +linkage_get_word_byte_end +linkage_get_word_char_start +linkage_get_word_char_end linkage_print_constituent_tree linkage_free_constituent_tree_str linkage_print_diagram @@ -108,9 +114,13 @@ linkage_corpus_cost linkage_link_cost linkage_get_violation_name -post_process_open -post_process_close -linkage_post_process +lg_error_set_handler +lg_error_set_handler_data +lg_error_formatmsg +lg_error_printall +lg_error_clearall +lg_error_flush +lg_exp_get_string prt_error lg_compute_disjunct_strings lg_expand_disjunct_list @@ -121,10 +131,5 @@ count_disjuncts print_one_disjunct build_disjuncts_for_exp -left_print_string regex_tokenizer_test -lg_error_set_handler -lg_error_set_handler_data -lg_error_formatmsg -lg_error_printall -lg_error_clearall +utf8_strwidth diff -Nru link-grammar-5.3.16/link-grammar/link-includes.h link-grammar-5.5.0/link-grammar/link-includes.h --- link-grammar-5.3.16/link-grammar/link-includes.h 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/link-includes.h 2018-04-25 
17:04:50.000000000 +0000 @@ -47,6 +47,9 @@ linkgrammar_get_dict_version(Dictionary); link_public_api(const char *) + linkgrammar_get_configuration(void); + +link_public_api(const char *) linkgrammar_get_dict_locale(Dictionary); /********************************************************************** @@ -87,6 +90,8 @@ lg_error_printall(lg_error_handler, void *data); link_public_api(int) lg_error_clearall(void); +link_public_api(bool) + lg_error_flush(void); /********************************************************************** * @@ -108,6 +113,8 @@ dictionary_set_data_dir(const char * path); link_public_api(char *) dictionary_get_data_dir(void); +link_public_api(FILE *) + linkgrammar_open_data_file(const char *); /********************************************************************** * @@ -271,6 +278,8 @@ sentence_disjunct_cost(Sentence sent, LinkageIdx linkage_num); link_public_api(int) sentence_link_cost(Sentence sent, LinkageIdx linkage_num); +link_public_api(bool) + sentence_display_wordgraph(Sentence sent, const char *modestr); /********************************************************************** * @@ -359,22 +368,32 @@ link_public_api(const char *) linkage_get_violation_name(const Linkage linkage); +/* Experimental API - subject to changes. */ +link_public_api(size_t) + linkage_get_word_byte_start(const Linkage linkage, WordIdx w); +link_public_api(size_t) + linkage_get_word_byte_end(const Linkage linkage, WordIdx w); +link_public_api(size_t) + linkage_get_word_char_start(const Linkage linkage, WordIdx w); +link_public_api(size_t) + linkage_get_word_char_end(const Linkage linkage, WordIdx w); + /********************************************************************** * * Internal functions -- do not use these in new code! - * These are not intended for general public use, but are required to - * get the link-parser executable to link under MSVC6. - * XXX FIXME we're on msvc14 now, do we still need these?? 
+ * They are used in the link-parser code (which needs their signature to + * compile), but are not intended for general public use. * ***********************************************************************/ -link_public_api(void) +link_public_api(size_t) + utf8_strwidth(const char *); + +link_public_api(char *) dict_display_word_expr(Dictionary dict, const char *, Parse_Options opts); -link_public_api(void) +link_public_api(char *) dict_display_word_info(Dictionary dict, const char *, Parse_Options opts); -link_public_api(void) - left_print_string(FILE* fp, const char *, int); link_public_api(bool) lg_expand_disjunct_list(Sentence sent); @@ -382,7 +401,7 @@ * * Internal functions -- do not use these in new code! * These are not intended for general public use, but are required to - * work around certain Micorsoft Windows linking oddities + * work around certain Microsoft Windows linking oddities * (specifically, to be callable from the JNI bindings library.) * ***********************************************************************/ @@ -413,29 +432,13 @@ #define GNUC_DEPRECATED #endif -#if defined(_MSC_VER) && _MSC_VER > 1200 /* Only if newer than MSVC6 */ +#if defined(_MSC_VER) #define MS_DEPRECATED __declspec(deprecated) #else #define MS_DEPRECATED #endif -/********************************************************************** - * - * Functions that allow special-purpose post-processing of linkages - * - ***********************************************************************/ - #ifndef SWIG -/** Do not use in new code! 
*/ -typedef struct Postprocessor_s PostProcessor; - -MS_DEPRECATED link_public_api(PostProcessor *) - post_process_open(const char *path) GNUC_DEPRECATED; -MS_DEPRECATED link_public_api(void) - post_process_close(PostProcessor *) GNUC_DEPRECATED; -MS_DEPRECATED link_public_api(void) - linkage_post_process(Linkage, PostProcessor *) GNUC_DEPRECATED; - LINK_END_DECLS #endif /* !SWIG */ diff -Nru link-grammar-5.3.16/link-grammar/Makefile.am link-grammar-5.5.0/link-grammar/Makefile.am --- link-grammar-5.3.16/link-grammar/Makefile.am 2017-04-15 21:47:50.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/Makefile.am 2018-04-29 21:39:42.000000000 +0000 @@ -17,7 +17,6 @@ endif if OS_WIN32 - install-libtool-import-lib: $(INSTALL) .libs/liblink-grammar.dll.a $(DESTDIR)$(libdir) @@ -28,12 +27,15 @@ uninstall-libtool-import-lib: endif -DEFS = @DEFS@ -DVERSION=\"@VERSION@\" -DDICTIONARY_DIR=\"$(pkgdatadir)\" +DEFS = @DEFS@ -DVERSION=\"@VERSION@\" -DDICTIONARY_DIR=\"$(pkgdatadir)\" -DCC=\"$(CC)\" # $(top_builddir) to pick up autogened link-grammar/link-features.h -AM_CPPFLAGS = -I$(top_srcdir) -I$(top_builddir) $(WARN_CFLAGS) \ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir) $(WARN_CFLAGS) \ $(HUNSPELL_CFLAGS) +post-process/pp_lexer.lo: AM_CPPFLAGS += -I$(top_srcdir)/link-grammar/post-process +MAINTAINERCLEANFILES = $(top_srcdir)/link-grammar/post-process/pp_lexer.c + lib_LTLIBRARIES = liblink-grammar.la EXTRA_liblink_grammar_la_DEPENDENCIES = $(srcdir)/link-grammar.def @@ -43,10 +45,6 @@ -export-symbols $(srcdir)/link-grammar.def \ $(LINK_CFLAGS) -if OS_WIN32 -liblink_grammar_la_LDFLAGS += -D_JNI_IMPLEMENTATION_ -Wl,--kill-at -endif - liblink_grammar_la_LIBADD = ${REGEX_LIBS} if HAVE_HUNSPELL @@ -78,106 +76,136 @@ liblink_grammar_la_LIBADD += -lm liblink_grammar_la_SOURCES = \ - analyze-linkage.c \ - anysplit.c \ api.c \ - build-disjuncts.c \ - constituents.c \ - count.c \ - dict-common.c \ + connectors.c \ + dict-common/dict-common.c \ + dict-common/dict-impl.c \ + 
dict-common/dict-utils.c \ + dict-common/file-utils.c \ + dict-common/idiom.c \ + dict-common/print-dict.c \ + dict-common/regex-morph.c \ dict-file/dictionary.c \ dict-file/read-dict.c \ dict-file/read-regex.c \ dict-file/word-file.c \ dict-sql/read-sql.c \ disjunct-utils.c \ - disjuncts.c \ error.c \ - expand.c \ - extract-links.c \ - fast-match.c \ - histogram.c \ - idiom.c \ - linkage.c \ - post-process.c \ - pp_knowledge.c \ - pp_lexer.c \ - pp_linkset.c \ - preparation.c \ - print.c \ - print-util.c \ - prune.c \ - regex-morph.c \ - regex-tokenizer.c \ + linkage/analyze-linkage.c \ + linkage/freeli.c \ + linkage/linkage.c \ + linkage/lisjuncts.c \ + linkage/sane.c \ + linkage/score.c \ + memory-pool.c \ + parse/count.c \ + parse/extract-links.c \ + parse/fast-match.c \ + parse/histogram.c \ + parse/parse.c \ + parse/preparation.c \ + parse/prune.c \ + post-process/constituents.c \ + post-process/post-process.c \ + post-process/pp_knowledge.c \ + post-process/pp_lexer.l \ + post-process/pp_linkset.c \ + prepare/build-disjuncts.c \ + prepare/expand.c \ + prepare/exprune.c \ + print/print.c \ + print/print-util.c \ + print/wcwidth.c \ resources.c \ - score.c \ - spellcheck-aspell.c \ - spellcheck-hun.c \ string-set.c \ - tokenize.c \ + tokenize/anysplit.c \ + tokenize/spellcheck-aspell.c \ + tokenize/spellcheck-hun.c \ + tokenize/regex-tokenizer.c \ + tokenize/tokenize.c \ + tokenize/wg-display.c \ + tokenize/wordgraph.c \ utilities.c \ - wcwidth.c \ - wordgraph.c \ - word-utils.c \ - anysplit.h \ + \ api-structures.h \ api-types.h \ - analyze-linkage.h \ - build-disjuncts.h \ - count.h \ + connectors.h \ + dict-common/dict-affix.h \ + dict-common/dict-api.h \ + dict-common/dict-common.h \ + dict-common/dict-defines.h \ + dict-common/dict-impl.h \ + dict-common/dict-structures.h \ + dict-common/dict-utils.h \ + dict-common/file-utils.h \ + dict-common/idiom.h \ + dict-common/regex-morph.h \ dict-file/read-dict.h \ dict-file/read-regex.h \ dict-file/word-file.h \ 
dict-sql/read-sql.h \ - dict-api.h \ - dict-common.h \ - dict-structures.h \ disjunct-utils.h \ - disjuncts.h \ error.h \ - expand.h \ externs.h \ - extract-links.h \ - fast-match.h \ - histogram.h \ - idiom.h \ lg_assert.h \ link-includes.h \ - linkage.h \ - post-process.h \ - pp_knowledge.h \ - pp_lexer.h \ - pp_linkset.h \ - preparation.h \ - print.h \ - print-util.h \ - prune.h \ - regex-morph.h \ - regex-tokenizer.h \ + linkage/analyze-linkage.h \ + linkage/linkage.h \ + linkage/lisjuncts.h \ + linkage/sane.h \ + linkage/score.h \ + memory-pool.h \ + parse/count.h \ + parse/extract-links.h \ + parse/fast-match.h \ + parse/histogram.h \ + parse/parse.h \ + parse/preparation.h \ + parse/prune.h \ + post-process/post-process.h \ + post-process/pp_knowledge.h \ + post-process/pp_lexer.h \ + post-process/pp_linkset.h \ + post-process/pp-structures.h \ + prepare/build-disjuncts.h \ + prepare/expand.h \ + prepare/exprune.h \ + print/print.h \ + print/print-util.h \ + print/wcwidth.h \ resources.h \ - score.h \ - spellcheck.h \ string-set.h \ - structures.h \ - tokenize.h \ - utilities.h \ - wcwidth.h \ - wordgraph.h \ - word-utils.h + tokenize/anysplit.h \ + tokenize/spellcheck.h \ + tokenize/regex-tokenizer.h \ + tokenize/tok-structures.h \ + tokenize/tokenize.h \ + tokenize/word-structures.h \ + tokenize/wordgraph.h \ + utilities.h liblink_grammar_includedir = $(includedir)/link-grammar liblink_grammar_include_HEADERS = \ link-features.h \ link-includes.h \ - dict-api.h \ - dict-structures.h + dict-common/dict-api.h \ + dict-common/dict-structures.h + +uninstall-hook: + -rmdir $(liblink_grammar_includedir) EXTRA_DIST= \ link-grammar.def \ README.md \ dict-sql/dict.sql \ dict-sql/demo.sql \ - dict-sql/README + dict-sql/README.md \ + tokenize/README.md + +# Clean up generated file that Windows MSVC compilation leaves behind. 
+clean-local: + rm -f $(top_builddir)/link-grammar/link-features.h # ----------------------------------------------------------- install-data-local: install-libtool-import-lib diff -Nru link-grammar-5.3.16/link-grammar/Makefile.in link-grammar-5.5.0/link-grammar/Makefile.in --- link-grammar-5.3.16/link-grammar/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -95,14 +95,13 @@ host_triplet = @host@ @WITH_SAT_SOLVER_TRUE@am__append_1 = $(MINISAT_DIR) sat-solver @WITH_CORPUS_TRUE@am__append_2 = corpus -@OS_WIN32_TRUE@am__append_3 = -D_JNI_IMPLEMENTATION_ -Wl,--kill-at -@HAVE_HUNSPELL_TRUE@am__append_4 = ${HUNSPELL_LIBS} -@HAVE_ASPELL_TRUE@am__append_5 = ${ASPELL_LIBS} -@HAVE_SQLITE_TRUE@am__append_6 = ${SQLITE3_LIBS} -@WITH_SAT_SOLVER_TRUE@am__append_7 = $(top_builddir)/link-grammar/sat-solver/libsat-solver.la -@LIBMINISAT_BUNDLED_TRUE@@WITH_SAT_SOLVER_TRUE@am__append_8 = $(top_builddir)/link-grammar/minisat/libminisat.la -@LIBMINISAT_BUNDLED_FALSE@@WITH_SAT_SOLVER_TRUE@am__append_9 = ${MINISAT_LIBS} -@WITH_CORPUS_TRUE@am__append_10 = $(top_builddir)/link-grammar/corpus/liblink-corpus.la ${SQLITE3_LIBS} +@HAVE_HUNSPELL_TRUE@am__append_3 = ${HUNSPELL_LIBS} +@HAVE_ASPELL_TRUE@am__append_4 = ${ASPELL_LIBS} +@HAVE_SQLITE_TRUE@am__append_5 = ${SQLITE3_LIBS} +@WITH_SAT_SOLVER_TRUE@am__append_6 = $(top_builddir)/link-grammar/sat-solver/libsat-solver.la +@LIBMINISAT_BUNDLED_TRUE@@WITH_SAT_SOLVER_TRUE@am__append_7 = $(top_builddir)/link-grammar/minisat/libminisat.la +@LIBMINISAT_BUNDLED_FALSE@@WITH_SAT_SOLVER_TRUE@am__append_8 = ${MINISAT_LIBS} +@WITH_CORPUS_TRUE@am__append_9 = $(top_builddir)/link-grammar/corpus/liblink-corpus.la ${SQLITE3_LIBS} subdir = link-grammar ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/m4/libtool.m4 \ @@ -155,20 +154,30 @@ @WITH_CORPUS_TRUE@ $(am__DEPENDENCIES_1) liblink_grammar_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \ 
$(am__DEPENDENCIES_2) $(am__DEPENDENCIES_3) \ - $(am__DEPENDENCIES_4) $(am__append_7) $(am__append_8) \ + $(am__DEPENDENCIES_4) $(am__append_6) $(am__append_7) \ $(am__DEPENDENCIES_5) $(am__DEPENDENCIES_6) am__dirstamp = $(am__leading_dot)dirstamp -am_liblink_grammar_la_OBJECTS = analyze-linkage.lo anysplit.lo api.lo \ - build-disjuncts.lo constituents.lo count.lo dict-common.lo \ - dict-file/dictionary.lo dict-file/read-dict.lo \ - dict-file/read-regex.lo dict-file/word-file.lo \ - dict-sql/read-sql.lo disjunct-utils.lo disjuncts.lo error.lo \ - expand.lo extract-links.lo fast-match.lo histogram.lo idiom.lo \ - linkage.lo post-process.lo pp_knowledge.lo pp_lexer.lo \ - pp_linkset.lo preparation.lo print.lo print-util.lo prune.lo \ - regex-morph.lo regex-tokenizer.lo resources.lo score.lo \ - spellcheck-aspell.lo spellcheck-hun.lo string-set.lo \ - tokenize.lo utilities.lo wcwidth.lo wordgraph.lo word-utils.lo +am_liblink_grammar_la_OBJECTS = api.lo connectors.lo \ + dict-common/dict-common.lo dict-common/dict-impl.lo \ + dict-common/dict-utils.lo dict-common/file-utils.lo \ + dict-common/idiom.lo dict-common/print-dict.lo \ + dict-common/regex-morph.lo dict-file/dictionary.lo \ + dict-file/read-dict.lo dict-file/read-regex.lo \ + dict-file/word-file.lo dict-sql/read-sql.lo disjunct-utils.lo \ + error.lo linkage/analyze-linkage.lo linkage/freeli.lo \ + linkage/linkage.lo linkage/lisjuncts.lo linkage/sane.lo \ + linkage/score.lo memory-pool.lo parse/count.lo \ + parse/extract-links.lo parse/fast-match.lo parse/histogram.lo \ + parse/parse.lo parse/preparation.lo parse/prune.lo \ + post-process/constituents.lo post-process/post-process.lo \ + post-process/pp_knowledge.lo post-process/pp_lexer.lo \ + post-process/pp_linkset.lo prepare/build-disjuncts.lo \ + prepare/expand.lo prepare/exprune.lo print/print.lo \ + print/print-util.lo print/wcwidth.lo resources.lo \ + string-set.lo tokenize/anysplit.lo \ + tokenize/spellcheck-aspell.lo tokenize/spellcheck-hun.lo \ + 
tokenize/regex-tokenizer.lo tokenize/tokenize.lo \ + tokenize/wg-display.lo tokenize/wordgraph.lo utilities.lo liblink_grammar_la_OBJECTS = $(am_liblink_grammar_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -212,6 +221,15 @@ am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) am__v_CCLD_0 = @echo " CCLD " $@; am__v_CCLD_1 = +@MAINTAINER_MODE_FALSE@am__skiplex = test -f $@ || +LEXCOMPILE = $(LEX) $(AM_LFLAGS) $(LFLAGS) +LTLEXCOMPILE = $(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(LEX) $(AM_LFLAGS) $(LFLAGS) +AM_V_LEX = $(am__v_LEX_@AM_V@) +am__v_LEX_ = $(am__v_LEX_@AM_DEFAULT_V@) +am__v_LEX_0 = @echo " LEX " $@; +am__v_LEX_1 = +YLWRAP = $(top_srcdir)/ylwrap SOURCES = $(liblink_grammar_la_SOURCES) DIST_SOURCES = $(liblink_grammar_la_SOURCES) RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \ @@ -257,7 +275,8 @@ CTAGS = ctags DIST_SUBDIRS = minisat sat-solver corpus am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/link-features.h.in \ - $(top_srcdir)/depcomp + $(top_srcdir)/depcomp $(top_srcdir)/ylwrap \ + post-process/pp_lexer.c DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) am__relativize = \ dir0=`pwd`; \ @@ -290,6 +309,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -306,7 +326,7 @@ CXXDEPMODE = @CXXDEPMODE@ CXXFLAGS = @CXXFLAGS@ CYGPATH_W = @CYGPATH_W@ -DEFS = @DEFS@ -DVERSION=\"@VERSION@\" -DDICTIONARY_DIR=\"$(pkgdatadir)\" +DEFS = @DEFS@ -DVERSION=\"@VERSION@\" -DDICTIONARY_DIR=\"$(pkgdatadir)\" -DCC=\"$(CC)\" DEPDIR = @DEPDIR@ DLLTOOL = @DLLTOOL@ DSYMUTIL = @DSYMUTIL@ @@ -318,6 +338,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -326,13 +347,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = 
@INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -399,12 +423,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -469,125 +493,151 @@ @LIBMINISAT_BUNDLED_TRUE@MINISAT_DIR = minisat # $(top_builddir) to pick up autogened link-grammar/link-features.h -AM_CPPFLAGS = -I$(top_srcdir) -I$(top_builddir) $(WARN_CFLAGS) \ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir) $(WARN_CFLAGS) \ $(HUNSPELL_CFLAGS) +MAINTAINERCLEANFILES = $(top_srcdir)/link-grammar/post-process/pp_lexer.c lib_LTLIBRARIES = liblink-grammar.la EXTRA_liblink_grammar_la_DEPENDENCIES = $(srcdir)/link-grammar.def liblink_grammar_la_LDFLAGS = -version-info @VERSION_INFO@ \ - -export-dynamic -no-undefined -export-symbols \ - $(srcdir)/link-grammar.def $(LINK_CFLAGS) $(am__append_3) + -export-dynamic -no-undefined \ + -export-symbols $(srcdir)/link-grammar.def \ + $(LINK_CFLAGS) + # Math libraries are needed for floorf, etc. 
-liblink_grammar_la_LIBADD = ${REGEX_LIBS} $(am__append_4) \ - $(am__append_5) $(am__append_6) $(am__append_7) \ - $(am__append_8) $(am__append_9) $(am__append_10) -lm +liblink_grammar_la_LIBADD = ${REGEX_LIBS} $(am__append_3) \ + $(am__append_4) $(am__append_5) $(am__append_6) \ + $(am__append_7) $(am__append_8) $(am__append_9) -lm liblink_grammar_la_SOURCES = \ - analyze-linkage.c \ - anysplit.c \ api.c \ - build-disjuncts.c \ - constituents.c \ - count.c \ - dict-common.c \ + connectors.c \ + dict-common/dict-common.c \ + dict-common/dict-impl.c \ + dict-common/dict-utils.c \ + dict-common/file-utils.c \ + dict-common/idiom.c \ + dict-common/print-dict.c \ + dict-common/regex-morph.c \ dict-file/dictionary.c \ dict-file/read-dict.c \ dict-file/read-regex.c \ dict-file/word-file.c \ dict-sql/read-sql.c \ disjunct-utils.c \ - disjuncts.c \ error.c \ - expand.c \ - extract-links.c \ - fast-match.c \ - histogram.c \ - idiom.c \ - linkage.c \ - post-process.c \ - pp_knowledge.c \ - pp_lexer.c \ - pp_linkset.c \ - preparation.c \ - print.c \ - print-util.c \ - prune.c \ - regex-morph.c \ - regex-tokenizer.c \ + linkage/analyze-linkage.c \ + linkage/freeli.c \ + linkage/linkage.c \ + linkage/lisjuncts.c \ + linkage/sane.c \ + linkage/score.c \ + memory-pool.c \ + parse/count.c \ + parse/extract-links.c \ + parse/fast-match.c \ + parse/histogram.c \ + parse/parse.c \ + parse/preparation.c \ + parse/prune.c \ + post-process/constituents.c \ + post-process/post-process.c \ + post-process/pp_knowledge.c \ + post-process/pp_lexer.l \ + post-process/pp_linkset.c \ + prepare/build-disjuncts.c \ + prepare/expand.c \ + prepare/exprune.c \ + print/print.c \ + print/print-util.c \ + print/wcwidth.c \ resources.c \ - score.c \ - spellcheck-aspell.c \ - spellcheck-hun.c \ string-set.c \ - tokenize.c \ + tokenize/anysplit.c \ + tokenize/spellcheck-aspell.c \ + tokenize/spellcheck-hun.c \ + tokenize/regex-tokenizer.c \ + tokenize/tokenize.c \ + tokenize/wg-display.c \ + 
tokenize/wordgraph.c \ utilities.c \ - wcwidth.c \ - wordgraph.c \ - word-utils.c \ - anysplit.h \ + \ api-structures.h \ api-types.h \ - analyze-linkage.h \ - build-disjuncts.h \ - count.h \ + connectors.h \ + dict-common/dict-affix.h \ + dict-common/dict-api.h \ + dict-common/dict-common.h \ + dict-common/dict-defines.h \ + dict-common/dict-impl.h \ + dict-common/dict-structures.h \ + dict-common/dict-utils.h \ + dict-common/file-utils.h \ + dict-common/idiom.h \ + dict-common/regex-morph.h \ dict-file/read-dict.h \ dict-file/read-regex.h \ dict-file/word-file.h \ dict-sql/read-sql.h \ - dict-api.h \ - dict-common.h \ - dict-structures.h \ disjunct-utils.h \ - disjuncts.h \ error.h \ - expand.h \ externs.h \ - extract-links.h \ - fast-match.h \ - histogram.h \ - idiom.h \ lg_assert.h \ link-includes.h \ - linkage.h \ - post-process.h \ - pp_knowledge.h \ - pp_lexer.h \ - pp_linkset.h \ - preparation.h \ - print.h \ - print-util.h \ - prune.h \ - regex-morph.h \ - regex-tokenizer.h \ + linkage/analyze-linkage.h \ + linkage/linkage.h \ + linkage/lisjuncts.h \ + linkage/sane.h \ + linkage/score.h \ + memory-pool.h \ + parse/count.h \ + parse/extract-links.h \ + parse/fast-match.h \ + parse/histogram.h \ + parse/parse.h \ + parse/preparation.h \ + parse/prune.h \ + post-process/post-process.h \ + post-process/pp_knowledge.h \ + post-process/pp_lexer.h \ + post-process/pp_linkset.h \ + post-process/pp-structures.h \ + prepare/build-disjuncts.h \ + prepare/expand.h \ + prepare/exprune.h \ + print/print.h \ + print/print-util.h \ + print/wcwidth.h \ resources.h \ - score.h \ - spellcheck.h \ string-set.h \ - structures.h \ - tokenize.h \ - utilities.h \ - wcwidth.h \ - wordgraph.h \ - word-utils.h + tokenize/anysplit.h \ + tokenize/spellcheck.h \ + tokenize/regex-tokenizer.h \ + tokenize/tok-structures.h \ + tokenize/tokenize.h \ + tokenize/word-structures.h \ + tokenize/wordgraph.h \ + utilities.h liblink_grammar_includedir = $(includedir)/link-grammar 
liblink_grammar_include_HEADERS = \ link-features.h \ link-includes.h \ - dict-api.h \ - dict-structures.h + dict-common/dict-api.h \ + dict-common/dict-structures.h EXTRA_DIST = \ link-grammar.def \ README.md \ dict-sql/dict.sql \ dict-sql/demo.sql \ - dict-sql/README + dict-sql/README.md \ + tokenize/README.md all: all-recursive .SUFFIXES: -.SUFFIXES: .c .lo .o .obj +.SUFFIXES: .c .l .lo .o .obj $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) @for dep in $?; do \ case '$(am__configure_deps)' in \ @@ -654,6 +704,26 @@ echo rm -f $${locs}; \ rm -f $${locs}; \ } +dict-common/$(am__dirstamp): + @$(MKDIR_P) dict-common + @: > dict-common/$(am__dirstamp) +dict-common/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) dict-common/$(DEPDIR) + @: > dict-common/$(DEPDIR)/$(am__dirstamp) +dict-common/dict-common.lo: dict-common/$(am__dirstamp) \ + dict-common/$(DEPDIR)/$(am__dirstamp) +dict-common/dict-impl.lo: dict-common/$(am__dirstamp) \ + dict-common/$(DEPDIR)/$(am__dirstamp) +dict-common/dict-utils.lo: dict-common/$(am__dirstamp) \ + dict-common/$(DEPDIR)/$(am__dirstamp) +dict-common/file-utils.lo: dict-common/$(am__dirstamp) \ + dict-common/$(DEPDIR)/$(am__dirstamp) +dict-common/idiom.lo: dict-common/$(am__dirstamp) \ + dict-common/$(DEPDIR)/$(am__dirstamp) +dict-common/print-dict.lo: dict-common/$(am__dirstamp) \ + dict-common/$(DEPDIR)/$(am__dirstamp) +dict-common/regex-morph.lo: dict-common/$(am__dirstamp) \ + dict-common/$(DEPDIR)/$(am__dirstamp) dict-file/$(am__dirstamp): @$(MKDIR_P) dict-file @: > dict-file/$(am__dirstamp) @@ -676,61 +746,179 @@ @: > dict-sql/$(DEPDIR)/$(am__dirstamp) dict-sql/read-sql.lo: dict-sql/$(am__dirstamp) \ dict-sql/$(DEPDIR)/$(am__dirstamp) +linkage/$(am__dirstamp): + @$(MKDIR_P) linkage + @: > linkage/$(am__dirstamp) +linkage/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) linkage/$(DEPDIR) + @: > linkage/$(DEPDIR)/$(am__dirstamp) +linkage/analyze-linkage.lo: linkage/$(am__dirstamp) \ + 
linkage/$(DEPDIR)/$(am__dirstamp) +linkage/freeli.lo: linkage/$(am__dirstamp) \ + linkage/$(DEPDIR)/$(am__dirstamp) +linkage/linkage.lo: linkage/$(am__dirstamp) \ + linkage/$(DEPDIR)/$(am__dirstamp) +linkage/lisjuncts.lo: linkage/$(am__dirstamp) \ + linkage/$(DEPDIR)/$(am__dirstamp) +linkage/sane.lo: linkage/$(am__dirstamp) \ + linkage/$(DEPDIR)/$(am__dirstamp) +linkage/score.lo: linkage/$(am__dirstamp) \ + linkage/$(DEPDIR)/$(am__dirstamp) +parse/$(am__dirstamp): + @$(MKDIR_P) parse + @: > parse/$(am__dirstamp) +parse/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) parse/$(DEPDIR) + @: > parse/$(DEPDIR)/$(am__dirstamp) +parse/count.lo: parse/$(am__dirstamp) parse/$(DEPDIR)/$(am__dirstamp) +parse/extract-links.lo: parse/$(am__dirstamp) \ + parse/$(DEPDIR)/$(am__dirstamp) +parse/fast-match.lo: parse/$(am__dirstamp) \ + parse/$(DEPDIR)/$(am__dirstamp) +parse/histogram.lo: parse/$(am__dirstamp) \ + parse/$(DEPDIR)/$(am__dirstamp) +parse/parse.lo: parse/$(am__dirstamp) parse/$(DEPDIR)/$(am__dirstamp) +parse/preparation.lo: parse/$(am__dirstamp) \ + parse/$(DEPDIR)/$(am__dirstamp) +parse/prune.lo: parse/$(am__dirstamp) parse/$(DEPDIR)/$(am__dirstamp) +post-process/$(am__dirstamp): + @$(MKDIR_P) post-process + @: > post-process/$(am__dirstamp) +post-process/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) post-process/$(DEPDIR) + @: > post-process/$(DEPDIR)/$(am__dirstamp) +post-process/constituents.lo: post-process/$(am__dirstamp) \ + post-process/$(DEPDIR)/$(am__dirstamp) +post-process/post-process.lo: post-process/$(am__dirstamp) \ + post-process/$(DEPDIR)/$(am__dirstamp) +post-process/pp_knowledge.lo: post-process/$(am__dirstamp) \ + post-process/$(DEPDIR)/$(am__dirstamp) +post-process/pp_lexer.lo: post-process/$(am__dirstamp) \ + post-process/$(DEPDIR)/$(am__dirstamp) +post-process/pp_linkset.lo: post-process/$(am__dirstamp) \ + post-process/$(DEPDIR)/$(am__dirstamp) +prepare/$(am__dirstamp): + @$(MKDIR_P) prepare + @: > prepare/$(am__dirstamp) +prepare/$(DEPDIR)/$(am__dirstamp): 
+ @$(MKDIR_P) prepare/$(DEPDIR) + @: > prepare/$(DEPDIR)/$(am__dirstamp) +prepare/build-disjuncts.lo: prepare/$(am__dirstamp) \ + prepare/$(DEPDIR)/$(am__dirstamp) +prepare/expand.lo: prepare/$(am__dirstamp) \ + prepare/$(DEPDIR)/$(am__dirstamp) +prepare/exprune.lo: prepare/$(am__dirstamp) \ + prepare/$(DEPDIR)/$(am__dirstamp) +print/$(am__dirstamp): + @$(MKDIR_P) print + @: > print/$(am__dirstamp) +print/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) print/$(DEPDIR) + @: > print/$(DEPDIR)/$(am__dirstamp) +print/print.lo: print/$(am__dirstamp) print/$(DEPDIR)/$(am__dirstamp) +print/print-util.lo: print/$(am__dirstamp) \ + print/$(DEPDIR)/$(am__dirstamp) +print/wcwidth.lo: print/$(am__dirstamp) \ + print/$(DEPDIR)/$(am__dirstamp) +tokenize/$(am__dirstamp): + @$(MKDIR_P) tokenize + @: > tokenize/$(am__dirstamp) +tokenize/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) tokenize/$(DEPDIR) + @: > tokenize/$(DEPDIR)/$(am__dirstamp) +tokenize/anysplit.lo: tokenize/$(am__dirstamp) \ + tokenize/$(DEPDIR)/$(am__dirstamp) +tokenize/spellcheck-aspell.lo: tokenize/$(am__dirstamp) \ + tokenize/$(DEPDIR)/$(am__dirstamp) +tokenize/spellcheck-hun.lo: tokenize/$(am__dirstamp) \ + tokenize/$(DEPDIR)/$(am__dirstamp) +tokenize/regex-tokenizer.lo: tokenize/$(am__dirstamp) \ + tokenize/$(DEPDIR)/$(am__dirstamp) +tokenize/tokenize.lo: tokenize/$(am__dirstamp) \ + tokenize/$(DEPDIR)/$(am__dirstamp) +tokenize/wg-display.lo: tokenize/$(am__dirstamp) \ + tokenize/$(DEPDIR)/$(am__dirstamp) +tokenize/wordgraph.lo: tokenize/$(am__dirstamp) \ + tokenize/$(DEPDIR)/$(am__dirstamp) liblink-grammar.la: $(liblink_grammar_la_OBJECTS) $(liblink_grammar_la_DEPENDENCIES) $(EXTRA_liblink_grammar_la_DEPENDENCIES) $(AM_V_CCLD)$(liblink_grammar_la_LINK) -rpath $(libdir) $(liblink_grammar_la_OBJECTS) $(liblink_grammar_la_LIBADD) $(LIBS) mostlyclean-compile: -rm -f *.$(OBJEXT) + -rm -f dict-common/*.$(OBJEXT) + -rm -f dict-common/*.lo -rm -f dict-file/*.$(OBJEXT) -rm -f dict-file/*.lo -rm -f dict-sql/*.$(OBJEXT) -rm -f 
dict-sql/*.lo + -rm -f linkage/*.$(OBJEXT) + -rm -f linkage/*.lo + -rm -f parse/*.$(OBJEXT) + -rm -f parse/*.lo + -rm -f post-process/*.$(OBJEXT) + -rm -f post-process/*.lo + -rm -f prepare/*.$(OBJEXT) + -rm -f prepare/*.lo + -rm -f print/*.$(OBJEXT) + -rm -f print/*.lo + -rm -f tokenize/*.$(OBJEXT) + -rm -f tokenize/*.lo distclean-compile: -rm -f *.tab.c -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/analyze-linkage.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/anysplit.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/api.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/build-disjuncts.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/constituents.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/count.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dict-common.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/connectors.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/disjunct-utils.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/disjuncts.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/error.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/expand.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/extract-links.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fast-match.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/histogram.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/idiom.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/linkage.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/post-process.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pp_knowledge.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pp_lexer.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pp_linkset.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/preparation.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/print-util.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/print.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/prune.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/regex-morph.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/regex-tokenizer.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memory-pool.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/resources.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/score.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/spellcheck-aspell.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/spellcheck-hun.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/string-set.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tokenize.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utilities.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wcwidth.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/word-utils.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wordgraph.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@dict-common/$(DEPDIR)/dict-common.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@dict-common/$(DEPDIR)/dict-impl.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@dict-common/$(DEPDIR)/dict-utils.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@dict-common/$(DEPDIR)/file-utils.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@dict-common/$(DEPDIR)/idiom.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@dict-common/$(DEPDIR)/print-dict.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@dict-common/$(DEPDIR)/regex-morph.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@dict-file/$(DEPDIR)/dictionary.Plo@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@dict-file/$(DEPDIR)/read-dict.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@dict-file/$(DEPDIR)/read-regex.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@dict-file/$(DEPDIR)/word-file.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@dict-sql/$(DEPDIR)/read-sql.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@linkage/$(DEPDIR)/analyze-linkage.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@linkage/$(DEPDIR)/freeli.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@linkage/$(DEPDIR)/linkage.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@linkage/$(DEPDIR)/lisjuncts.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@linkage/$(DEPDIR)/sane.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@linkage/$(DEPDIR)/score.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@parse/$(DEPDIR)/count.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@parse/$(DEPDIR)/extract-links.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@parse/$(DEPDIR)/fast-match.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@parse/$(DEPDIR)/histogram.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@parse/$(DEPDIR)/parse.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@parse/$(DEPDIR)/preparation.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@parse/$(DEPDIR)/prune.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@post-process/$(DEPDIR)/constituents.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@post-process/$(DEPDIR)/post-process.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@post-process/$(DEPDIR)/pp_knowledge.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@post-process/$(DEPDIR)/pp_lexer.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@post-process/$(DEPDIR)/pp_linkset.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@prepare/$(DEPDIR)/build-disjuncts.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@prepare/$(DEPDIR)/expand.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ 
@am__quote@prepare/$(DEPDIR)/exprune.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@print/$(DEPDIR)/print-util.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@print/$(DEPDIR)/print.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@print/$(DEPDIR)/wcwidth.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@tokenize/$(DEPDIR)/anysplit.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@tokenize/$(DEPDIR)/regex-tokenizer.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@tokenize/$(DEPDIR)/spellcheck-aspell.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@tokenize/$(DEPDIR)/spellcheck-hun.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@tokenize/$(DEPDIR)/tokenize.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@tokenize/$(DEPDIR)/wg-display.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@tokenize/$(DEPDIR)/wordgraph.Plo@am__quote@ .c.o: @am__fastdepCC_TRUE@ $(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\ @@ -756,13 +944,23 @@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $< +.l.c: + $(AM_V_LEX)$(am__skiplex) $(SHELL) $(YLWRAP) $< $(LEX_OUTPUT_ROOT).c $@ -- $(LEXCOMPILE) + mostlyclean-libtool: -rm -f *.lo clean-libtool: -rm -rf .libs _libs + -rm -rf dict-common/.libs dict-common/_libs -rm -rf dict-file/.libs dict-file/_libs -rm -rf dict-sql/.libs dict-sql/_libs + -rm -rf linkage/.libs linkage/_libs + -rm -rf parse/.libs parse/_libs + -rm -rf post-process/.libs post-process/_libs + -rm -rf prepare/.libs prepare/_libs + -rm -rf print/.libs print/_libs + -rm -rf tokenize/.libs tokenize/_libs install-liblink_grammar_includeHEADERS: $(liblink_grammar_include_HEADERS) @$(NORMAL_INSTALL) @list='$(liblink_grammar_include_HEADERS)'; test -n "$(liblink_grammar_includedir)" || list=; \ @@ -973,21 +1171,37 @@ distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . 
= "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + -rm -f dict-common/$(DEPDIR)/$(am__dirstamp) + -rm -f dict-common/$(am__dirstamp) -rm -f dict-file/$(DEPDIR)/$(am__dirstamp) -rm -f dict-file/$(am__dirstamp) -rm -f dict-sql/$(DEPDIR)/$(am__dirstamp) -rm -f dict-sql/$(am__dirstamp) + -rm -f linkage/$(DEPDIR)/$(am__dirstamp) + -rm -f linkage/$(am__dirstamp) + -rm -f parse/$(DEPDIR)/$(am__dirstamp) + -rm -f parse/$(am__dirstamp) + -rm -f post-process/$(DEPDIR)/$(am__dirstamp) + -rm -f post-process/$(am__dirstamp) + -rm -f prepare/$(DEPDIR)/$(am__dirstamp) + -rm -f prepare/$(am__dirstamp) + -rm -f print/$(DEPDIR)/$(am__dirstamp) + -rm -f print/$(am__dirstamp) + -rm -f tokenize/$(DEPDIR)/$(am__dirstamp) + -rm -f tokenize/$(am__dirstamp) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." + -rm -f post-process/pp_lexer.c + -test -z "$(MAINTAINERCLEANFILES)" || rm -f $(MAINTAINERCLEANFILES) clean: clean-recursive -clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \ +clean-am: clean-generic clean-libLTLIBRARIES clean-libtool clean-local \ mostlyclean-am distclean: distclean-recursive - -rm -rf ./$(DEPDIR) dict-file/$(DEPDIR) dict-sql/$(DEPDIR) + -rm -rf ./$(DEPDIR) dict-common/$(DEPDIR) dict-file/$(DEPDIR) dict-sql/$(DEPDIR) linkage/$(DEPDIR) parse/$(DEPDIR) post-process/$(DEPDIR) prepare/$(DEPDIR) print/$(DEPDIR) tokenize/$(DEPDIR) -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-tags @@ -1034,7 +1248,7 @@ installcheck-am: maintainer-clean: maintainer-clean-recursive - -rm -rf ./$(DEPDIR) dict-file/$(DEPDIR) dict-sql/$(DEPDIR) + -rm -rf ./$(DEPDIR) dict-common/$(DEPDIR) dict-file/$(DEPDIR) dict-sql/$(DEPDIR) linkage/$(DEPDIR) parse/$(DEPDIR) post-process/$(DEPDIR) prepare/$(DEPDIR) print/$(DEPDIR) tokenize/$(DEPDIR) -rm -f Makefile maintainer-clean-am: distclean-am 
maintainer-clean-generic @@ -1053,25 +1267,26 @@ uninstall-am: uninstall-libLTLIBRARIES \ uninstall-liblink_grammar_includeHEADERS uninstall-local - -.MAKE: $(am__recursive_targets) install-am install-strip + @$(NORMAL_INSTALL) + $(MAKE) $(AM_MAKEFLAGS) uninstall-hook +.MAKE: $(am__recursive_targets) install-am install-strip uninstall-am .PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am check \ check-am clean clean-generic clean-libLTLIBRARIES \ - clean-libtool cscopelist-am ctags ctags-am distclean \ - distclean-compile distclean-generic distclean-libtool \ - distclean-tags distdir dvi dvi-am html html-am info info-am \ - install install-am install-data install-data-am \ - install-data-local install-dvi install-dvi-am install-exec \ - install-exec-am install-html install-html-am install-info \ - install-info-am install-libLTLIBRARIES \ + clean-libtool clean-local cscopelist-am ctags ctags-am \ + distclean distclean-compile distclean-generic \ + distclean-libtool distclean-tags distdir dvi dvi-am html \ + html-am info info-am install install-am install-data \ + install-data-am install-data-local install-dvi install-dvi-am \ + install-exec install-exec-am install-html install-html-am \ + install-info install-info-am install-libLTLIBRARIES \ install-liblink_grammar_includeHEADERS install-man install-pdf \ install-pdf-am install-ps install-ps-am install-strip \ installcheck installcheck-am installdirs installdirs-am \ maintainer-clean maintainer-clean-generic mostlyclean \ mostlyclean-compile mostlyclean-generic mostlyclean-libtool \ pdf pdf-am ps ps-am tags tags-am uninstall uninstall-am \ - uninstall-libLTLIBRARIES \ + uninstall-hook uninstall-libLTLIBRARIES \ uninstall-liblink_grammar_includeHEADERS uninstall-local .PRECIOUS: Makefile @@ -1085,6 +1300,15 @@ @OS_WIN32_FALSE@install-libtool-import-lib: @OS_WIN32_FALSE@uninstall-libtool-import-lib: +post-process/pp_lexer.lo: AM_CPPFLAGS += -I$(top_srcdir)/link-grammar/post-process + +uninstall-hook: + 
-rmdir $(liblink_grammar_includedir) + +# Clean up generated file that Windows MSVC compilation leaves behind. +clean-local: + rm -f $(top_builddir)/link-grammar/link-features.h + # ----------------------------------------------------------- install-data-local: install-libtool-import-lib diff -Nru link-grammar-5.3.16/link-grammar/memory-pool.c link-grammar-5.5.0/link-grammar/memory-pool.c --- link-grammar-5.3.16/link-grammar/memory-pool.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/memory-pool.c 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,275 @@ +/*************************************************************************/ +/* Copyright (c) 2018 Amir Plivatsky */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include // errno +#include // strerror_r + +#include "error.h" +#include "memory-pool.h" +#include "utilities.h" // MIN, MAX, aligned alloc + +/* TODO: Add valgrind descriptions. 
See: + * http://valgrind.org/docs/manual/mc-manual.html#mc-manual.mempools */ + +/** + * Align given size to the nearest upper power of 2 + * for sizefunc = func; + mp->name = name; + + if (align) + { + mp->element_size = align_size(element_size); + mp->alignment = MAX(MIN_ALIGNMENT, mp->element_size); + mp->alignment = MIN(MAX_ALIGNMENT, mp->alignment); + mp->data_size = num_elements * mp->element_size; + mp->block_size = ALIGN(mp->data_size + FLDSIZE_NEXT, mp->alignment); + } + else + { + mp->element_size = element_size; + mp->alignment = MIN_ALIGNMENT; + mp->data_size = num_elements * mp->element_size; + mp->block_size = mp->data_size + FLDSIZE_NEXT; + } + + mp->zero_out = zero_out; + mp->exact = exact; + mp->alloc_next = NULL; + mp->chain = NULL; + mp->ring = NULL; + mp->free_list = NULL; + mp->curr_elements = 0; + mp->num_elements = num_elements; + + lgdebug(+D_MEMPOOL, "%sElement size %zu, alignment %zu (pool '%s' created in %s())\n", + POOL_ALLOCATOR?"":"(Fake pool allocator) ", + mp->element_size, mp->alignment, mp->name, mp->func); + return mp; +} + +/** + * Delete the given memory pool. + */ +void pool_delete(Pool_desc *mp) +{ + if (NULL == mp) return; + lgdebug(+D_MEMPOOL, "Used %zu elements (pool '%s' created in %s())\n", + mp->curr_elements, mp->name, mp->func); + + /* Free its chained memory blocks. */ + char *c_next; + size_t alloc_size; +#if POOL_ALLOCATOR + alloc_size = mp->data_size; +#else + alloc_size = mp->element_size; +#endif + for (char *c = mp->chain; c != NULL; c = c_next) + { + c_next = POOL_NEXT_BLOCK(c, alloc_size); +#if POOL_ALLOCATOR + aligned_free(c); +#else + free(c); +#endif + } + free(mp); +} + +#if POOL_ALLOCATOR +/** + * Allocate an element from the requested pool. + * This function uses the feature that pointers to void and char are + * interchangeable. + * 1. If no current block or current block exhausted - obtain another one + * and chain it the block chain. 
Else reuse an LRU unused block; + * The element pointer is aligned to the required alignment. + * 2. Zero the block if required; + * 3. Return element pointer. + */ +void *pool_alloc(Pool_desc *mp) +{ +#ifdef POOL_FREE + if (NULL != mp->free_list) + { + void *alloc_next = mp->free_list; + mp->free_list = *(char **)mp->free_list; + if (mp->zero_out) memset(alloc_next, 0, mp->element_size); + return alloc_next; + } +#endif // POOL_FREE + + mp->curr_elements++; /* For stats. */ + + if ((NULL == mp->alloc_next) || (mp->alloc_next == mp->ring + mp->data_size)) + { + assert(!mp->exact || (NULL == mp->alloc_next), + "Too many elements %zu>%zu (pool '%s' created in %s())", + mp->curr_elements, mp->num_elements, mp->name, mp->func); + + /* No current block or current block exhausted - obtain another one. */ + char *prev = mp->ring; /* Remember current block for possible chaining. */ + if (NULL != mp->ring) + { + /* Next block already exists. */ + mp->ring = POOL_NEXT_BLOCK(mp->ring, mp->data_size); + } + + if (NULL == mp->ring) + { + /* Allocate a new block and chain it. */ + mp->ring = aligned_alloc(mp->alignment, mp->block_size); + if (NULL == mp->ring) + { + /* aligned_alloc() has strict requirements. */ + char errbuf[64]; + strerror_r(errno, errbuf, sizeof(errbuf)); + assert(NULL != mp->ring, "Block/element sizes %zu/%zu: %s", + mp->block_size, mp->element_size, errbuf); + } + if (NULL == mp->alloc_next) + mp->chain = mp->ring; /* This is the start of the chain. */ + else + POOL_NEXT_BLOCK(prev, mp->data_size) = mp->ring; + POOL_NEXT_BLOCK(mp->ring, mp->data_size) = NULL; + //printf("New ring %p next %p\n", mp->ring, + //POOL_NEXT_BLOCK(mp->ring, mp->data_size)); + } /* Else reuse existing block. */ + + if (mp->zero_out) memset(mp->ring, 0, mp->data_size); + mp->alloc_next = mp->ring; + } + + /* Grab a new element. */ + void *alloc_next = mp->alloc_next; + mp->alloc_next += mp->element_size; + return alloc_next; +} + +/** + * Reuse the given memory pool. 
+ * Reset the pool pointers without freeing its memory. + * pool_alloc() will then reuse the existing pool blocks before allocating + * new blocks. + */ +void pool_reuse(Pool_desc *mp) +{ + lgdebug(+D_MEMPOOL, "Used %zu elements (pool '%s' created in %s())\n", + mp->curr_elements, mp->name, mp->func); + mp->ring = mp->chain; + mp->alloc_next = mp->ring; +} + +#ifdef POOL_FREE +/** + * Free elements. They are added to a free list that is used by + * pool_alloc() before it allocates from memory blocks. + * XXX Unchecked. + */ +void pool_free(Pool_desc *mp, void *e) +{ + assert(mp->element_size >= FLDSIZE_NEXT); + if (NULL == e) return; + + char *next = mp->free_list; + mp->free_list = e; + *(char **)e = next; +} +#endif // POOL_FREE + +#else // !POOL_ALLOCATOR + +/* A dummy pool allocator - for debugging (see the comment in memory-pool.h). + * Note: No Doxygen headers because these function replace functions with + * the same name defined above. */ + +/* + * Allocate an element by using malloc() directly. + */ +void *pool_alloc(Pool_desc *mp) +{ + mp->curr_elements++; + assert(!mp->exact || mp->curr_elements <= mp->num_elements, + "Too many elements (%zu>%zu) (pool '%s' created in %s())", + mp->curr_elements, mp->num_elements, mp->name, mp->func); + + /* Allocate a new element and chain it. */ + char *next = mp->chain; + mp->chain = malloc(mp->element_size + FLDSIZE_NEXT); + POOL_NEXT_BLOCK(mp->chain, mp->element_size) = next; + + void *alloc_next = mp->chain; + if (mp->zero_out) memset(alloc_next, 0, mp->element_size); + + return alloc_next; +} + +/* + * Reuse the given fake memory pool by freeing its memory. + */ +void pool_reuse(Pool_desc *mp) +{ + if (NULL == mp) return; + lgdebug(+D_MEMPOOL, "Used %zu elements (pool '%s' created in %s())\n", + mp->curr_elements, mp->name, mp->func); + + /* Free its chained memory blocks. 
*/ + char *c_next; + for (char *c = mp->chain; c != NULL; c = c_next) + { + c_next = POOL_NEXT_BLOCK(c, mp->element_size); + free(c); + } + + mp->chain = NULL; +} + +#ifdef POOL_FREE +void pool_free(Pool_desc *mp, void *e) +{ + free(e); +} +#endif // POOL_FREE +#endif // POOL_ALLOCATOR diff -Nru link-grammar-5.3.16/link-grammar/memory-pool.h link-grammar-5.5.0/link-grammar/memory-pool.h --- link-grammar-5.3.16/link-grammar/memory-pool.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/memory-pool.h 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,68 @@ +/*************************************************************************/ +/* Copyright (c) 2018 Amir Plivatsky */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _MEMORY_POOL_H +#define _MEMORY_POOL_H + +#include "link-includes.h" +#include "utilities.h" // GNUC_MALLOC (XXX separate include?) + +#define D_MEMPOOL (D_SPEC+4) +#define MIN_ALIGNMENT sizeof(void *) // Minimum element alignment. +#define MAX_ALIGNMENT 64 // Maximum element alignment. + +typedef struct Pool_desc_s Pool_desc; + +/* See below the definition of pool_new(). */ +Pool_desc *pool_new(const char *, const char *, size_t, size_t, bool, bool, bool); +void *pool_alloc(Pool_desc *) GNUC_MALLOC; +void pool_reuse(Pool_desc *); +void pool_delete(Pool_desc *); +void pool_free(Pool_desc *, void *e); + +/* Pool allocator debug facility: + * If configured with "CFLAGS=-DPOOL_ALLOCATOR=0", a fake pool allocator + * that uses malloc() for each allocation is defined, in order that ASAN + * or valgrind can be used to find memory usage bugs. 
+ */ + +#ifndef POOL_ALLOCATOR +#define POOL_ALLOCATOR 1 +#endif + +#define FLDSIZE_NEXT sizeof(char *) // "next block" field size +#define POOL_NEXT_BLOCK(blk, offset_next) (*(char **)((blk)+(offset_next))) + +struct Pool_desc_s +{ + /* Used only by the real pool allocator. */ + char *chain; // Allocated blocks. */ + char *ring; // Current area for allocation. + char *alloc_next; // Next element to be allocated. + char *free_list; // Allocations that got freed. + size_t block_size; // Block size for pool extension. + size_t data_size; // Size of data inside block_size. + size_t alignment; // Alignment of element allocation. + + /* Common to the real and fake pool allocators. */ + size_t element_size; // Allocated memory per element. + const char *name; // Pool name. + const char *func; // Invoker of pool_new(). + + /* For debug and stats. */ + size_t num_elements; + size_t curr_elements; + + /* Flags that are used by pool_alloc(). */ + bool zero_out; // Zero out allocated elements. + bool exact; // Abort if more than num_elements are needed. 
+}; +#endif // _MEMORY_POOL_H diff -Nru link-grammar-5.3.16/link-grammar/minisat/Makefile.am link-grammar-5.5.0/link-grammar/minisat/Makefile.am --- link-grammar-5.3.16/link-grammar/minisat/Makefile.am 2016-11-17 18:36:07.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/minisat/Makefile.am 2017-05-28 11:14:43.000000000 +0000 @@ -23,3 +23,5 @@ minisat/utils/ParseUtils.h \ minisat/utils/System.cc \ minisat/utils/System.h + +EXTRA_DIST= README diff -Nru link-grammar-5.3.16/link-grammar/minisat/Makefile.in link-grammar-5.5.0/link-grammar/minisat/Makefile.in --- link-grammar-5.3.16/link-grammar/minisat/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/minisat/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -197,6 +197,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -225,6 +226,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -233,13 +235,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -306,12 +311,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -396,6 +401,7 @@ 
minisat/utils/System.cc \ minisat/utils/System.h +EXTRA_DIST = README all: all-am .SUFFIXES: diff -Nru link-grammar-5.3.16/link-grammar/minisat/minisat/core/SolverTypes.h link-grammar-5.5.0/link-grammar/minisat/minisat/core/SolverTypes.h --- link-grammar-5.3.16/link-grammar/minisat/minisat/core/SolverTypes.h 2016-09-26 21:50:34.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/minisat/minisat/core/SolverTypes.h 2018-04-22 14:57:33.000000000 +0000 @@ -149,9 +149,9 @@ friend class ClauseAllocator; // NOTE: This constructor cannot be used directly (doesn't allocate enough memory). - Clause(const vec& ps, bool use_extra, bool learnt) { + Clause(const vec& ps, bool use_extra, bool learnt_) { header.mark = 0; - header.learnt = learnt; + header.learnt = learnt_; header.has_extra = use_extra; header.reloced = 0; header.size = ps.size(); @@ -186,10 +186,10 @@ public: void calcAbstraction() { assert(header.has_extra); - uint32_t abstraction = 0; + uint32_t abstracton = 0; for (int i = 0; i < size(); i++) - abstraction |= 1 << (var(data[i].lit) & 31); - data[header.size].abs = abstraction; } + abstracton |= 1 << (var(data[i].lit) & 31); + data[header.size].abs = abstracton; } int size () const { return header.size; } diff -Nru link-grammar-5.3.16/link-grammar/minisat/minisat/mtl/Alloc.h link-grammar-5.5.0/link-grammar/minisat/minisat/mtl/Alloc.h --- link-grammar-5.3.16/link-grammar/minisat/minisat/mtl/Alloc.h 2016-09-26 21:50:34.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/minisat/minisat/mtl/Alloc.h 2018-04-22 14:57:33.000000000 +0000 @@ -56,8 +56,8 @@ uint32_t size () const { return sz; } uint32_t wasted () const { return wasted_; } - Ref alloc (int size); - void free (int size) { wasted_ += size; } + Ref alloc (int size_); + void free (int size_) { wasted_ += size_; } // Deref, Load Effective Address (LEA), Inverse of LEA (AEL): T& operator[](Ref r) { assert(r < sz); return memory[r]; } @@ -108,15 +108,15 @@ template typename RegionAllocator::Ref 
-RegionAllocator::alloc(int size) -{ - // printf("ALLOC called (this = %p, size = %d)\n", this, size); fflush(stdout); - assert(size > 0); - capacity(sz + size); +RegionAllocator::alloc(int size_) +{ + // printf("ALLOC called (this = %p, size = %d)\n", this, size_); fflush(stdout); + assert(size_ > 0); + capacity(sz + size_); uint32_t prev_sz = sz; - sz += size; - + sz += size_; + // Handle overflow: if (sz < prev_sz) throw OutOfMemoryException(); diff -Nru link-grammar-5.3.16/link-grammar/minisat/minisat/mtl/Vec.h link-grammar-5.5.0/link-grammar/minisat/minisat/mtl/Vec.h --- link-grammar-5.3.16/link-grammar/minisat/minisat/mtl/Vec.h 2016-09-26 21:50:34.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/minisat/minisat/mtl/Vec.h 2018-04-22 14:57:33.000000000 +0000 @@ -53,8 +53,8 @@ public: // Constructors: vec() : data(NULL), sz(0), cap(0) { } - explicit vec(Size size) : data(NULL), sz(0), cap(0) { growTo(size); } - vec(Size size, const T& pad) : data(NULL), sz(0), cap(0) { growTo(size, pad); } + explicit vec(Size size_) : data(NULL), sz(0), cap(0) { growTo(size_); } + vec(Size size_, const T& pad) : data(NULL), sz(0), cap(0) { growTo(size_, pad); } ~vec() { clear(true); } // Pointer to first element: @@ -106,19 +106,19 @@ template -void vec::growTo(Size size, const T& pad) { - if (sz >= size) return; - capacity(size); - for (Size i = sz; i < size; i++) data[i] = pad; - sz = size; } +void vec::growTo(Size size_, const T& pad) { + if (sz >= size_) return; + capacity(size_); + for (Size i = sz; i < size_; i++) data[i] = pad; + sz = size_; } template -void vec::growTo(Size size) { - if (sz >= size) return; - capacity(size); - for (Size i = sz; i < size; i++) new (&data[i]) T(); - sz = size; } +void vec::growTo(Size size_) { + if (sz >= size_) return; + capacity(size_); + for (Size i = sz; i < size_; i++) new (&data[i]) T(); + sz = size_; } template diff -Nru link-grammar-5.3.16/link-grammar/parse/count.c link-grammar-5.5.0/link-grammar/parse/count.c --- 
link-grammar-5.3.16/link-grammar/parse/count.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/parse/count.c 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,703 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2013,2014,2015 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include + +#include "link-includes.h" +#include "api-structures.h" +#include "connectors.h" +#include "count.h" +#include "disjunct-utils.h" +#include "fast-match.h" +#include "resources.h" +#include "tokenize/word-structures.h" // for Word_struct + +/* This file contains the exhaustive search algorithm. 
*/ + +typedef struct Table_connector_s Table_connector; +struct Table_connector_s +{ + Table_connector *next; + Connector *le, *re; + Count_bin count; + short lw, rw; + unsigned short null_count; +}; + +struct count_context_s +{ + fast_matcher_t *mchxt; + Sentence sent; + /* int null_block; */ /* not used, always 1 */ + bool islands_ok; + bool null_links; + bool exhausted; + unsigned int checktimer; /* Avoid excess system calls */ + int table_size; + int log2_table_size; + Table_connector ** table; + Resources current_resources; +}; + +static void free_table(count_context_t *ctxt) +{ + xfree(ctxt->table, ctxt->table_size * sizeof(Table_connector*)); + ctxt->table = NULL; + ctxt->table_size = 0; +} + +static void init_table(count_context_t *ctxt, size_t sent_len) +{ + unsigned int shift; + /* A piecewise exponential function determines the size of the + * hash table. Probably should make use of the actual number of + * disjuncts, rather than just the number of words. + */ + if (ctxt->table) free_table(ctxt); + + if (sent_len >= 10) + { + shift = 12 + (sent_len) / 4 ; + } + else + { + shift = 12; + } + + /* Clamp at max 4*(1<<24) == 64 MBytes */ + if (24 < shift) shift = 24; + lgdebug(+5, "Connector table size (1<<%u)*%zu\n", shift, sizeof(Table_connector)); + ctxt->table_size = (1U << shift); + ctxt->log2_table_size = shift; + ctxt->table = (Table_connector**) + xalloc(ctxt->table_size * sizeof(Table_connector*)); + memset(ctxt->table, 0, ctxt->table_size*sizeof(Table_connector*)); + + +} + +#if defined(DEBUG) || defined(DEBUG_TABLE_STAT) +static int hit; +static int miss; +#undef DEBUG_TABLE_STAT +#define DEBUG_TABLE_STAT(x) x +/** + * Provide data for insights on the effectively of the connector pair table. + * Hits, misses, chain length, number of elements with zero/nonzero counts. 
+ */ +static void table_stat(count_context_t *ctxt, Sentence sent) +{ + int z = 0, nz = 0; + int c, N = 0; + const int sent_length = sent->length; + int *ww = alloca(sent_length*sent_length*sizeof(ww)); + int *wc = alloca(sent_length*sent_length*sizeof(ww)); + + memset(ww, 0, sent_length*sent_length*sizeof(ww)); + memset(wc, 0, sent_length*sent_length*sizeof(ww)); + for (int i = 0; i < ctxt->table_size; i++) + { + c = 0; + Table_connector *t = ctxt->table[i]; + if (t == NULL) N++; + for (; t != NULL; t = t->next) + { + if ((t->lw < 0) || (t->rw == sent_length)) continue; + c++; + if (hist_total(&t->count) == 0) z++; + else nz++; + ww[t->rw + sent_length * t->lw]++; + if (hist_total(&t->count)) wc[t->rw + sent_length * t->lw]++; + } + if (c != 0) printf("Connector table [%d] c=%d\n", i, c); + } + + int wn = 0; + for (int i = 0; i < sent_length; i++) + for (int j = 0; j < sent_length; j++) + { + if (ww[i + sent_length * j]) wn++; + printf("WW %d %d = %d C=%d\n", i, j, ww[i + sent_length * j], wc[i + sent_length * j]); + } + + printf("Connector table z=%d nz=%d N=%d hit=%d miss=%d wc=%d\n", z, nz, N, hit, miss, wn); +} +#else +#define DEBUG_TABLE_STAT(x) +#endif /* DEBUG */ + +/** + * Stores the value in the table. Assumes it's not already there. 
+ */ +static Table_connector * table_store(count_context_t *ctxt, + int lw, int rw, + Connector *le, Connector *re, + unsigned int null_count) +{ + Table_connector *t, *n; + unsigned int h; + + n = pool_alloc(ctxt->sent->Table_connector_pool); + n->lw = lw; n->rw = rw; n->le = le; n->re = re; n->null_count = null_count; + h = pair_hash(ctxt->table_size, lw, rw, le, re, null_count); + t = ctxt->table[h]; + n->next = t; + ctxt->table[h] = n; + + return n; +} + +/** returns the pointer to this info, NULL if not there */ +static Table_connector * +find_table_pointer(count_context_t *ctxt, + int lw, int rw, + Connector *le, Connector *re, + unsigned int null_count) +{ + Table_connector *t; + unsigned int h = pair_hash(ctxt->table_size,lw, rw, le, re, null_count); + t = ctxt->table[h]; + for (; t != NULL; t = t->next) { + if ((t->lw == lw) && (t->rw == rw) + && (t->le == le) && (t->re == re) + && (t->null_count == null_count)) + { + DEBUG_TABLE_STAT(hit++); + return t; + } + } + DEBUG_TABLE_STAT(miss++); + + /* Create a new connector only if resources are exhausted. + * (???) Huh? I guess we're in panic parse mode in that case. + * checktimer is a device to avoid a gazillion system calls + * to get the timer value. On circa-2017 machines, it results + * in about 0.5-1 timer calls per second. 
+ */ + ctxt->checktimer ++; + if (ctxt->exhausted || ((0 == ctxt->checktimer%(1<<21)) && + (ctxt->current_resources != NULL) && + resources_exhausted(ctxt->current_resources))) + { + ctxt->exhausted = true; + t = table_store(ctxt, lw, rw, le, re, null_count); + t->count = hist_zero(); + return t; + } + else return NULL; +} + +/** returns the count for this quintuple if there, -1 otherwise */ +Count_bin* table_lookup(count_context_t * ctxt, + int lw, int rw, Connector *le, Connector *re, + unsigned int null_count) +{ + Table_connector *t = find_table_pointer(ctxt, lw, rw, le, re, null_count); + + if (t == NULL) return NULL; else return &t->count; +} + +#define NO_COUNT -1 +#ifdef PERFORM_COUNT_HISTOGRAMMING +#define INIT_NO_COUNT {.total = NO_COUNT} +#else +#define INIT_NO_COUNT NO_COUNT +#endif +Count_bin count_unknown = INIT_NO_COUNT; + +/** + * psuedocount is used to check to see if a parse is even possible, + * so that we don't waste cpu time performing an actual count, only + * to discover that it is zero. + * + * Returns false if and only if this entry is in the hash table + * with a count value of 0. If an entry is not in the hash table, + * we have to assume the worst case: that the count might be non-zero, + * and since we don't know, we return true. However, if the entry is + * in the hash table, and its zero, then we know, for sure, that the + * count is zero. + */ +static Count_bin pseudocount(count_context_t * ctxt, + int lw, int rw, Connector *le, Connector *re, + unsigned int null_count) +{ + Count_bin * count = table_lookup(ctxt, lw, rw, le, re, null_count); + if (NULL == count) return count_unknown; + return *count; +} + +/** + * Return the number of optional words strictly between w1 and w2. 
+ */ +static int num_optional_words(count_context_t *ctxt, int w1, int w2) +{ + int n = 0; + + for (int w = w1+1; w < w2; w++) + if (ctxt->sent->word[w].optional) n++; + + return n; +} + +#ifdef DEBUG +#define DO_COUNT_TRACE +#endif + +#ifdef DO_COUNT_TRACE +#define D_COUNT_TRACE 8 +#define LBLSZ 11 +#define TRACE_LABEL(l, do_count) \ + (verbosity_level(D_COUNT_TRACE, "do_count") ? \ + prt_error("%-*s", LBLSZ, STRINGIFY(l)) : 0, do_count) +#define V(c) (!c?"(nil)":connector_string(c)) +static Count_bin do_count1(int lineno, count_context_t *ctxt, + int lw, int rw, + Connector *le, Connector *re, + int null_count); + +static Count_bin do_count(int lineno, count_context_t *ctxt, + int lw, int rw, + Connector *le, Connector *re, + int null_count) +{ + static int level; + + if (!verbosity_level(D_COUNT_TRACE)) + return do_count1(lineno, ctxt, lw, rw, le, re, null_count); + + Table_connector *t = find_table_pointer(ctxt, lw, rw, le, re, null_count); + char m_result[64] = ""; + if (t != NULL) + snprintf(m_result, sizeof(m_result), "(M=%lld)", hist_total(&t->count)); + + level++; + prt_error("%*sdo_count%s:%d lw=%d rw=%d le=%s re=%s null_count=%d\n\\", + level*2, "", m_result, lineno, lw, rw, V(le), V(re), null_count); + Count_bin r = do_count1(lineno, ctxt, lw, rw, le, re, null_count); + prt_error("%*sreturn%.*s:%d=%lld\n", + LBLSZ+level*2, "", (!!t)*3, "(M)", lineno, hist_total(&r)); + level--; + + return r; +} + +static Count_bin do_count1(int lineno, +#define do_count(...) 
do_count(__LINE__, __VA_ARGS__) +#else +#define TRACE_LABEL(l, do_count) (do_count) +static Count_bin do_count( +#endif + count_context_t *ctxt, + int lw, int rw, + Connector *le, Connector *re, + int null_count) +{ + Count_bin zero = hist_zero(); + Count_bin total; + int start_word, end_word, w; + Table_connector *t; + + assert (0 <= null_count, "Bad null count"); + + t = find_table_pointer(ctxt, lw, rw, le, re, null_count); + + if (t) return t->count; + + /* Create a table entry, to be updated with the found + * linkage count before we return. */ + t = table_store(ctxt, lw, rw, le, re, null_count); + + int unparseable_len = rw-lw-1; + +#if 1 + /* This check is not necessary for correctness, as it is handled in + * the general case below. It looks like it should be slightly faster. */ + if (unparseable_len == 0) + { + /* lw and rw are neighboring words */ + /* You can't have a linkage here with null_count > 0 */ + if ((le == NULL) && (re == NULL) && (null_count == 0)) + { + t->count = hist_one(); + } + else + { + t->count = zero; + } + return t->count; + } +#endif + + /* The left and right connectors are null, but the two words are + * NOT next to each-other. */ + if ((le == NULL) && (re == NULL)) + { + int nopt_words = num_optional_words(ctxt, lw, rw); + + if ((null_count == 0) || (!ctxt->islands_ok && (lw != -1)) ) + { + /* The null_count of skipping n words is just n. + * In case the unparsable range contains optional words, we + * don't know here how many of them are actually skipped, because + * they may belong to different alternatives and essentially just + * be ignored. Hence the inequality - sane_linkage_morphism() + * will discard the linkages with extra null words. 
*/ + if ((null_count <= unparseable_len) && + (null_count >= unparseable_len - nopt_words)) + + { + t->count = hist_one(); + } + else + { + t->count = zero; + } + return t->count; + } + + /* Here null_count != 0 and we allow islands (a set of words + * linked together but separate from the rest of the sentence). + * Because we don't know here if an optional word is just + * skipped or is a real null-word (see the comment above) we + * try both possibilities: If a real null is encountered, the + * rest of the sentence must contain one less null-word. Else + * the rest of the sentence still contains the required number + * of null words. */ + t->count = zero; + w = lw + 1; + for (int opt = 0; opt <= !!ctxt->sent->word[w].optional; opt++) + { + null_count += opt; + for (Disjunct *d = ctxt->sent->word[w].d; d != NULL; d = d->next) + { + if (d->left == NULL) + { + hist_accumv(&t->count, d->cost, + do_count(ctxt, w, rw, d->right, NULL, null_count-1)); + } + } + hist_accumv(&t->count, 0.0, + do_count(ctxt, w, rw, NULL, NULL, null_count-1)); + } + return t->count; + } + + if (le == NULL) + { + start_word = lw+1; + } + else + { + start_word = le->nearest_word; + } + + if (re == NULL) + { + end_word = rw; + } + else + { + end_word = re->nearest_word +1; + } + + total = zero; + fast_matcher_t *mchxt = ctxt->mchxt; + + for (w = start_word; w < end_word; w++) + { + size_t mlb = form_match_list(mchxt, w, le, lw, re, rw); +#ifdef VERIFY_MATCH_LIST + int id = get_match_list_element(mchxt, mlb) ? 
+ get_match_list_element(mchxt, mlb)->match_id : 0; +#endif + for (size_t mle = mlb; get_match_list_element(mchxt, mle) != NULL; mle++) + { + Disjunct *d = get_match_list_element(mchxt, mle); + bool Lmatch = d->match_left; + bool Rmatch = d->match_right; + +#ifdef VERIFY_MATCH_LIST + assert(id == d->match_id, "Modified id (%d!=%d)", id, d->match_id); +#endif + + for (int lnull_cnt = 0; lnull_cnt <= null_count; lnull_cnt++) + { + int rnull_cnt = null_count - lnull_cnt; + /* Now lnull_cnt and rnull_cnt are the null-counts we're + * requiring in those parts respectively. */ + bool leftpcount = false; + bool rightpcount = false; + + PRAGMA_MAYBE_UNINITIALIZED /* For old GCC versions */ + Count_bin l_any; /* Used only when leftpcount==true */ + Count_bin r_any; /* Used only when rightpcount==true */ + PRAGMA_END + Count_bin l_cmulti = INIT_NO_COUNT; + Count_bin l_dmulti = INIT_NO_COUNT; + Count_bin l_dcmulti = INIT_NO_COUNT; + Count_bin l_bnr = INIT_NO_COUNT; + Count_bin r_cmulti = INIT_NO_COUNT; + Count_bin r_dmulti = INIT_NO_COUNT; + Count_bin r_dcmulti = INIT_NO_COUNT; + Count_bin r_bnl = INIT_NO_COUNT; + + /* Now, we determine if (based on table only) we can see that + the current range is not parsable. */ + + /* The result count is a sum of multiplications of + * LHS and RHS counts. If one of them is zero, we can skip + * calculating the other one. + * + * So, first perform pseudocounting as an optimization. If + * the pseudocount is zero, then we know that the true + * count will be zero. + * + * Cache the result in the l_* and r_* variables, so a table + * lookup can be skipped in cases we cannot skip the actual + * calculation and a table entry exists. 
*/ + if (Lmatch) + { + l_any = pseudocount(ctxt, lw, w, le->next, d->left->next, lnull_cnt); + leftpcount = (hist_total(&l_any) != 0); + if (!leftpcount && le->multi) + { + l_cmulti = + pseudocount(ctxt, lw, w, le, d->left->next, lnull_cnt); + leftpcount |= (hist_total(&l_cmulti) != 0); + } + if (!leftpcount && d->left->multi) + { + l_dmulti = + pseudocount(ctxt, lw, w, le->next, d->left, lnull_cnt); + leftpcount |= (hist_total(&l_dmulti) != 0); + } + if (!leftpcount && le->multi && d->left->multi) + { + l_dcmulti = + pseudocount(ctxt, lw, w, le, d->left, lnull_cnt); + leftpcount |= (hist_total(&l_dcmulti) != 0); + } + } + + if (Rmatch && (leftpcount || (le == NULL))) + { + r_any = pseudocount(ctxt, w, rw, d->right->next, re->next, rnull_cnt); + rightpcount = (hist_total(&r_any) != 0); + if (!rightpcount && re->multi) + { + r_cmulti = + pseudocount(ctxt, w, rw, d->right->next, re, rnull_cnt); + rightpcount |= (hist_total(&r_cmulti) != 0); + } + if (!rightpcount && d->right->multi) + { + r_dmulti = + pseudocount(ctxt, w,rw, d->right, re->next, rnull_cnt); + rightpcount |= (hist_total(&r_dmulti) != 0); + } + if (!rightpcount && d->right->multi && re->multi) + { + r_dcmulti = + pseudocount(ctxt, w, rw, d->right, re, rnull_cnt); + rightpcount |= (hist_total(&r_dcmulti) != 0); + } + } + + if (!leftpcount && !rightpcount) continue; + +#define COUNT(c, do_count) \ + { c = TRACE_LABEL(c, do_count); } + if (!(leftpcount && rightpcount)) + { + if (leftpcount) + { + /* Evaluate using the left match, but not the right. */ + COUNT(l_bnr, do_count(ctxt, w, rw, d->right, re, rnull_cnt)); + } + else if (le == NULL) + { + /* Evaluate using the right match, but not the left. */ + COUNT(r_bnl, do_count(ctxt, lw, w, le, d->left, lnull_cnt)); + } + } + +#define CACHE_COUNT(c, how_to_count, do_count) \ +{ \ + Count_bin count = (hist_total(&c) == NO_COUNT) ? 
\ + TRACE_LABEL(c, do_count) : c; \ + how_to_count; \ +} + /* If the pseudocounting above indicates one of the terms + * in the count multiplication is zero, + * we know that the true total is zero. So we don't + * bother counting the other term at all, in that case. */ + Count_bin leftcount = zero; + Count_bin rightcount = zero; + if (leftpcount && + (rightpcount || (0 != hist_total(&l_bnr)))) + { + CACHE_COUNT(l_any, leftcount = count, + do_count(ctxt, lw, w, le->next, d->left->next, lnull_cnt)); + if (le->multi) + CACHE_COUNT(l_cmulti, hist_accumv(&leftcount, d->cost, count), + do_count(ctxt, lw, w, le, d->left->next, lnull_cnt)); + if (d->left->multi) + CACHE_COUNT(l_dmulti, hist_accumv(&leftcount, d->cost, count), + do_count(ctxt, lw, w, le->next, d->left, lnull_cnt)); + if (d->left->multi && le->multi) + CACHE_COUNT(l_dcmulti, hist_accumv(&leftcount, d->cost, count), + do_count(ctxt, lw, w, le, d->left, lnull_cnt)); + + if (0 < hist_total(&leftcount)) + { + /* Evaluate using the left match, but not the right */ + CACHE_COUNT(l_bnr, hist_muladdv(&total, &leftcount, d->cost, count), + do_count(ctxt, w, rw, d->right, re, rnull_cnt)); + } + } + + if (rightpcount && + ((0 < hist_total(&leftcount)) || (0 != hist_total(&r_bnl)))) + { + CACHE_COUNT(r_any, rightcount = count, + do_count(ctxt, w, rw, d->right->next, re->next, rnull_cnt)); + if (re->multi) + CACHE_COUNT(r_cmulti, hist_accumv(&rightcount, d->cost, count), + do_count(ctxt, w, rw, d->right->next, re, rnull_cnt)); + if (d->right->multi) + CACHE_COUNT(r_dmulti, hist_accumv(&rightcount, d->cost, count), + do_count(ctxt, w, rw, d->right, re->next, rnull_cnt)); + if (d->right->multi && re->multi) + CACHE_COUNT(r_dcmulti, hist_accumv(&rightcount, d->cost, count), + do_count(ctxt, w, rw, d->right, re, rnull_cnt)); + + if (0 < hist_total(&rightcount)) + { + if (le == NULL) + { + /* Evaluate using the right match, but not the left */ + CACHE_COUNT(r_bnl, hist_muladdv(&total, &rightcount, d->cost, count), + 
do_count(ctxt, lw, w, le, d->left, lnull_cnt)); + } + else + { + /* Total number where links are used on both side. + * Note that we don't have leftcount if le == NULL. */ + hist_muladd(&total, &leftcount, 0.0, &rightcount); + } + } + } + + /* Sigh. Overflows can and do occur, esp for the ANY language. */ + if (INT_MAX < hist_total(&total)) + { +#ifdef PERFORM_COUNT_HISTOGRAMMING + total.total = INT_MAX; +#else + total = INT_MAX; +#endif /* PERFORM_COUNT_HISTOGRAMMING */ + t->count = total; + pop_match_list(mchxt, mlb); + return total; + } + } + } + pop_match_list(mchxt, mlb); + } + t->count = total; + return total; +} + + +/** + * Returns the number of ways the sentence can be parsed with the + * specified null count. Assumes that the fast-matcher and the count + * context have already been initialized, and will be freed later. The + * "null_count" argument is the number of words that are allowed to + * have no links to them. + * + * This the full-fledged parser, but it only 'counts', in order to + * avoid an explosion of allocated memory structures to hold each + * possible parse. Thus, to see an 'actual' parse, a second pass + * must be made, with build_parse_set(), to get actual parse structures. + * + * The work is split up this way for two reasons: + * 1) A given sentence may have thousands of parses, and the user is + * interested in only a few. + * 2) A given sentence may have billions of parses, in which case, + * allocating for each would blow out RAM. + * So, basically, its good to know how many parses to expect, before + * starting to allocate parse structures. + * + * The count returned here is meant to be completely accurate; it is + * not an approximation! + * + * Currently, the code has been designed to maintain a histogram of + * the cost of each of the parses. The number and width of the bins + * is adjustable in histogram.c. At this time, the histogram is not + * used anywhere, and a 3-5% speedup is available if it is avoided. 
+ * We plan to use this histogram, later .... + */ +Count_bin do_parse(Sentence sent, + fast_matcher_t *mchxt, + count_context_t *ctxt, + int null_count, Parse_Options opts) +{ + Count_bin hist; + + ctxt->current_resources = opts->resources; + ctxt->exhausted = false; + ctxt->checktimer = 0; + ctxt->sent = sent; + + /* consecutive blocks of this many words are considered as + * one null link. */ + /* ctxt->null_block = 1; */ + ctxt->islands_ok = opts->islands_ok; + ctxt->mchxt = mchxt; + + hist = do_count(ctxt, -1, sent->length, NULL, NULL, null_count+1); + + return hist; +} + +/* sent_length is used only as a hint for the hash table size ... */ +count_context_t * alloc_count_context(Sentence sent) +{ + count_context_t *ctxt = (count_context_t *) xalloc (sizeof(count_context_t)); + memset(ctxt, 0, sizeof(count_context_t)); + + if (NULL != sent->Table_connector_pool) + { + pool_reuse(sent->Table_connector_pool); + } + else + { + sent->Table_connector_pool = + pool_new(__func__, "Table_connector", + /*num_elements*/10240, sizeof(Table_connector), + /*zero_out*/false, /*align*/false, /*exact*/false); + } + + init_table(ctxt, sent->length); + return ctxt; +} + +void free_count_context(count_context_t *ctxt, Sentence sent) +{ + if (NULL == ctxt) return; + + DEBUG_TABLE_STAT(if (verbosity_level(D_SPEC+2)) table_stat(ctxt, sent)); + free_table(ctxt); + xfree(ctxt, sizeof(count_context_t)); +} diff -Nru link-grammar-5.3.16/link-grammar/parse/count.h link-grammar-5.5.0/link-grammar/parse/count.h --- link-grammar-5.3.16/link-grammar/parse/count.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/parse/count.h 2018-03-06 02:44:41.000000000 +0000 @@ -0,0 +1,26 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in 
the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _COUNT_H +#define _COUNT_H + +#include "fast-match.h" +#include "histogram.h" /* for s64 */ + +typedef struct count_context_s count_context_t; + +Count_bin* table_lookup(count_context_t *, int, int, Connector *, Connector *, unsigned int); +Count_bin do_parse(Sentence, fast_matcher_t*, count_context_t*, int null_count, Parse_Options); + +count_context_t* alloc_count_context(Sentence); +void free_count_context(count_context_t*, Sentence); +#endif /* _COUNT_H */ diff -Nru link-grammar-5.3.16/link-grammar/parse/extract-links.c link-grammar-5.5.0/link-grammar/parse/extract-links.c --- link-grammar-5.3.16/link-grammar/parse/extract-links.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/parse/extract-links.c 2018-04-27 02:57:39.000000000 +0000 @@ -0,0 +1,727 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2010, 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
+ */
+/*                                                                       */
+/*************************************************************************/
+
+#include <limits.h> /* For UINT_MAX */
+
+#include "connectors.h"
+#include "count.h"
+#include "disjunct-utils.h" // for Disjunct
+#include "extract-links.h"
+#include "utilities.h" // for Windows rand_r()
+#include "linkage/linkage.h"
+#include "tokenize/word-structures.h" // for Word_Struct
+
+//#define RECOUNT
+
+typedef struct Parse_choice_struct Parse_choice;
+
+/* The parse_choice is used to extract links for a given parse */
+typedef struct Parse_set_struct Parse_set;
+struct Parse_choice_struct
+{
+	Parse_choice * next;
+	Parse_set * set[2];
+	Link link[2]; /* the lc fields of these is NULL if there is no link used */
+	Disjunct *ld, *md, *rd; /* the chosen disjuncts for the relevant three words */
+};
+
+struct Parse_set_struct
+{
+	short lw, rw; /* left and right word index */
+	unsigned short null_count; /* number of island words */
+	Connector *le, *re; /* pending, unconnected connectors */
+
+	s64 count; /* The number of ways to parse. */
+#ifdef RECOUNT
+	s64 recount; /* Exactly the same as above, but counted at a later stage. */
+	s64 cut_count; /* Count only low-cost parses, i.e. below the cost cutoff */
+	//double cost_cutoff;
+#undef RECOUNT
+#define RECOUNT(X) X
+#else
+#define RECOUNT(X)  /* Make it disappear... */
+#endif
+	Parse_choice * first;
+	Parse_choice * tail;
+};
+
+typedef struct Pset_bucket_struct Pset_bucket;
+struct Pset_bucket_struct
+{
+	Parse_set set;
+	Pset_bucket *next;
+};
+
+struct extractor_s
+{
+	unsigned int x_table_size;
+	unsigned int log2_x_table_size;
+	Pset_bucket ** x_table;  /* Hash table */
+	Parse_set * parse_set;
+
+	/* thread-safe random number state */
+	unsigned int rand_state;
+};
+
+/**
+ * The first thing we do is we build a data structure to represent the
+ * result of the entire parse search.
There will be a set of nodes + * built for each call to the count() function that returned a non-zero + * value, AND which is part of a valid linkage. Each of these nodes + * represents a valid continuation, and contains pointers to two other + * sets (one for the left continuation and one for the right + * continuation). + */ + +static void free_set(Parse_set *s) +{ + Parse_choice *p, *xp; + if (s == NULL) return; + for (p=s->first; p != NULL; p = xp) + { + xp = p->next; + xfree((void *)p, sizeof(*p)); + } +} + +static Parse_choice * +make_choice(Parse_set *lset, Connector * llc, Connector * lrc, + Parse_set *rset, Connector * rlc, Connector * rrc, + Disjunct *ld, Disjunct *md, Disjunct *rd) +{ + Parse_choice *pc; + pc = (Parse_choice *) xalloc(sizeof(*pc)); + pc->next = NULL; + pc->set[0] = lset; + pc->link[0].link_name = NULL; + pc->link[0].lw = lset->lw; + pc->link[0].rw = lset->rw; + pc->link[0].lc = llc; + pc->link[0].rc = lrc; + pc->set[1] = rset; + pc->link[1].link_name = NULL; + pc->link[1].lw = rset->lw; + pc->link[1].rw = rset->rw; + pc->link[1].lc = rlc; + pc->link[1].rc = rrc; + pc->ld = ld; + pc->md = md; + pc->rd = rd; + return pc; +} + +/** + * Put this parse_choice into a given set. The tail pointer is always + * left pointing to the end of the list. + */ +static void put_choice_in_set(Parse_set *s, Parse_choice *pc) +{ + if (s->first == NULL) + { + s->first = pc; + } + else + { + s->tail->next = pc; + } + s->tail = pc; + pc->next = NULL; +} + +static void record_choice( + Parse_set *lset, Connector * llc, Connector * lrc, + Parse_set *rset, Connector * rlc, Connector * rrc, + Disjunct *ld, Disjunct *md, Disjunct *rd, Parse_set *s) +{ + put_choice_in_set(s, make_choice(lset, llc, lrc, + rset, rlc, rrc, + ld, md, rd)); +} + +/** + * Allocate the parse info struct + * + * A piecewise exponential function determines the size of the hash + * table. Probably should make use of the actual number of disjuncts, + * rather than just the number of words. 
+ */
+extractor_t * extractor_new(int nwords, unsigned int ranstat)
+{
+	int log2_table_size;
+	extractor_t * pex;
+
+	pex = (extractor_t *) xalloc(sizeof(extractor_t));
+	memset(pex, 0, sizeof(extractor_t));
+	pex->rand_state = ranstat;
+
+	/* Alloc the x_table */
+	if (nwords >= 10) {
+		log2_table_size = 14;
+	} else if (nwords >= 4) {
+		log2_table_size = nwords;
+	} else {
+		log2_table_size = 4;
+	}
+	pex->log2_x_table_size = log2_table_size;
+	pex->x_table_size = (1 << log2_table_size);
+
+	/*printf("Allocating x_table of size %d\n", x_table_size);*/
+	pex->x_table = (Pset_bucket**) xalloc(pex->x_table_size * sizeof(Pset_bucket*));
+	memset(pex->x_table, 0, pex->x_table_size * sizeof(Pset_bucket*));
+
+	return pex;
+}
+
+/**
+ * This is the function that should be used to free the set structure. Since
+ * it's a dag, a recursive free function won't work.  Every time we create
+ * a set element, we put it in the hash table, so this is OK.
+ */
+void free_extractor(extractor_t * pex)
+{
+	unsigned int i;
+	Pset_bucket *t, *x;
+	if (!pex) return;
+
+	for (i=0; i<pex->x_table_size; i++)
+	{
+		for (t = pex->x_table[i]; t!= NULL; t=x)
+		{
+			x = t->next;
+			free_set(&t->set);
+			xfree((void *) t, sizeof(Pset_bucket));
+		}
+	}
+	pex->parse_set = NULL;
+
+	/*printf("Freeing x_table of size %d\n", x_table_size);*/
+	xfree((void *) pex->x_table, pex->x_table_size * sizeof(Pset_bucket*));
+	pex->x_table_size = 0;
+	pex->x_table = NULL;
+
+	xfree((void *) pex, sizeof(extractor_t));
+}
+
+/**
+ * Returns the pointer to this info, NULL if not there.
+ */ +static Pset_bucket * x_table_pointer(int lw, int rw, + Connector *le, Connector *re, + unsigned int null_count, extractor_t * pex) +{ + Pset_bucket *t; + t = pex->x_table[pair_hash(pex->x_table_size, lw, rw, le, re, null_count)]; + for (; t != NULL; t = t->next) { + if ((t->set.lw == lw) && (t->set.rw == rw) && + (t->set.le == le) && (t->set.re == re) && + (t->set.null_count == null_count)) return t; + } + return NULL; +} + +/** + * Stores the value in the x_table. Assumes it's not already there. + */ +static Pset_bucket * x_table_store(int lw, int rw, + Connector *le, Connector *re, + unsigned int null_count, extractor_t * pex) +{ + Pset_bucket *t, *n; + unsigned int h; + + n = (Pset_bucket *) xalloc(sizeof(Pset_bucket)); + n->set.lw = lw; + n->set.rw = rw; + n->set.null_count = null_count; + n->set.le = le; + n->set.re = re; + n->set.count = 0; + n->set.first = NULL; + n->set.tail = NULL; + + h = pair_hash(pex->x_table_size, lw, rw, le, re, null_count); + t = pex->x_table[h]; + n->next = t; + pex->x_table[h] = n; + return n; +} + +/** Create a bogus parse set that only holds lw, rw. */ +static Parse_set* dummy_set(int lw, int rw, + unsigned int null_count, extractor_t * pex) +{ + Pset_bucket *dummy; + dummy = x_table_pointer(lw, rw, NULL, NULL, null_count, pex); + if (dummy) return &dummy->set; + + dummy = x_table_store(lw, rw, NULL, NULL, null_count, pex); + dummy->set.count = 1; + return &dummy->set; +} + +#ifdef FINISH_THIS_IDEA_MAYBE_LATER +static int cost_compare(const void *a, const void *b) +{ + const Match_node* const * ma = a; + const Match_node* const * mb = b; + if ((*ma)->d->cost < (*mb)->d->cost) return -1; + if ((*ma)->d->cost > (*mb)->d->cost) return 1; + return 0; +} + +/** + * Sort the matchlist into ascending disjunct cost. The goal here + * is to issue the lowest-cost disjuncts first, so that the parse + * set ends up quasi-sorted. 
This is not enough to get us a totally
+ * sorted parse set, but it does guarantee that at least the very
+ * first parse really will be the lowest cost.
+ */
+static Match_node* sort_matchlist(Match_node* mlist)
+{
+	Match_node* mx;
+	Match_node** marr;
+	size_t len = 1;
+	size_t i;
+
+	for (mx = mlist; mx->next != NULL; mx = mx->next) len++;
+	if (1 == len) return mlist;
+
+	/* Avoid blowing out the stack. Its hopeless. */
+	if (100000 < len) return mlist;
+
+	marr = alloca(len * sizeof(Match_node*));
+	i = 0;
+	for (mx = mlist; mx != NULL; mx = mx->next) marr[i++] = mx;
+
+	qsort((void *) marr, len, sizeof(Match_node*), cost_compare);
+	for (i=0; i<len-1; i++) marr[i]->next = marr[i+1];
+	marr[len-1]->next = NULL;
+	return marr[0];
+}
+#endif /* FINISH_THIS_IDEA_MAYBE_LATER */
+
+/**
+ * returns NULL if there are no ways to parse, or returns a pointer
+ * to a set structure representing all the ways to parse.
+ *
+ * This code is similar to do_count() in count.c -- for a good reason:
+ * the do_count() function did a full parse, but didn't actually
+ * allocate a memory structures to hold the parse.  This also does
+ * a full parse, but it also allocates and fills out the various
+ * parse structures.
+ */
+static
+Parse_set * mk_parse_set(Word* words, fast_matcher_t *mchxt,
+                 count_context_t * ctxt,
+                 Disjunct *ld, Disjunct *rd, int lw, int rw,
+                 Connector *le, Connector *re, unsigned int null_count,
+                 extractor_t * pex, bool islands_ok)
+{
+	int start_word, end_word, w;
+	Pset_bucket *xt;
+	Count_bin * count;
+
+	assert(null_count < 0x7fff, "mk_parse_set() called with null_count < 0.");
+
+	count = table_lookup(ctxt, lw, rw, le, re, null_count);
+
+	/* If there's no counter, then there's no way to parse. */
+	if (NULL == count) return NULL;
+	if (hist_total(count) == 0) return NULL;
+
+	xt = x_table_pointer(lw, rw, le, re, null_count, pex);
+
+	/* Perhaps we've already computed it; if so, return it.
*/ + if (xt != NULL) return &xt->set; + + /* Start it out with the empty set of parse choices. */ + /* This entry must be updated before we return. */ + xt = x_table_store(lw, rw, le, re, null_count, pex); + + /* The count we previously computed; its non-zero. */ + xt->set.count = hist_total(count); + +#define NUM_PARSES 4 + // xt->set.cost_cutoff = hist_cost_cutoff(count, NUM_PARSES); + // xt->set.cut_count = hist_cut_total(count, NUM_PARSES); + + RECOUNT({xt->set.recount = 1;}) + + /* If the two words are next to each other, the count == 1 */ + if (lw + 1 == rw) return &xt->set; + + /* The left and right connectors are null, but the two words are + * NOT next to each-other. */ + if ((le == NULL) && (re == NULL)) + { + Parse_set* pset; + Parse_set* dummy; + Disjunct* dis; + + if (!islands_ok && (lw != -1)) return &xt->set; + if (null_count == 0) return &xt->set; + + RECOUNT({xt->set.recount = 0;}) + + w = lw + 1; + for (int opt = 0; opt <= !!words[w].optional; opt++) + { + null_count += opt; + for (dis = words[w].d; dis != NULL; dis = dis->next) + { + if (dis->left == NULL) + { + pset = mk_parse_set(words, mchxt, ctxt, + dis, NULL, w, rw, dis->right, NULL, + null_count-1, pex, islands_ok); + if (pset == NULL) continue; + dummy = dummy_set(lw, w, null_count-1, pex); + record_choice(dummy, NULL, NULL, + pset, NULL, NULL, + NULL, NULL, NULL, &xt->set); + RECOUNT({xt->set.recount += pset->recount;}) + } + } + pset = mk_parse_set(words, mchxt, ctxt, + NULL, NULL, w, rw, NULL, NULL, + null_count-1, pex, islands_ok); + if (pset != NULL) + { + dummy = dummy_set(lw, w, null_count-1, pex); + record_choice(dummy, NULL, NULL, + pset, NULL, NULL, + NULL, NULL, NULL, &xt->set); + RECOUNT({xt->set.recount += pset->recount;}) + } + } + return &xt->set; + } + + if (le == NULL) + { + start_word = lw + 1; + } + else + { + start_word = le->nearest_word; + } + + if (re == NULL) + { + end_word = rw; + } + else + { + end_word = re->nearest_word + 1; + } + + /* This condition can never 
be true here. It is included so GCC + * will be able to optimize the loop over "null_count". Without + * this check, GCC thinks this loop may be an infinite loop and + * it may omit some optimizations. */ + if (UINT_MAX == null_count) return NULL; + + RECOUNT({xt->set.recount = 0;}) + for (w = start_word; w < end_word; w++) + { + size_t mlb, mle; + mle = mlb = form_match_list(mchxt, w, le, lw, re, rw); + // if (mlist) mlist = sort_matchlist(mlist); + for (; get_match_list_element(mchxt, mle) != NULL; mle++) + { + unsigned int lnull_count, rnull_count; + Disjunct *d = get_match_list_element(mchxt, mle); + bool Lmatch = d->match_left; + bool Rmatch = d->match_right; + + for (lnull_count = 0; lnull_count <= null_count; lnull_count++) + { + int i, j; + Parse_set *ls[4], *rs[4]; + + /* Here, lnull_count and rnull_count are the null_counts + * we're assigning to those parts respectively. */ + rnull_count = null_count - lnull_count; + + /* Now, we determine if (based on table only) we can see that + the current range is not parsable. 
*/ + + for (i=0; i<4; i++) { ls[i] = rs[i] = NULL; } + if (Lmatch) + { + ls[0] = mk_parse_set(words, mchxt, ctxt, + ld, d, lw, w, le->next, d->left->next, + lnull_count, pex, islands_ok); + + if (le->multi) + ls[1] = mk_parse_set(words, mchxt, ctxt, + ld, d, lw, w, le, d->left->next, + lnull_count, pex, islands_ok); + + if (d->left->multi) + ls[2] = mk_parse_set(words, mchxt, ctxt, + ld, d, lw, w, le->next, d->left, + lnull_count, pex, islands_ok); + + if (le->multi && d->left->multi) + ls[3] = mk_parse_set(words, mchxt, ctxt, + ld, d, lw, w, le, d->left, + lnull_count, pex, islands_ok); + } + + if (Rmatch) + { + rs[0] = mk_parse_set(words, mchxt, ctxt, + d, rd, w, rw, d->right->next, re->next, + rnull_count, pex, islands_ok); + + if (d->right->multi) + rs[1] = mk_parse_set(words, mchxt, ctxt, + d, rd, w, rw, d->right, re->next, + rnull_count, pex, islands_ok); + + if (re->multi) + rs[2] = mk_parse_set(words, mchxt, ctxt, + d, rd, w, rw, d->right->next, re, + rnull_count, pex, islands_ok); + + if (d->right->multi && re->multi) + rs[3] = mk_parse_set(words, mchxt, ctxt, + d, rd, w, rw, d->right, re, + rnull_count, pex, islands_ok); + } + + for (i=0; i<4; i++) + { + /* This ordering is probably not consistent with that + * needed to use list_links. (??) 
*/ + if (ls[i] == NULL) continue; + for (j=0; j<4; j++) + { + if (rs[j] == NULL) continue; + record_choice(ls[i], le, d->left, + rs[j], d->right, re, + ld, d, rd, &xt->set); + RECOUNT({xt->set.recount += ls[i]->recount * rs[j]->recount;}) + } + } + + if (ls[0] != NULL || ls[1] != NULL || ls[2] != NULL || ls[3] != NULL) + { + /* Evaluate using the left match, but not the right */ + Parse_set* rset = mk_parse_set(words, mchxt, ctxt, + d, rd, w, rw, d->right, re, + rnull_count, pex, islands_ok); + if (rset != NULL) + { + for (i=0; i<4; i++) + { + if (ls[i] == NULL) continue; + /* this ordering is probably not consistent with + * that needed to use list_links */ + record_choice(ls[i], le, d->left, + rset, NULL /* d->right */, + re, /* the NULL indicates no link*/ + ld, d, rd, &xt->set); + RECOUNT({xt->set.recount += ls[i]->recount * rset->recount;}) + } + } + } + if ((le == NULL) && (rs[0] != NULL || + rs[1] != NULL || rs[2] != NULL || rs[3] != NULL)) + { + /* Evaluate using the right match, but not the left */ + Parse_set* lset = mk_parse_set(words, mchxt, ctxt, + ld, d, lw, w, le, d->left, + lnull_count, pex, islands_ok); + + if (lset != NULL) + { + for (j=0; j<4; j++) + { + if (rs[j] == NULL) continue; + /* this ordering is probably not consistent with + * that needed to use list_links */ + record_choice(lset, NULL /* le */, + d->left, /* NULL indicates no link */ + rs[j], d->right, re, + ld, d, rd, &xt->set); + RECOUNT({xt->set.recount += lset->recount * rs[j]->recount;}) + } + } + } + } + } + pop_match_list(mchxt, mlb); + } + return &xt->set; +} + +/** + * Return TRUE if and only if an overflow in the number of parses + * occurred. Use a 64-bit int for counting. 
+ */
+static bool set_node_overflowed(Parse_set *set)
+{
+	Parse_choice *pc;
+	s64 n = 0;
+	if (set == NULL || set->first == NULL) return false;
+
+	for (pc = set->first; pc != NULL; pc = pc->next)
+	{
+		n += pc->set[0]->count * pc->set[1]->count;
+		if (PARSE_NUM_OVERFLOW < n) return true;
+	}
+	return false;
+}
+
+static bool set_overflowed(extractor_t * pex)
+{
+	unsigned int i;
+
+	assert(pex->x_table != NULL, "called set_overflowed with x_table==NULL");
+	for (i=0; i<pex->x_table_size; i++)
+	{
+		Pset_bucket *t;
+		for (t = pex->x_table[i]; t != NULL; t = t->next)
+		{
+			if (set_node_overflowed(&t->set)) return true;
+		}
+	}
+	return false;
+}
+
+/**
+ * This is the top level call that computes the whole parse_set.  It
+ * points whole_set at the result.  It creates the necessary hash
+ * table (x_table) which will be freed at the same time the
+ * whole_set is freed.
+ *
+ * This assumes that do_parse() has been run, and that the count_context
+ * is filled with the values thus computed.  This function is structured
+ * much like do_parse(), which wraps the main workhorse do_count().
+ *
+ * If the number of linkages gets huge, then the counts can overflow.
+ * We check if this has happened when verifying the parse set.
+ * This routine returns TRUE iff an overflow occurred.
+ */
+
+bool build_parse_set(extractor_t* pex, Sentence sent,
+                   fast_matcher_t *mchxt,
+                   count_context_t *ctxt,
+                   unsigned int null_count, Parse_Options opts)
+{
+	pex->parse_set =
+		mk_parse_set(sent->word, mchxt, ctxt,
+		             NULL, NULL, -1, sent->length, NULL, NULL, null_count+1,
+		             pex, opts->islands_ok);
+
+
+	return set_overflowed(pex);
+}
+
+// Cannot be static, also called by SAT-solver.
+void check_link_size(Linkage lkg) +{ + if (lkg->lasz <= lkg->num_links) + { + lkg->lasz = 2 * lkg->lasz + 10; + lkg->link_array = realloc(lkg->link_array, lkg->lasz * sizeof(Link)); + } +} + +static void issue_link(Linkage lkg, Disjunct * ld, Disjunct * rd, Link * link) +{ + check_link_size(lkg); + lkg->link_array[lkg->num_links] = *link; + lkg->num_links++; + + lkg->chosen_disjuncts[link->lw] = ld; + lkg->chosen_disjuncts[link->rw] = rd; +} + +static void issue_links_for_choice(Linkage lkg, Parse_choice *pc) +{ + if (pc->link[0].lc != NULL) { /* there is a link to generate */ + issue_link(lkg, pc->ld, pc->md, &pc->link[0]); + } + if (pc->link[1].lc != NULL) { + issue_link(lkg, pc->md, pc->rd, &pc->link[1]); + } +} + +static void list_links(Linkage lkg, const Parse_set * set, int index) +{ + Parse_choice *pc; + s64 n; + + if (set == NULL || set->first == NULL) return; + for (pc = set->first; pc != NULL; pc = pc->next) { + n = pc->set[0]->count * pc->set[1]->count; + if (index < n) break; + index -= n; + } + assert(pc != NULL, "walked off the end in list_links"); + issue_links_for_choice(lkg, pc); + list_links(lkg, pc->set[0], index % pc->set[0]->count); + list_links(lkg, pc->set[1], index / pc->set[0]->count); +} + +static void list_random_links(Linkage lkg, extractor_t * pex, const Parse_set * set) +{ + Parse_choice *pc; + int num_pc, new_index; + + if (set == NULL || set->first == NULL) return; + num_pc = 0; + for (pc = set->first; pc != NULL; pc = pc->next) { + num_pc++; + } + assert(num_pc != 0, "Couldn't get a random parse choice"); + + new_index = rand_r(&pex->rand_state) % num_pc; + + num_pc = 0; + for (pc = set->first; pc != NULL; pc = pc->next) { + if (new_index == num_pc) break; + num_pc++; + } + + issue_links_for_choice(lkg, pc); + list_random_links(lkg, pex, pc->set[0]); + list_random_links(lkg, pex, pc->set[1]); +} + +/** + * Generate the list of all links of the index'th parsing of the + * sentence. 
For this to work, you must have already called parse, and + * already built the whole_set. + */ +void extract_links(extractor_t * pex, Linkage lkg) +{ + int index = lkg->lifo.index; + if (index < 0) + { + bool repeatable = false; + if (0 == pex->rand_state) repeatable = true; + if (repeatable) pex->rand_state = index; + list_random_links(lkg, pex, pex->parse_set); + if (repeatable) + pex->rand_state = 0; + else + lkg->sent->rand_state = pex->rand_state; + } + else { + list_links(lkg, pex->parse_set, index); + } +} diff -Nru link-grammar-5.3.16/link-grammar/parse/extract-links.h link-grammar-5.5.0/link-grammar/parse/extract-links.h --- link-grammar-5.3.16/link-grammar/parse/extract-links.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/parse/extract-links.h 2017-05-29 08:07:13.000000000 +0000 @@ -0,0 +1,30 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this softwares. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#ifndef _EXTRACT_LINKS_H +#define _EXTRACT_LINKS_H + +#include "api-structures.h" +#include "link-includes.h" + +typedef struct extractor_s extractor_t; + +extractor_t* extractor_new(int nwords, unsigned int rand_state); +void free_extractor(extractor_t*); + +bool build_parse_set(extractor_t*, Sentence, + fast_matcher_t*, count_context_t*, + unsigned int null_count, Parse_Options); + +void extract_links(extractor_t*, Linkage); + +#endif /* _EXTRACT_LINKS_H */ diff -Nru link-grammar-5.3.16/link-grammar/parse/fast-match.c link-grammar-5.5.0/link-grammar/parse/fast-match.c --- link-grammar-5.3.16/link-grammar/parse/fast-match.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/parse/fast-match.c 2018-03-11 23:13:19.000000000 +0000 @@ -0,0 +1,583 @@ +/**************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/**************************************************************************/ + +#include "api-structures.h" // For Sentence_s +#include "connectors.h" +#include "disjunct-utils.h" +#include "fast-match.h" +#include "string-set.h" +#include "dict-common/dict-common.h" // For contable +#include "tokenize/word-structures.h" // For Word_struct +#include "tokenize/wordgraph.h" +#include "tokenize/tok-structures.h" // XXX TODO provide gword access methods! 
+#include "utilities.h" // UNREACHABLE + +/** + * The entire goal of this file is provide a fast lookup of all of the + * disjuncts on a given word that might be able to connect to a given + * connector on the left or the right. The main entry point is + * form_match_list(), which performs this lookup. + * + * The lookup is fast, because it uses a precomputed lookup table to + * find the match candidates. The lookup table is stocked by looking + * at all disjuncts on all words, and sorting them into bins organized + * by connectors they could potentially connect to. The lookup table + * is created by calling the alloc_fast_matcher() function. + * + * free_fast_matcher() is used to free the matcher. + * form_match_list() manages its memory as a "stack" - match-lists are + * pushed on this stack. The said stack size gets over 2048 entries only + * for long and/or complex sentences. + * pop_match_list() releases the memory that form_match_list() returned + * by unwinding this stack. + */ + +#define MATCH_LIST_SIZE_INIT 4096 /* the initial size of the match-list stack */ +#define MATCH_LIST_SIZE_INC 2 /* match-list stack increase size factor */ + +/** + * Returns the number of disjuncts in the list that have non-null + * left connector lists. + */ +static int left_disjunct_list_length(const Disjunct * d) +{ + int i; + for (i=0; d!=NULL; d=d->next) { + if (d->left != NULL) i++; + } + return i; +} + +static int right_disjunct_list_length(const Disjunct * d) +{ + int i; + for (i=0; d!=NULL; d=d->next) { + if (d->right != NULL) i++; + } + return i; +} + +/** + * Push a match-list element into the match-list array. 
+ */ +static void push_match_list_element(fast_matcher_t *ctxt, Disjunct *d) +{ + if (ctxt->match_list_end >= ctxt->match_list_size) + { + ctxt->match_list_size *= MATCH_LIST_SIZE_INC; + /* XXX the realloc clobbers xalloc count */ + ctxt->match_list = realloc(ctxt->match_list, + ctxt->match_list_size * sizeof(*ctxt->match_list)); + } + + ctxt->match_list[ctxt->match_list_end++] = d; +} + +/** + * Free all of the hash tables and Match_nodes + */ +void free_fast_matcher(Sentence sent, fast_matcher_t *mchxt) +{ + if (NULL == mchxt) return; + + for (WordIdx w = 0; w < mchxt->size; w++) + { + xfree((char *)mchxt->l_table[w], mchxt->l_table_size[w] * sizeof (Match_node *)); + xfree((char *)mchxt->r_table[w], mchxt->r_table_size[w] * sizeof (Match_node *)); + } + + free(mchxt->match_list); + lgdebug(6, "Sentence size %zu, match_list_size %zu\n", + mchxt->size, mchxt->match_list_size); + + xfree(mchxt->l_table_size, mchxt->size * sizeof(unsigned int)); + xfree(mchxt->l_table, mchxt->size * sizeof(Match_node **)); + xfree(mchxt, sizeof(fast_matcher_t)); +} + +/** + * Adds the match node m to the sorted list of match nodes l. + * Makes the list sorted from smallest to largest. + */ +static Match_node * add_to_right_table_list(Match_node * m, Match_node * l) +{ + Match_node *p, *prev; + + if (l == NULL) return m; + + /* Insert m at head of list */ + if ((m->d->right->nearest_word) <= (l->d->right->nearest_word)) + { + m->next = l; + return m; + } + + /* Walk list to insertion point */ + prev = l; + p = prev->next; + while (p != NULL && ((m->d->right->nearest_word) > (p->d->right->nearest_word))) + { + prev = p; + p = p->next; + } + + m->next = p; + prev->next = m; + + return l; /* return pointer to original head */ +} + +/** + * Adds the match node m to the sorted list of match nodes l. 
+ * Makes the list sorted from largest to smallest + */ +static Match_node * add_to_left_table_list(Match_node * m, Match_node * l) +{ + Match_node *p, *prev; + + if (l == NULL) return m; + + /* Insert m at head of list */ + if ((m->d->left->nearest_word) >= (l->d->left->nearest_word)) + { + m->next = l; + return m; + } + + /* Walk list to insertion point */ + prev = l; + p = prev->next; + while (p != NULL && ((m->d->left->nearest_word) < (p->d->left->nearest_word))) + { + prev = p; + p = p->next; + } + + m->next = p; + prev->next = m; + + return l; /* return pointer to original head */ +} + +/** + * Compare only the uppercase part of two connectors. + * Return true if they are the same, else false. + */ +static bool con_uc_eq(const Connector *c1, const Connector *c2) +{ + return (connector_uc_num(c1) == connector_uc_num(c2)); +} + +static Match_node **get_match_table_entry(unsigned int size, Match_node **t, + Connector * c, int dir) +{ + unsigned int h, s; + + s = h = connector_uc_hash(c) & (size-1); + + if (dir == 1) { + while (NULL != t[h]) + { + if (con_uc_eq(t[h]->d->right, c)) break; + + /* Increment and try again. Every hash bucket MUST have + * a unique upper-case part, since later on, we only + * compare the lower-case parts, assuming upper-case + * parts are already equal. So just look for the next + * unused hash bucket. + */ + h = (h + 1) & (size-1); + if (NULL == t[h]) break; + if (h == s) return NULL; + } + } + else + { + while (NULL != t[h]) + { + if (con_uc_eq(t[h]->d->left, c)) break; + h = (h + 1) & (size-1); + if (NULL == t[h]) break; + if (h == s) return NULL; + } + } + + return &t[h]; +} + +/** + * The disjunct d (whose left or right pointer points to c) is put + * into the appropriate hash table + * dir = 1, we're putting this into a right table. + * dir = -1, we're putting this into a left table. 
+ */
+static void put_into_match_table(Sentence sent, unsigned int size,
+                                 Match_node ** t, Disjunct * d,
+                                 Connector * c, int dir)
+{
+	Match_node *m, **xl;
+
+	m = pool_alloc(sent->fm_Match_node);
+	m->next = NULL;
+	m->d = d;
+
+	xl = get_match_table_entry(size, t, c, dir);
+	assert(NULL != xl, "get_match_table_entry: Overflow");
+	if (dir == 1) {
+		*xl = add_to_right_table_list(m, *xl);
+	}
+	else
+	{
+		*xl = add_to_left_table_list(m, *xl);
+	}
+}
+
+fast_matcher_t* alloc_fast_matcher(const Sentence sent)
+{
+	unsigned int size;
+	size_t w;
+	size_t len;
+	Match_node ** t;
+	Disjunct * d;
+	fast_matcher_t *ctxt;
+
+	ctxt = (fast_matcher_t *) xalloc(sizeof(fast_matcher_t));
+	ctxt->size = sent->length;
+	ctxt->l_table_size = xalloc(2 * sent->length * sizeof(unsigned int));
+	ctxt->r_table_size = ctxt->l_table_size + sent->length;
+	ctxt->l_table = xalloc(2 * sent->length * sizeof(Match_node **));
+	ctxt->r_table = ctxt->l_table + sent->length;
+	memset(ctxt->l_table, 0, 2 * sent->length * sizeof(Match_node **));
+
+	ctxt->match_list_size = MATCH_LIST_SIZE_INIT;
+	ctxt->match_list = xalloc(ctxt->match_list_size * sizeof(*ctxt->match_list));
+	ctxt->match_list_end = 0;
+
+	if (NULL != sent->fm_Match_node)
+	{
+		pool_reuse(sent->fm_Match_node);
+	}
+	else
+	{
+		sent->fm_Match_node =
+			pool_new(__func__, "Match_node",
+			         /*num_elements*/2048, sizeof(Match_node),
+			         /*zero_out*/false, /*align*/true, /*exact*/false);
+	}
+
+	for (w=0; w<sent->length; w++)
+	{
+		len = left_disjunct_list_length(sent->word[w].d);
+		len = MIN(sent->dict->contable.num_con, len);
+		size = next_power_of_two_up(len);
+		ctxt->l_table_size[w] = size;
+		t = ctxt->l_table[w] = (Match_node **) xalloc(size * sizeof(Match_node *));
+		memset(t, 0, size * sizeof(Match_node *));
+
+		for (d = sent->word[w].d; d != NULL; d = d->next)
+		{
+			if (d->left != NULL)
+			{
+				//printf("%s %d\n", connector_string(d->left), d->left->length_limit);
+				put_into_match_table(sent, size, t, d, d->left, -1);
+			}
+		}
+
+		len =
right_disjunct_list_length(sent->word[w].d); + len = MIN(sent->dict->contable.num_con, len); + size = next_power_of_two_up(len); + ctxt->r_table_size[w] = size; + t = ctxt->r_table[w] = (Match_node **) xalloc(size * sizeof(Match_node *)); + memset(t, 0, size * sizeof(Match_node *)); + + for (d = sent->word[w].d; d != NULL; d = d->next) + { + if (d->right != NULL) + { + //printf("%s %d\n", connector_string(d->right), d->right->length_limit); + put_into_match_table(sent, size, t, d, d->right, 1); + } + } + } + + return ctxt; +} + +#if 0 +/** + * Print statistics on various connector matching aspects. + * A summary can be found by the shell commands: + * link-parser < file.batch | grep match_stats: | sort | uniq -c + */ +static void match_stats(Connector *c1, Connector *c2) +{ + if (NULL == c1) printf("match_stats: cache\n"); + if (NULL == c2) return; + if ((1 == c1->uc_start) && (1 == c2->uc_start) && + (c1->string[0] == c2->string[0])) + { + printf("match_stats: h/d mismatch\n"); + } + + if (0 == c1->lc_start) printf("match_stats: no lc (c1)\n"); + if (0 == c2->lc_start) printf("match_stats: no lc (c2)\n"); + + if (string_set_cmp(c1->string, c2->string)) printf("match_stats: same\n"); + + const char *a = &c1->string[c1->lc_start]; + const char *b = &c2->string[c2->lc_start]; + do + { + if (*a != *b && (*a != '*') && (*b != '*')) printf("match_stats: lc false\n"); + a++; + b++; + } while (*a != '\0' && *b != '\0'); + printf("match_stats: lc true\n"); +} +#else +#define match_stats(a, b) +#endif + +#ifdef DEBUG +#undef N +#define N(c) (c?connector_string(c):"") + +/** + * Print the match list, including connector match indications. + * Usage: link-parser -verbosity=9 -debug=print_match_list + * Output format: + * MATCH_NODE list_id: lw>lc [=] leftright [=] rcmatch_list[mlb]; + + for (; NULL != *m; m++) + { + Disjunct *d = *m; + + prt_error("MATCH_NODE %5d: %02d>%-9s %c %9s<%02d>%-9s %c %9s<%02d\n", + id, lw , N(lc), d->match_left ? 
'=': ' ', + N(d->left), w, N(d->right), + d->match_right? '=' : ' ', N(rc), rw); + } +} +#else +#define print_match_list(...) +#endif + +typedef struct +{ + const condesc_t *desc; + bool match; +} match_cache; + +/** + * Match the lower-case parts of connectors, and the head-dependent, + * using a cache of the most recent compare. Due to the way disjuncts + * are written, we are often asked to compare to the same connector + * 3 or 4 times in a row. So if we already did that compare, just use + * the cached result. (i.e. the caching here is almost trivial, but it + * works well). + */ +static bool do_match_with_cache(Connector *a, Connector *b, match_cache *c_con) +{ + /* The following uses a string-set compare - string_set_cmp() cannot + * be used here because c_con->string may be NULL. */ + match_stats(c_con->string == a->string ? NULL : a, NULL); + UNREACHABLE(connector_desc(a) == NULL); // clang static analyzer suppression. + if (c_con->desc == connector_desc(a)) + { + /* The match_cache desc field is initialized to NULL, and this is + * enough because the connector desc filed cannot be NULL, as it + * actually fetched a non-empty match list. */ + PRAGMA_MAYBE_UNINITIALIZED + return c_con->match; + PRAGMA_END + } + + /* No cache match. Check if the connectors match and cache the result. + * We know that the uc parts of the connectors are the same, because + * we fetch the matching lists according to the uc part or the + * connectors to be matched. So the uc parts are not checked here. */ + c_con->match = lc_easy_match(connector_desc(a), connector_desc(b)); + c_con->desc = connector_desc(a); + + return c_con->match; +} + +typedef struct +{ + const Gword *gword; + bool same_alternative; +} gword_cache; + +/** + * Return true iff c1 and c2 are from the same alternative. + * An optimization for English checks if one of the connectors belongs + * to an original sentence word (c2 is checked first for an inline + * optimization opportunity). 
+ * If a wordgraph word of the checked connector is the same + * as of the previously checked one, use the cached result. + * (The first wordgraph word is used for cache validity indication, + * but there is only one most of the times anyway.) + */ +#define ALT_CONNECTION_POSSIBLE +#define OPTIMIZE_EN +static bool alt_connection_possible(Connector *c1, Connector *c2, + gword_cache *c_con) +{ +#ifdef ALT_CONNECTION_POSSIBLE + bool same_alternative = false; + +#ifdef OPTIMIZE_EN + /* Try a shortcut first. */ + if ((c2->originating_gword->o_gword->hier_depth == 0) || + (c1->originating_gword->o_gword->hier_depth == 0)) + { + return true; + } +#endif /* OPTIMIZE_EN */ + + if (c1->originating_gword->o_gword == c_con->gword) + return c_con->same_alternative; + + /* Each of the loops is of one iteration most of the times. */ + for (const gword_set *ga = c1->originating_gword; NULL != ga; ga = ga->next) { + for (const gword_set *gb = c2->originating_gword; NULL != gb; gb = gb->next) { + if (in_same_alternative(ga->o_gword, gb->o_gword)) { + same_alternative = true; + break; + } + } + if (same_alternative) break; + } + + c_con->same_alternative = same_alternative; + c_con->gword = c1->originating_gword->o_gword; + + + return same_alternative; +#else + return true; +#endif /* ALT_CONNECTION_POSSIBLE */ +} + +/** + * Forms and returns a list of disjuncts coming from word w, that + * actually matches lc or rc or both. The lw and rw are the words from + * which lc and rc came respectively. + * + * The list is returned in an array of Match_nodes. This list + * contains no duplicates, because when processing the ml list, only + * elements whose match_left is true are included, and such elements are + * not included again when processing the mr list. 
+ * + * Note that if both lc and rc match the corresponding connectors of w, + * match_left is set to true when the ml list is processed and the + * disjunct is then added to the result list, and match_right of the + * same disjunct is set to true when the mr list is processed, and this + * disjunct is not added again. + */ +size_t +form_match_list(fast_matcher_t *ctxt, int w, + Connector *lc, int lw, + Connector *rc, int rw) +{ + Match_node *mx, *mr_end, **mxp; + size_t front = ctxt->match_list_end; + Match_node *ml = NULL, *mr = NULL; + match_cache mc; + gword_cache gc; + + gc.same_alternative = false; + +#ifdef VERIFY_MATCH_LIST + static int id = 0; + int lid = ++id; /* A local copy, for multi-threading support. */ +#endif + + /* Get the lists of candidate matching disjuncts of word w for lc and + * rc. Consider each of these lists only if the length_limit of lc + * rc and also w, is not greater then the distance between their word + * and the word w. */ + if ((lc != NULL) && ((w - lw) <= lc->length_limit)) + { + mxp = get_match_table_entry(ctxt->l_table_size[w], ctxt->l_table[w], lc, -1); + if (NULL != mxp) ml = *mxp; + } + if ((rc != NULL) && ((rw - w) <= rc->length_limit)) + { + mxp = get_match_table_entry(ctxt->r_table_size[w], ctxt->r_table[w], rc, 1); + if (NULL != mxp) mr = *mxp; + } + + for (mx = mr; mx != NULL; mx = mx->next) + { + if (mx->d->right->nearest_word > rw) break; + mx->d->match_left = false; + } + mr_end = mx; + + /* Construct the list of things that could match the left. 
*/ + mc.desc = NULL; + gc.gword = NULL; + for (mx = ml; mx != NULL; mx = mx->next) + { + if (mx->d->left->nearest_word < lw) break; + if ((w - lw) > mx->d->left->length_limit) continue; + + mx->d->match_left = do_match_with_cache(mx->d->left, lc, &mc) && + alt_connection_possible(mx->d->left, lc, &gc); + if (!mx->d->match_left) continue; + mx->d->match_right = false; + +#ifdef VERIFY_MATCH_LIST + mx->d->match_id = lid; +#endif + push_match_list_element(ctxt, mx->d); + } + + /* Append the list of things that could match the right. + * Note that it is important to set here match_right correctly even + * if we are going to skip this element here because its match_left + * is true, since then it means it is already included in the match + * list. */ + mc.desc = NULL; + gc.gword = NULL; + for (mx = mr; mx != mr_end; mx = mx->next) + { + if ((rw - w) > mx->d->right->length_limit) continue; + + mx->d->match_right = do_match_with_cache(mx->d->right, rc, &mc) && + alt_connection_possible(mx->d->right, rc, &gc); + if (!mx->d->match_right || mx->d->match_left) continue; + +#ifdef VERIFY_MATCH_LIST + mx->d->match_id = lid; +#endif + push_match_list_element(ctxt, mx->d); + } + + push_match_list_element(ctxt, NULL); + print_match_list(ctxt, lid, front, w, lc, lw, rc, rw); + return front; +} diff -Nru link-grammar-5.3.16/link-grammar/parse/fast-match.h link-grammar-5.5.0/link-grammar/parse/fast-match.h --- link-grammar-5.3.16/link-grammar/parse/fast-match.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/parse/fast-match.h 2018-03-06 02:44:41.000000000 +0000 @@ -0,0 +1,67 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. 
*/ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _FAST_MATCH_H_ +#define _FAST_MATCH_H_ + +#include // for size_t +#include "api-types.h" +#include "link-includes.h" // for Sentence +#include "memory-pool.h" + +typedef struct Match_node_struct Match_node; +struct Match_node_struct +{ + Match_node * next; + Disjunct * d; +}; + +typedef struct fast_matcher_s fast_matcher_t; +struct fast_matcher_s +{ + size_t size; + unsigned int *l_table_size; /* the sizes of the hash tables */ + unsigned int *r_table_size; + + /* the beginnings of the hash tables */ + Match_node *** l_table; + Match_node *** r_table; + + /* I'll pedantically maintain my own array of these cells */ + Disjunct ** match_list; /* match-list stack */ + size_t match_list_end; /* index to the match-list stack end */ + size_t match_list_size; /* number of allocated elements */ +}; + +/* See the source file for documentation. */ +fast_matcher_t* alloc_fast_matcher(const Sentence); +void free_fast_matcher(Sentence sent, fast_matcher_t*); + +size_t form_match_list(fast_matcher_t *, int, Connector *, int, Connector *, int); + +/** + * Return the match-list element at the given index. 
+ */ +static inline Disjunct *get_match_list_element(fast_matcher_t *ctxt, size_t mli) +{ + return ctxt->match_list[mli]; +} + +/** + * Pop up the match-list stack + */ +static inline void pop_match_list(fast_matcher_t *ctxt, size_t match_list_last) +{ + ctxt->match_list_end = match_list_last; +} + +#endif diff -Nru link-grammar-5.3.16/link-grammar/parse/histogram.c link-grammar-5.5.0/link-grammar/parse/histogram.c --- link-grammar-5.3.16/link-grammar/parse/histogram.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/parse/histogram.c 2017-06-17 01:29:32.000000000 +0000 @@ -0,0 +1,181 @@ +/*************************************************************************/ +/* Copyright (c) 2015 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include +#include "histogram.h" + +#ifdef PERFORM_COUNT_HISTOGRAMMING +/* A histogram distribution of the parse counts. */ + +Count_bin hist_zero(void) +{ + static Count_bin zero + = {0, 0, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, 0}; + return zero; +} + +Count_bin hist_one(void) +{ + static Count_bin one + = {0, 1, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, 0}; + return one; +} + +#define BIN_WIDTH 0.334 + +/** + * Accumulate counts in 'a', adding them to sum. + * The histogram is shifted by the amount 'cost'. + * That is, the bins are shifted over by the integer part of the cost + * (scaled to the bin-width). + */ +void hist_accum(Count_bin* sum, double cost, const Count_bin* a) +{ + unsigned int i; + unsigned int start; + + // Skip, if nothing to accumulate. 
+ if (0 == a->total) return; + sum->total += a->total; + + // The cost tells us how much to offset the histogram in a, + // before accumulating it. 'base' is the bin number of the first + // non-empty bin. + start = (unsigned int) floor (cost / BIN_WIDTH); + if (0 == sum->bin[0]) + { + sum->base = start; + start = 0; + } + + for (i = start; i < NUM_BINS; i++) + { + sum->bin[i] += a->bin[i-start]; + } + for (i = NUM_BINS-start; i < NUM_BINS; i++) + { + sum->overrun += a->bin[i]; + } + sum->overrun += a->overrun; +} + +/** Same as above */ +void hist_accumv(Count_bin* sum, double cost, const Count_bin a) +{ + hist_accum(sum, cost, &a); +} + +/** + * Create a product of two histogrammed counts. + * Observe that doing so requires a kind-of cross-product to + * be performed, thus, a nested double sum. + */ +void hist_prod(Count_bin* prod, const Count_bin* a, const Count_bin* b) +{ + unsigned int i, k; + + // Skip, if the product is zero. + if (0 == a->total || 0 == b->total) return; + prod->total = a->total * b->total; + +// #define SLOW_BUT_SIMPLE 1 +#ifdef SLOW_BUT_SIMPLE + /* The below implements the straight-forward concept of the product. + * Its not quite optimal, because the intialization loop, and the + * if check can be eliminated by re-writing j = k-i. + */ + for (i = 0; i < NUM_BINS; i++) prod->bin[i] = 0; + prod->overrun = 0; + for (i = 0; i < NUM_BINS; i++) + { + for (j = 0; j < NUM_BINS; j++) + { + if (i+j < NUM_BINS) + prod->bin[i+j] += a->bin[i] * b->bin[j]; + else + prod->overrun += a->bin[i] * b->bin[j]; + } + + prod->overrun += a->bin[i] * b->overrun; + prod->overrun += a->overrun * b->bin[i]; + } + prod->overrun += a->overrun * b->overrun; +#else + /* The below does exactly the same thing as the above, but + * ever so slightly more quickly. Some pointless checks get + * eliminated. 
+ */ + prod->overrun = 0; + for (k = 0; k < NUM_BINS; k++) + { + prod->bin[k] = 0; + for (i = 0; i <= k; i++) + { + prod->bin[k] += a->bin[i] * b->bin[k-i]; + } + prod->overrun += a->bin[k] * b->overrun; + prod->overrun += a->overrun * b->bin[k]; + } + for (k = NUM_BINS; k < 2 * NUM_BINS - 1; k++) + { + for (i = k - NUM_BINS + 1; i < NUM_BINS; i++) + { + prod->overrun += a->bin[i] * b->bin[k-i]; + } + } + prod->overrun += a->overrun * b->overrun; +#endif +} + +/** + * Multiply two histograms 'a' and 'b', and accumulate them into 'acc'. + * The accumulated histogram is first shifted by 'cost'. + */ +void hist_muladd(Count_bin* acc, const Count_bin* a, double cost, const Count_bin* b) +{ + Count_bin tmp = hist_zero(); + hist_prod(&tmp, a, b); + hist_accum(acc, cost, &tmp); +} + +void hist_muladdv(Count_bin* acc, const Count_bin* a, double cost, const Count_bin b) +{ + hist_muladd(acc, a, cost, &b); +} + +double hist_cost_cutoff(Count_bin* hist, int count) +{ + int i; + s64 cnt = 0; + + for (i=0; ibin[i]; + if (count <= cnt) + return ((double) i + hist->base) * BIN_WIDTH; + } + return 1.0e38; +} + +s64 hist_cut_total(Count_bin* hist, int min_total) +{ + int i; + s64 cnt = 0; + + for (i=0; ibin[i]; + if (min_total <= cnt) return cnt; + } + return hist->total; +} + +#endif /* PERFORM_COUNT_HISTOGRAMMING */ diff -Nru link-grammar-5.3.16/link-grammar/parse/histogram.h link-grammar-5.5.0/link-grammar/parse/histogram.h --- link-grammar-5.3.16/link-grammar/parse/histogram.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/parse/histogram.h 2017-06-17 01:29:32.000000000 +0000 @@ -0,0 +1,94 @@ +/*************************************************************************/ +/* Copyright (c) 2015 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. 
*/ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _HISTOGRAM_H_ +#define _HISTOGRAM_H_ + +#ifndef _MSC_VER +typedef long long s64; /* signed 64-bit integer, even on 32-bit cpus */ +#define PARSE_NUM_OVERFLOW (1LL<<24) +#else +/* Microsoft Visual C Version 6 doesn't support long long. */ +typedef signed __int64 s64; /* signed 64-bit integer, even on 32-bit cpus */ +#define PARSE_NUM_OVERFLOW (((s64)1)<<24) +#endif + + +/* + * Count Histogramming is currently not required for anything, and the + * code runs about 6% faster when it is disabled. + * +#define PERFORM_COUNT_HISTOGRAMMING 1 + */ +#ifdef PERFORM_COUNT_HISTOGRAMMING + +/** + * A histogram distribution of the parse counts. + * The histogram is with respect to the cost of the parse. Thus, each + * bin of the histogram contains a count of the number of parses + * achievable with that cost. Rather than setting the baseline cost + * at zero, it is dynamically scaled, so that 'base' is the number of + * the first bin with a non-zero count in it. If there are counts that + * don't fit into the available bins, then they are accumulated into + * the overrun bin. 
It is always the case that + * total == sum_i bin[i] + overrun + */ +#define NUM_BINS 12 +struct Count_bin_s +{ + short base; + s64 total; + s64 bin[NUM_BINS]; + s64 overrun; +}; + +typedef struct Count_bin_s Count_bin; + +Count_bin hist_zero(void); +Count_bin hist_one(void); + +void hist_accum(Count_bin* sum, double, const Count_bin*); +void hist_accumv(Count_bin* sum, double, const Count_bin); +void hist_prod(Count_bin* prod, const Count_bin*, const Count_bin*); +void hist_muladd(Count_bin* prod, const Count_bin*, double, const Count_bin*); +void hist_muladdv(Count_bin* prod, const Count_bin*, double, const Count_bin); + +static inline s64 hist_total(Count_bin* tot) { return tot->total; } +s64 hist_cut_total(Count_bin* tot, int min_total); + +double hist_cost_cutoff(Count_bin*, int count); + +#else + +typedef s64 Count_bin; + +static inline Count_bin hist_zero(void) { return 0; } +static inline Count_bin hist_one(void) { return 1; } + +static inline void hist_accum(Count_bin* sum, double cost, Count_bin* a) + { *sum += *a; } +static inline void hist_accumv(Count_bin* sum, double cost, Count_bin a) + { *sum += a; } +static inline void hist_prod(Count_bin* prod, Count_bin* a, Count_bin* b) + { *prod = (*a) * (*b); } +static inline void hist_muladd(Count_bin* prod, Count_bin* a, double cost, Count_bin* b) + { *prod += (*a) * (*b); } +static inline void hist_muladdv(Count_bin* prod, Count_bin* a, double cost, Count_bin b) + { *prod += (*a) * b; } + +static inline s64 hist_total(Count_bin* tot) { return *tot; } +static inline s64 hist_cut_total(Count_bin* tot, int min_total) { return *tot; } + +static inline double hist_cost_cutoff(Count_bin* tot, int count) { return 1.0e38; } + +#endif /* PERFORM_COUNT_HISTOGRAMMING */ + +#endif /* _HISTOGRAM_H_ */ diff -Nru link-grammar-5.3.16/link-grammar/parse/parse.c link-grammar-5.5.0/link-grammar/parse/parse.c --- link-grammar-5.3.16/link-grammar/parse/parse.c 1970-01-01 00:00:00.000000000 +0000 +++ 
link-grammar-5.5.0/link-grammar/parse/parse.c 2018-03-11 23:13:19.000000000 +0000 @@ -0,0 +1,432 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright 2008, 2009, 2013, 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include +#include "api-structures.h" +#include "count.h" +#include "dict-common/dict-common.h" // For Dictionary_s +#include "disjunct-utils.h" +#include "extract-links.h" +#include "fast-match.h" +#include "linkage/analyze-linkage.h" +#include "linkage/linkage.h" +#include "linkage/sane.h" +#include "parse.h" +#include "post-process/post-process.h" +#include "preparation.h" +#include "prune.h" +#include "resources.h" +#include "tokenize/word-structures.h" // For Word_struct + +#define D_PARSE 5 /* Debug level for this file. 
*/ + +static Linkage linkage_array_new(int num_to_alloc) +{ + Linkage lkgs = (Linkage) malloc(num_to_alloc * sizeof(struct Linkage_s)); + memset(lkgs, 0, num_to_alloc * sizeof(struct Linkage_s)); + return lkgs; +} + +static bool setup_linkages(Sentence sent, extractor_t* pex, + fast_matcher_t* mchxt, + count_context_t* ctxt, + Parse_Options opts) +{ + bool overflowed = build_parse_set(pex, sent, mchxt, ctxt, sent->null_count, opts); + print_time(opts, "Built parse set"); + + if (overflowed && (1 < opts->verbosity)) + { + err_ctxt ec = { sent }; + err_msgc(&ec, lg_Warn, "Count overflow.\n" + "Considering a random subset of %zu of an unknown and large number of linkages\n", + opts->linkage_limit); + } + + if (sent->num_linkages_found == 0) + { + sent->num_linkages_alloced = 0; + sent->num_linkages_post_processed = 0; + sent->num_valid_linkages = 0; + sent->lnkages = NULL; + return overflowed; + } + + sent->num_linkages_alloced = + MIN(sent->num_linkages_found, (int) opts->linkage_limit); + + /* Now actually malloc the array in which we will process linkages. */ + /* We may have been called before, e.g. this might be a panic parse, + * and the linkages array may still be there from last time. + * XXX free_linkages() zeros sent->num_linkages_found. */ + if (sent->lnkages) free_linkages(sent); + sent->lnkages = linkage_array_new(sent->num_linkages_alloced); + + return overflowed; +} + +/** + * Print the chosen_disjuncts words. + * This is used for debug, e.g. for tracking them in the Wordgraph display. + */ +static void print_chosen_disjuncts_words(const Linkage lkg, bool prt_optword) +{ + size_t i; + dyn_str *djwbuf = dyn_str_new(); + + err_msg(lg_Debug, "Linkage %p (%zu words): ", lkg, lkg->num_words); + for (i = 0; i < lkg->num_words; i++) + { + Disjunct *cdj = lkg->chosen_disjuncts[i]; + const char *djw; /* disjunct word - the chosen word */ + + if (NULL == cdj) + djw = (prt_optword && lkg->sent->word[i].optional) ? 
"{}" : "[]"; + else if ('\0' == cdj->word_string[0]) + djw = "\\0"; /* null string - something is wrong */ + else + djw = cdj->word_string; + + dyn_strcat(djwbuf, djw); + dyn_strcat(djwbuf, " "); + } + err_msg(lg_Debug, "%s\n", djwbuf->str); + dyn_str_delete(djwbuf); +} + +#define D_PL 7 +/** + * This fills the linkage array with morphologically-acceptable + * linkages. + */ +static void process_linkages(Sentence sent, extractor_t* pex, + bool overflowed, Parse_Options opts) +{ + if (0 == sent->num_linkages_found) return; + if (0 == sent->num_linkages_alloced) return; /* Avoid a later crash. */ + + /* Pick random linkages if we get more than what was asked for. */ + bool pick_randomly = overflowed || + (sent->num_linkages_found > (int) sent->num_linkages_alloced); + + sent->num_valid_linkages = 0; + size_t N_invalid_morphism = 0; + + int itry = 0; + size_t in = 0; + int maxtries; + + /* In the case of overflow, which will happen for some long + * sentences, but is particularly common for the amy/ady random + * splitters, we want to find as many morpho-acceptable linkages + * as possible, but keep the CPU usage down, as these might be + * very rare. This is due to a bug/feature in the interaction + * between the word-graph and the parser: valid morph linkages + * can be one-in-a-thousand.. or worse. Search for them, but + * don't over-do it. + * Note: This problem has recently been alleviated by an + * alternatives-compatibility check in the fast matcher - see + * alt_connection_possible(). + */ +#define MAX_TRIES 250000 + + if (pick_randomly) + { + /* Try picking many more linkages, but not more than possible. 
*/ + maxtries = MIN((int) sent->num_linkages_alloced + MAX_TRIES, + sent->num_linkages_found); + } + else + { + maxtries = sent->num_linkages_alloced; + } + + bool need_init = true; + for (itry=0; itrylnkages[in]; + Linkage_info * lifo = &lkg->lifo; + + /* Negative values tell extract-links to pick randomly; for + * reproducible-rand, the actual value is the rand seed. */ + lifo->index = pick_randomly ? -(itry+1) : itry; + + if (need_init) + { + partial_init_linkage(sent, lkg, sent->length); + need_init = false; + } + extract_links(pex, lkg); + compute_link_names(lkg, sent->string_set); + + if (verbosity_level(+D_PL)) + { + err_msg(lg_Debug, "chosen_disjuncts before:\n\\"); + print_chosen_disjuncts_words(lkg, /*prt_opt*/true); + } + + if (sane_linkage_morphism(sent, lkg, opts)) + { + remove_empty_words(lkg); + + if (verbosity_level(+D_PL)) + { + err_msg(lg_Debug, "chosen_disjuncts after:\n\\"); + print_chosen_disjuncts_words(lkg, /*prt_opt*/false); + } + + need_init = true; + in++; + if (in >= sent->num_linkages_alloced) break; + } + else + { + N_invalid_morphism++; + lkg->num_links = 0; + lkg->num_words = sent->length; + // memset(lkg->link_array, 0, lkg->lasz * sizeof(Link)); + memset(lkg->chosen_disjuncts, 0, sent->length * sizeof(Disjunct *)); + } + } + + /* The last one was alloced, but never actually used. Free it. */ + if (!need_init) free_linkage(&sent->lnkages[in]); + + sent->num_valid_linkages = in; + + /* The remainder of the array is garbage; we never filled it in. 
+ * So just pretend that it's shorter than it is */ + sent->num_linkages_alloced = sent->num_valid_linkages; + + lgdebug(D_PARSE, "Info: sane_morphism(): %zu of %d linkages had " + "invalid morphology construction\n", N_invalid_morphism, + itry + (itry != maxtries)); +} + +static void sort_linkages(Sentence sent, Parse_Options opts) +{ + if (0 == sent->num_linkages_found) return; + + /* It they're randomized, don't bother sorting */ + if (0 != sent->rand_state && sent->dict->shuffle_linkages) return; + + qsort((void *)sent->lnkages, sent->num_linkages_alloced, + sizeof(struct Linkage_s), + (int (*)(const void *, const void *))opts->cost_model.compare_fn); + +#ifdef DEBUG + /* Skip in case of a timeout - sent->lnkages may be inconsistent then. */ + if (!resources_exhausted(opts->resources)) + { + /* num_linkages_post_processed sanity check (ONLY). */ + size_t in; + size_t N_linkages_post_processed = 0; + for (in=0; in < sent->num_linkages_alloced; in++) + { + Linkage_info *lifo = &sent->lnkages[in].lifo; + if (lifo->discarded) break; + N_linkages_post_processed++; + } + assert(sent->num_linkages_post_processed==N_linkages_post_processed, + "Bad num_linkages_post_processed (%zu!=%zu)", + sent->num_linkages_post_processed, N_linkages_post_processed); + } +#endif + + print_time(opts, "Sorted all linkages"); +} + +#define SHORTEST_SENTENCE_TO_PACK 9 + +/** + * Pack all disjunct and connectors into one big memory block. + * This facilitate a better memory caching for long sentences + * (a performance gain of a few percents). + * + * The current Connector struct size is 32 bit, and future ones may be + * smaller, but still with a power-of-2 size. + * The idea is to put an integral number of connectors in each cache line + * (assumed to be >= Connector struct size, e.g. 64 bytes), + * so one connector will not need 2 cache lines. + * + * The allocated memory includes 3 sections , in that order: + * 1. 
A block for disjuncts, when it start is not aligned (the disjunct size + * is currently 56 bytes and cannot be reduced much). + * 2. A small alignment gap, that ends in a 64-byte boundary. + * 3. A block of connectors, which is so aligned to 64-byte boundary. + * + * FIXME: 1. Find the "best" value for SHORTEST_SENTENCE_TO_PACK. + * 2. Maybe this check should be done in too stages, the second one + * will use number of disjunct and connector thresholds. + */ +static void pack_sentence(Sentence sent) +{ + int dcnt = 0; + int ccnt = 0; + + if (sent->length < SHORTEST_SENTENCE_TO_PACK) return; + for (size_t w = 0; w < sent->length; w++) + { + Disjunct *d; + + for (d = sent->word[w].d; NULL != d; d = d->next) + { + dcnt++; + for (Connector *c = d->right; c!=NULL; c = c->next) ccnt++; + for (Connector *c = d->left; c != NULL; c = c->next) ccnt++; + } + } + +#define CONN_ALIGNMENT sizeof(Connector) + size_t dsize = dcnt * sizeof(Disjunct); + dsize = ALIGN(dsize, CONN_ALIGNMENT); /* Align connector block. */ + size_t csize = ccnt * sizeof(Connector); + void *memblock = malloc(dsize + csize); + Disjunct *dblock = memblock; + Connector *cblock = (Connector *)((char *)memblock + dsize); + sent->disjuncts_connectors_memblock = memblock; + + for (size_t i = 0; i < sent->length; i++) + { + Disjunct *word_disjuncts = sent->word[i].d; + + sent->word[i].d = pack_disjuncts_dup(sent->word[i].d, &dblock, &cblock); + free_disjuncts(word_disjuncts); + } +} + + +/** + * classic_parse() -- parse the given sentence. + * Perform parsing, using the original link-grammar parsing algorithm + * given in the original link-grammar papers. + * + * Do the parse with the minimum number of null-links within the range + * specified by opts->min_null_count and opts->max_null_count. + * + * To that end, call do_parse() with an increasing null_count, from + * opts->min_null_count up to (including) opts->max_null_count, until a + * parse is found. 
+ * + * A note about the disjuncts save/restore that is done here: + * To increase the parsing speed, before invoking do_parse(), + * pp_and_power_prune() is invoked to remove connectors which have no + * possibility to connect. It includes a significant optimization when + * null_count==0 that makes a more aggressive removal, but this + * optimization is not appropriate when null_count>0. + * + * So in case this optimization has been done and a complete parse (i.e. + * a parse when null_count==0) is not found, we are left with sentence + * disjuncts which are not appropriate to continue do_parse() tries with + * null_count>0. To solve that, we need to restore the original + * disjuncts of the sentence and call pp_and_power_prune() once again. + */ +void classic_parse(Sentence sent, Parse_Options opts) +{ + fast_matcher_t * mchxt = NULL; + count_context_t * ctxt = NULL; + bool pp_and_power_prune_done = false; + Disjunct **disjuncts_copy = NULL; + bool is_null_count_0 = (0 == opts->min_null_count); + int max_null_count = MIN((int)sent->length, opts->max_null_count); + + /* Build lists of disjuncts */ + prepare_to_parse(sent, opts); + if (resources_exhausted(opts->resources)) return; + + if (is_null_count_0 && (0 < max_null_count)) + { + /* Save the disjuncts in case we need to parse with null_count>0. */ + disjuncts_copy = alloca(sent->length * sizeof(Disjunct *)); + for (size_t i = 0; i < sent->length; i++) + disjuncts_copy[i] = disjuncts_dup(sent->word[i].d); + } + + for (int nl = opts->min_null_count; nl <= max_null_count; nl++) + { + Count_bin hist; + s64 total; + + if (!pp_and_power_prune_done) + { + if (0 != nl) + { + pp_and_power_prune_done = true; + if (is_null_count_0) + opts->min_null_count = 1; /* Don't optimize for null_count==0. */ + + /* We are parsing now with null_count>0, when previously we + * parsed with null_count==0. Restore the save disjuncts. 
*/ + if (NULL != disjuncts_copy) + { + free_sentence_disjuncts(sent); + for (size_t i = 0; i < sent->length; i++) + sent->word[i].d = disjuncts_copy[i]; + disjuncts_copy = NULL; + } + } + pp_and_power_prune(sent, opts); + if (is_null_count_0) opts->min_null_count = 0; + if (resources_exhausted(opts->resources)) break; + + free_count_context(ctxt, sent); + free_fast_matcher(sent, mchxt); + pack_sentence(sent); + ctxt = alloc_count_context(sent); + mchxt = alloc_fast_matcher(sent); + print_time(opts, "Initialized fast matcher"); + } + + if (resources_exhausted(opts->resources)) break; + free_linkages(sent); + + sent->null_count = nl; + hist = do_parse(sent, mchxt, ctxt, sent->null_count, opts); + total = hist_total(&hist); + + lgdebug(D_PARSE, "Info: Total count with %zu null links: %lld\n", + sent->null_count, total); + + /* total is 64-bit, num_linkages_found is 32-bit. Clamp */ + total = (total > INT_MAX) ? INT_MAX : total; + total = (total < 0) ? INT_MAX : total; + + sent->num_linkages_found = (int) total; + print_time(opts, "Counted parses"); + + extractor_t * pex = extractor_new(sent->length, sent->rand_state); + bool ovfl = setup_linkages(sent, pex, mchxt, ctxt, opts); + process_linkages(sent, pex, ovfl, opts); + free_extractor(pex); + + post_process_lkgs(sent, opts); + + if (sent->num_valid_linkages > 0) break; + if ((0 == nl) && (0 < max_null_count) && verbosity > 0) + prt_error("No complete linkages found.\n"); + + /* If we are here, then no valid linkages were found. + * If there was a parse overflow, give up now. 
*/ + if (PARSE_NUM_OVERFLOW < total) break; + //if (sent->num_linkages_found > 0 && nl>0) printf("NUM_LINKAGES_FOUND %d\n", sent->num_linkages_found); + } + sort_linkages(sent, opts); + + if (NULL != disjuncts_copy) + { + for (size_t i = 0; i < sent->length; i++) + free_disjuncts(disjuncts_copy[i]); + } + free_count_context(ctxt, sent); + free_fast_matcher(sent, mchxt); +} diff -Nru link-grammar-5.3.16/link-grammar/parse/parse.h link-grammar-5.5.0/link-grammar/parse/parse.h --- link-grammar-5.3.16/link-grammar/parse/parse.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/parse/parse.h 2017-05-29 08:07:13.000000000 +0000 @@ -0,0 +1,3 @@ + + +void classic_parse(Sentence, Parse_Options); diff -Nru link-grammar-5.3.16/link-grammar/parse/preparation.c link-grammar-5.5.0/link-grammar/parse/preparation.c --- link-grammar-5.3.16/link-grammar/parse/preparation.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/parse/preparation.c 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,146 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#include "api-structures.h" +#include "prepare/build-disjuncts.h" +#include "connectors.h" +#include "dict-common/dict-common.h" // For Dictionary_s +#include "disjunct-utils.h" +#include "externs.h" +#include "preparation.h" +#include "print/print.h" +#include "prune.h" +#include "resources.h" +#include "string-set.h" +#include "tokenize/word-structures.h" // for Word_struct + +/** + * Set c->nearest_word to the nearest word that this connector could + * possibly connect to. The connector *might*, in the end, + * connect to something more distant, but this is the nearest + * one that could be connected. + */ +static int set_dist_fields(Connector * c, size_t w, int delta) +{ + int i; + if (c == NULL) return (int) w; + i = set_dist_fields(c->next, w, delta) + delta; + c->nearest_word = i; + return i; +} + +/** + * Initialize the word fields of the connectors, and + * eliminate those disjuncts that are so long, that they + * would need to connect past the end of the sentence. + */ +static void setup_connectors(Sentence sent) +{ + size_t w; + Disjunct * d, * xd, * head; + for (w=0; wlength; w++) + { + head = NULL; + for (d=sent->word[w].d; d!=NULL; d=xd) + { + xd = d->next; + if ((set_dist_fields(d->left, w, -1) < 0) || + (set_dist_fields(d->right, w, 1) >= (int) sent->length)) + { + d->next = NULL; + free_disjuncts(d); + } + else + { + d->next = head; + head = d; + } + } + sent->word[w].d = head; + } +} + +/** + * Record the wordgraph word in each of its connectors. + * It is used for checking alternatives consistency. 
+ */ +static void gword_record_in_connector(Sentence sent) +{ + for (size_t w = 0; w < sent->length; w++) + { + for (Disjunct *d = sent->word[w].d; d != NULL; d = d->next) + { + for (Connector *c = d->right; NULL != c; c = c->next) + c->originating_gword = d->originating_gword; + for (Connector *c = d->left; NULL != c; c = c->next) + c->originating_gword = d->originating_gword; + } + } +} + +/** + * Turn sentence expressions into disjuncts. + * Sentence expressions must have been built, before calling this routine. + */ +static void build_sentence_disjuncts(Sentence sent, double cost_cutoff, + Parse_Options opts) +{ + Disjunct * d; + X_node * x; + size_t w; + for (w = 0; w < sent->length; w++) + { + d = NULL; + for (x = sent->word[w].x; x != NULL; x = x->next) + { + Disjunct *dx = build_disjuncts_for_exp(x->exp, x->string, cost_cutoff, opts); + word_record_in_disjunct(x->word, dx); + d = catenate_disjuncts(dx, d); + } + sent->word[w].d = d; + } +} + +/** + * Assumes that the sentence expression lists have been generated. + */ +void prepare_to_parse(Sentence sent, Parse_Options opts) +{ + size_t i; + + build_sentence_disjuncts(sent, opts->disjunct_cost, opts); + if (verbosity_level(5)) + { + prt_error("Debug: After expanding expressions into disjuncts:\n"); + print_disjunct_counts(sent); + } + print_time(opts, "Built disjuncts"); + + for (i=0; ilength; i++) + { + sent->word[i].d = eliminate_duplicate_disjuncts(sent->word[i].d); + + /* Some long Russian sentences can really blow up, here. 
*/ + if (resources_exhausted(opts->resources)) + return; + } + print_time(opts, "Eliminated duplicate disjuncts"); + + if (verbosity_level(5)) + { + prt_error("Debug: After expression pruning and duplicate elimination:\n"); + print_disjunct_counts(sent); + } + + gword_record_in_connector(sent); + setup_connectors(sent); +} diff -Nru link-grammar-5.3.16/link-grammar/parse/preparation.h link-grammar-5.5.0/link-grammar/parse/preparation.h --- link-grammar-5.3.16/link-grammar/parse/preparation.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/parse/preparation.h 2017-05-29 08:07:13.000000000 +0000 @@ -0,0 +1,19 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#ifndef _PREPARATION_H +#define _PREPARATION_H +#include "link-includes.h" + +void prepare_to_parse(Sentence, Parse_Options); +#endif /* _PREPARATION_H */ diff -Nru link-grammar-5.3.16/link-grammar/parse/prune.c link-grammar-5.5.0/link-grammar/parse/prune.c --- link-grammar-5.3.16/link-grammar/parse/prune.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/parse/prune.c 2018-03-06 02:44:41.000000000 +0000 @@ -0,0 +1,1104 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2009, 2013, 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include "api-structures.h" +#include "connectors.h" +#include "disjunct-utils.h" +#include "dict-common/dict-common.h" // For contable +#include "post-process/post-process.h" +#include "post-process/pp-structures.h" +#include "print/print.h" // For print_disjunct_counts() + +#include "prune.h" +#include "resources.h" +#include "string-set.h" +#include "tokenize/word-structures.h" // for Word_struct +#include "tokenize/wordgraph.h" + +/* This code is not too effective and is costly for the current corpus + * batches and also for ady/amy. So maybe it should be discarded. 
*/ +//#define ALT_MUTUAL_CONSISTENCY +//#define ALT_DISJUNCT_CONSISTENCY + +#if defined(ALT_MUTUAL_CONSISTENCY) || defined(ALT_DISJUNCT_CONSISTENCY) +#endif /* ALT_MUTUAL_CONSISTENCY || ALT_DISJUNCT_CONSISTENCY */ + +#define D_PRUNE 5 + +typedef Connector * connector_table; + +/* Indicator that this connector cannot be used -- that its "obsolete". */ +#define BAD_WORD (MAX_SENTENCE+1) + +typedef struct c_list_s C_list; +struct c_list_s +{ + C_list * next; + Connector * c; + bool shallow; +}; + +typedef struct power_table_s power_table; +struct power_table_s +{ + unsigned int power_table_size; + unsigned int *l_table_size; /* the sizes of the hash tables */ + unsigned int *r_table_size; + C_list *** l_table; + C_list *** r_table; + Pool_desc *memory_pool; +}; + +typedef struct cms_struct Cms; +struct cms_struct +{ + Cms * next; + const char * name; + int count; /* the number of times this is in the multiset */ +}; + +#define CMS_SIZE (2<<10) +typedef struct multiset_table_s multiset_table; +struct multiset_table_s +{ + Cms * cms_table[CMS_SIZE]; +}; + +typedef struct prune_context_s prune_context; +struct prune_context_s +{ + bool null_links; + int power_cost; + int N_changed; /* counts the number of changes + of c->nearest_word fields in a pass */ + power_table *pt; +#ifdef ALT_DISJUNCT_CONSISTENCY + const Connector *first_connector; /* for alt disjunct consistency */ +#endif + Sentence sent; +}; + +/* + + The algorithms in this file prune disjuncts from the disjunct list + of the sentence that can be eliminated by a simple checks. The first + check works as follows: + + A series of passes are made through the sentence, alternating + left-to-right and right-to-left. Consider the left-to-right pass (the + other is symmetric). A set S of connectors is maintained (initialized + to be empty). Now the disjuncts of the current word are processed. 
+ If a given disjunct's left pointing connectors have the property that + at least one of them has no connector in S to which it can be matched, + then that disjunct is deleted. Now the set S is augmented by the right + connectors of the remaining disjuncts of that word. This completes + one word. The process continues through the words from left to right. + Alternate passes are made until no disjunct is deleted. + + It worries me a little that if there are some really huge disjuncts lists, + then this process will probably do nothing. (This fear turns out to be + unfounded.) + + Notes: Power pruning will not work if applied before generating the + "and" disjuncts. This is because certain of it's tricks don't work. + Think about this, and finish this note later.... + Also, currently I use the standard connector match procedure instead + of the pruning one, since I know power pruning will not be used before + and generation. Replace this to allow power pruning to work before + generating and disjuncts. + + Currently it seems that normal pruning, power pruning, and generation, + pruning, and power pruning (after "and" generation) and parsing take + about the same amount of time. This is why doing power pruning before + "and" generation might be a very good idea. + + New idea: Suppose all the disjuncts of a word have a connector of type + c pointing to the right. And further, suppose that there is exactly one + word to its right containing that type of connector pointing to the left. + Then all the other disjuncts on the latter word can be deleted. + (This situation is created by the processing of "either...or", and by + the extra disjuncts added to a "," neighboring a conjunction.) + +*/ + +/* + Here is what you've been waiting for: POWER-PRUNE + + The kinds of constraints it checks for are the following: + + 1) successive connectors on the same disjunct have to go to + nearer and nearer words. 
+ + 2) two deep connectors cannot attach to each other + (A connectors is deep if it is not the first in its list; it + is shallow if it is the first in its list; it is deepest if it + is the last on its list.) + + 3) on two adjacent words, a pair of connectors can be used + only if they're the deepest ones on their disjuncts + + 4) on two non-adjacent words, a pair of connectors can be used only + if not [both of them are the deepest]. + + The data structure consists of a pair of hash tables on every word. + Each bucket of a hash table has a list of pointers to connectors. + These nodes also store if the chosen connector is shallow. +*/ +/* + As with normal pruning, we make alternate left->right and right->left + passes. In the R->L pass, when we're on a word w, we make use of + all the left-pointing hash tables on the words to the right of w. + After the pruning on this word, we build the left-pointing hash table + this word. This guarantees idempotence of the pass -- after doing an + L->R, doing another would change nothing. + + Each connector has an integer c_word field. This refers to the closest + word that it could be connected to. These are initially determined by + how deep the connector is. For example, a deepest connector can connect + to the neighboring word, so its c_word field is w+1 (w-1 if this is a left + pointing connector). It's neighboring shallow connector has a c_word + value of w+2, etc. + + The pruning process adjusts these c_word values as it goes along, + accumulating information about any way of linking this sentence. + The pruning process stops only after no disjunct is deleted and no + c_word values change. + + The difference between RUTHLESS and GENTLE power pruning is simply + that GENTLE uses the deletable region array, and RUTHLESS does not. + So we can get the effect of these two different methods simply by + always ensuring that deletable[][] has been defined. With nothing + deletable, this is equivalent to RUTHLESS. 
--DS, 7/97 +*/ + +/** + * free all of the hash tables and C_lists + */ +static void power_table_delete(power_table *pt) +{ + pool_delete(pt->memory_pool); + for (WordIdx w = 0; w < pt->power_table_size; w++) + { + xfree((char *)pt->l_table[w], pt->l_table_size[w] * sizeof (C_list *)); + xfree((char *)pt->r_table[w], pt->r_table_size[w] * sizeof (C_list *)); + } + xfree(pt->l_table_size, 2 * pt->power_table_size * sizeof(unsigned int)); + xfree(pt->l_table, 2 * pt->power_table_size * sizeof(C_list **)); + xfree(pt, sizeof(power_table)); +} + +/** + * The disjunct d (whose left or right pointer points to c) is put + * into the appropriate hash table + */ +static void put_into_power_table(C_list * m, unsigned int size, C_list ** t, + Connector * c, bool shal) +{ + unsigned int h; + h = connector_uc_num(c) & (size-1); + m->next = t[h]; + t[h] = m; + m->c = c; + m->shallow = shal; +} + +/** + * Allocates and builds the initial power hash tables + */ +static power_table * power_table_new(Sentence sent) +{ + power_table *pt; + size_t w, len; + unsigned int i, size; + C_list ** t; + Disjunct * d; + Connector * c; +#define TOPSZ 32768 + size_t lr_table_max_usage = MIN(sent->dict->contable.num_con, TOPSZ); + + pt = (power_table *) xalloc (sizeof(power_table)); + pt->power_table_size = sent->length; + pt->l_table_size = xalloc (2 * sent->length * sizeof(unsigned int)); + pt->r_table_size = pt->l_table_size + sent->length; + pt->l_table = xalloc (2 * sent->length * sizeof(C_list **)); + pt->r_table = pt->l_table + sent->length; + + Pool_desc *mp = pt->memory_pool = pool_new(__func__, "C_list", + /*num_elements*/2048, sizeof(C_list), + /*zero_out*/false, /*align*/false, /*exact*/false); + + for (w=0; wlength; w++) + { + /* The below uses variable-sized hash tables. This seems to + * provide performance that is equal or better than the best + * fixed-size performance. + * The best fixed-size performance seems to come at about + * a 1K table size, for both English and Russian. 
(Both have + * about 100 fixed link-types, and many thousands of auto-genned + * link types (IDxxx idioms for both, LLxxx suffix links for + * Russian). Pluses and minuses: + * + small fixed tables are faster to initialize. + * - small fixed tables have more collisions + * - variable-size tables require counting connectors. + * (and the more complex code to go with) + * CPU cache-size effects ... + * Strong dependence on the hashing algo! + */ + len = left_connector_count(sent->word[w].d); + size = next_power_of_two_up(MIN(len, lr_table_max_usage)); + pt->l_table_size[w] = size; + t = pt->l_table[w] = (C_list **) xalloc(size * sizeof(C_list *)); + for (i=0; iword[w].d; d!=NULL; d=d->next) { + c = d->left; + if (c != NULL) { + put_into_power_table(pool_alloc(mp), size, t, c, true); + for (c=c->next; c!=NULL; c=c->next) { + put_into_power_table(pool_alloc(mp), size, t, c, false); + } + } + } + + len = right_connector_count(sent->word[w].d); + size = next_power_of_two_up(MIN(len, lr_table_max_usage)); + pt->r_table_size[w] = size; + t = pt->r_table[w] = (C_list **) xalloc(size * sizeof(C_list *)); + for (i=0; iword[w].d; d!=NULL; d=d->next) { + c = d->right; + if (c != NULL) { + put_into_power_table(pool_alloc(mp), size, t, c, true); + for (c=c->next; c!=NULL; c=c->next){ + put_into_power_table(pool_alloc(mp), size, t, c, false); + } + } + } + } + + return pt; +} + +/** + * This runs through all the connectors in this table, and eliminates those + * who are obsolete. The word fields of an obsolete one has been set to + * BAD_WORD. + */ +static void clean_table(unsigned int size, C_list ** t) +{ + unsigned int i; + C_list * m, * xm, * head; + for (i = 0; i < size; i++) { + head = NULL; + for (m = t[i]; m != NULL; m = xm) { + xm = m->next; + if (m->c->nearest_word != BAD_WORD) { + m->next = head; + head = m; + } + } + t[i] = head; + } +} + +/** + * Find if words w1 and w2 may become adjacent due to optional words. 
+ * This may happen if they contain only optional words between them. + * + * Return true iff they may become adjacent (i.e. all the words + * between them are optional). + */ +static bool optional_gap_collapse(Sentence sent, int w1, int w2) +{ + for (int w = w1+1; w < w2; w++) + if (!sent->word[w].optional) return false; + + return true; +} + +#if defined(ALT_MUTUAL_CONSISTENCY) || defined(ALT_DISJUNCT_CONSISTENCY) +static bool alt_consistency(prune_context *pc, + Connector *lc, Connector *rc, + int lword, int rword, bool lr) +{ + bool same_alternative = false; + +#ifdef ALT_MUTUAL_CONSISTENCY + /* Validate that rc and lc are from the same alternative. + * Each of the loops is of one iteration most of the times. */ + for (const gword_set *ga = lc->originating_gword; NULL != ga; ga = ga->next) { + for (const gword_set *gb = rc->originating_gword; NULL != gb; gb = gb->next) { + if (in_same_alternative(ga->o_gword, gb->o_gword)) { + same_alternative = true; + break; + } + } + if (same_alternative) break; + } +#endif /* ALT_MUTUAL_CONSISTENCY */ + +#ifdef ALT_DISJUNCT_CONSISTENCY + /* Validate that the candidate connector is not from a different + * alternative than any of the already existing connectors of the + * checked disjunct (XXX at the same side). + */ + + if (same_alternative) + { + const Connector *remote_connector = lr ? lc : rc; + const gword_set* gword_set_c = remote_connector->originating_gword; + const Connector *curr_connector = lr ? rc : lc; + +#if 0 + printf("CHECK %s F%p=%s R%p=%s:", lr ? 
"rc" : "lc", + pc->first_connector, pc->first_connector->string, + remote_connector, remote_connector->string); +#endif + for (const Connector *i = pc->first_connector; curr_connector != i; i = i->next) + { + //printf(" I%p=%s", i, i->string); + bool alt_compatible = false; + for (const gword_set *gi = i->originating_gword; NULL != gi; gi = gi->next) + { + for (const gword_set *gs = gword_set_c; NULL != gs; gs = gs->next) + { + if (in_same_alternative(gi->o_gword, gs->o_gword)) + { + alt_compatible = true; + break; + } + } + if (alt_compatible) break; + } + if (!alt_compatible) + { + same_alternative = false; +#if 0 + printf(" FALSE\n"); +#endif + break; + } + } +#if 0 + printf("\n"); +#endif + } +#endif /* ALT_DISJUNCT_CONSISTENCY */ + + if (!same_alternative) + { + lgdebug(8, "w%d=%s and w%d=%s NSA\n", + lword, lc->originating_gword->o_gword->subword, + rword, rc->originating_gword->o_gword->subword); + + return false; + } + + return same_alternative; +} +#endif /* ALT_MUTUAL_CONSISTENCY || ALT_DISJUNCT_CONSISTENCY */ + +/** + * This takes two connectors (and whether these are shallow or not) + * (and the two words that these came from) and returns TRUE if it is + * possible for these two to match based on local considerations. 
+ */ +static bool possible_connection(prune_context *pc, + Connector *lc, Connector *rc, + bool lshallow, bool rshallow, + int lword, int rword, bool lr) +{ + int dist; + if ((!lshallow) && (!rshallow)) return false; + if (!easy_match_desc(lc->desc, rc->desc)) return false; + + /* Two deep connectors can't work */ + if ((lc->nearest_word > rword) || (rc->nearest_word < lword)) return false; + + dist = rword - lword; + // assert(0 < dist, "Bad word order in possible connection."); + + /* Word range constraints */ + if (1 == dist) + { + if ((lc->next != NULL) || (rc->next != NULL)) return false; + } + else + if (dist > lc->length_limit || dist > rc->length_limit) + { + return false; + } + /* If the words are NOT next to each other, then there must be + * at least one intervening connector (i.e. cannot have both + * lc->next and rc->next being null). But we only enforce this + * when we think its still possible to have a complete parse, + * i.e. before well allow null-linked words. + */ + else + if (!pc->null_links && + (lc->next == NULL) && + (rc->next == NULL) && + (!lc->multi) && (!rc->multi) && + !optional_gap_collapse(pc->sent, lword, rword)) + { + return false; + } + +#if defined(ALT_MUTUAL_CONSISTENCY) || defined(ALT_DISJUNCT_CONSISTENCY) + if (!alt_consistency(pc, lc, rc, lword, rword, lr)) return false; +#endif + + return true; +} + +/** + * This returns TRUE if the right table of word w contains + * a connector that can match to c. shallow tells if c is shallow. 
+ */ +static bool +right_table_search(prune_context *pc, int w, Connector *c, + bool shallow, int word_c) +{ + unsigned int size, h; + C_list *cl; + power_table *pt = pc->pt; + + size = pt->r_table_size[w]; + h = connector_uc_num(c) & (size-1); + for (cl = pt->r_table[w][h]; cl != NULL; cl = cl->next) + { + if (possible_connection(pc, cl->c, c, cl->shallow, shallow, w, word_c, true)) + return true; + } + return false; +} + +/** + * This returns TRUE if the right table of word w contains + * a connector that can match to c. shallows tells if c is shallow + */ +static bool +left_table_search(prune_context *pc, int w, Connector *c, + bool shallow, int word_c) +{ + unsigned int size, h; + C_list *cl; + power_table *pt = pc->pt; + + size = pt->l_table_size[w]; + h = connector_uc_num(c) & (size-1); + for (cl = pt->l_table[w][h]; cl != NULL; cl = cl->next) + { + if (possible_connection(pc, c, cl->c, shallow, cl->shallow, word_c, w, false)) + return true; + } + return false; +} + +/** + * Take this connector list, and try to match it with the words + * w-1, w-2, w-3... Returns the word to which the first connector of + * the list could possibly be matched. If c is NULL, returns w. If + * there is no way to match this list, it returns a negative number. + * If it does find a way to match it, it updates the c->nearest_word fields + * correctly. 
+ */ +static int +left_connector_list_update(prune_context *pc, Connector *c, + int w, bool shallow) +{ + int n, lb; + bool foundmatch; + + if (c == NULL) return w; + n = left_connector_list_update(pc, c->next, w, false) - 1; + if (((int) c->nearest_word) < n) n = c->nearest_word; + + /* lb is now the leftmost word we need to check */ + lb = w - c->length_limit; + if (0 > lb) lb = 0; + + /* n is now the rightmost word we need to check */ + foundmatch = false; + for (; n >= lb ; n--) + { + pc->power_cost++; + if (right_table_search(pc, n, c, shallow, w)) + { + foundmatch = true; + break; + } + } + if (n < ((int) c->nearest_word)) + { + c->nearest_word = n; + pc->N_changed++; + } + return (foundmatch ? n : -1); +} + +/** + * Take this connector list, and try to match it with the words + * w+1, w+2, w+3... Returns the word to which the first connector of + * the list could possibly be matched. If c is NULL, returns w. If + * there is no way to match this list, it returns a number greater than + * N_words - 1. If it does find a way to match it, it updates the + * c->nearest_word fields correctly. + */ +static size_t +right_connector_list_update(prune_context *pc, Connector *c, + size_t w, bool shallow) +{ + size_t n, ub; + bool foundmatch; + Sentence sent = pc->sent; + + if (c == NULL) return w; + n = right_connector_list_update(pc, c->next, w, false) + 1; + if (c->nearest_word > n) n = c->nearest_word; + + /* ub is now the rightmost word we need to check */ + ub = w + c->length_limit; + if (ub > sent->length) ub = sent->length - 1; + + /* n is now the leftmost word we need to check */ + foundmatch = false; + for (; n <= ub ; n++) + { + pc->power_cost++; + if (left_table_search(pc, n, c, shallow, w)) + { + foundmatch = true; + break; + } + } + if (n > c->nearest_word) { + c->nearest_word = n; + pc->N_changed++; + } + return (foundmatch ? 
n : sent->length); +} + +/** The return value is the number of disjuncts deleted */ +int power_prune(Sentence sent, Parse_Options opts) +{ + power_table *pt; + prune_context *pc; + Disjunct *d, *free_later, *dx, *nd; + Connector *c; + size_t N_deleted, total_deleted; + size_t w; + + pc = (prune_context *) xalloc (sizeof(prune_context)); + pc->power_cost = 0; + pc->null_links = (opts->min_null_count > 0); + pc->N_changed = 1; /* forces it always to make at least two passes */ + + pc->sent = sent; + + pt = power_table_new(sent); + pc->pt = pt; + + free_later = NULL; + N_deleted = 0; + + total_deleted = 0; + + while (1) + { + /* left-to-right pass */ + for (w = 0; w < sent->length; w++) { + for (d = sent->word[w].d; d != NULL; d = d->next) { + if (d->left == NULL) continue; +#ifdef ALT_DISJUNCT_CONSISTENCY + pc->first_connector = d->left; +#endif + if (left_connector_list_update(pc, d->left, w, true) < 0) { + for (c=d->left; c != NULL; c = c->next) c->nearest_word = BAD_WORD; + for (c=d->right; c != NULL; c = c->next) c->nearest_word = BAD_WORD; + N_deleted++; + total_deleted++; + } + } + + clean_table(pt->r_table_size[w], pt->r_table[w]); + nd = NULL; + for (d = sent->word[w].d; d != NULL; d = dx) { + dx = d->next; + if ((d->left != NULL) && (d->left->nearest_word == BAD_WORD)) { + d->next = free_later; + free_later = d; + } else { + d->next = nd; + nd = d; + } + } + sent->word[w].d = nd; + } + lgdebug(D_PRUNE, "Debug: l->r pass changed %d and deleted %zu\n", + pc->N_changed, N_deleted); + + if (pc->N_changed == 0) break; + + pc->N_changed = N_deleted = 0; + /* right-to-left pass */ + + for (w = sent->length-1; w != (size_t) -1; w--) { + for (d = sent->word[w].d; d != NULL; d = d->next) { + if (d->right == NULL) continue; +#ifdef ALT_DISJUNCT_CONSISTENCY + pc->first_connector = d->right; +#endif + if (right_connector_list_update(pc, d->right, w, true) >= sent->length) { + for (c=d->right; c != NULL; c = c->next) c->nearest_word = BAD_WORD; + for (c=d->left; c != 
NULL; c = c->next) c->nearest_word = BAD_WORD; + N_deleted++; + total_deleted++; + } + } + clean_table(pt->l_table_size[w], pt->l_table[w]); + nd = NULL; + for (d = sent->word[w].d; d != NULL; d = dx) { + dx = d->next; + if ((d->right != NULL) && (d->right->nearest_word == BAD_WORD)) { + d->next = free_later; + free_later = d; + } else { + d->next = nd; + nd = d; + } + } + sent->word[w].d = nd; + } + + lgdebug(D_PRUNE, "Debug: r->l pass changed %d and deleted %zu\n", + pc->N_changed, N_deleted); + + if (pc->N_changed == 0) break; + pc->N_changed = N_deleted = 0; + } + free_disjuncts(free_later); + power_table_delete(pt); + pt = NULL; + pc->pt = NULL; + + lgdebug(D_PRUNE, "Debug: power prune cost: %d\n", pc->power_cost); + + print_time(opts, "power pruned"); + if (verbosity_level(D_PRUNE)) + { + prt_error("\n\\"); + prt_error("Debug: After power_pruning:\n\\"); + print_disjunct_counts(sent); + } + + xfree(pc, sizeof(prune_context)); + return total_deleted; +} + +/* =================================================================== + PP Pruning + + The "contains one" post-processing constraints give us a new way to + prune. Suppose there's a rule that says "a group that contains foo + must contain a bar or a baz." Here foo, bar, and baz are connector + types. foo is the trigger link, bar and baz are called the criterion + links. If, after considering the disjuncts we find that there is is + a foo, but neither a bar, nor a baz, then we can eliminate the disjunct + containing bar. + + Things are actually a bit more complex, because of the matching rules + and subscripts. The problem is that post-processing deals with link + names, while at this point all we have to work with is connector + names. Consider the foo part. Consider a connector C. When does + foo match C for our purposes? It matches it if every possible link + name L (that can result from C being at one end of that link) results + in post_process_match(foo,L) being true. Suppose foo contains a "*". 
+ Then there is no C that has this property. This is because the *s in + C may be replaced by some other subscripts in the construction of L. + And the non-* chars in L will not post_process_match the * in foo. + + So let's assume that foo has no *. Now the result we want is simply + given by post_process_match(foo, C). Proof: L is the same as C but + with some *s replaced by some other letters. Since foo contains no * + the replacement in C of some * by some other letter could change + post_process_match from FALSE to TRUE, but not vice versa. Therefore + it's conservative to use this test. + + For the criterion parts, we need to determine if there is a + collection of connectors C1, C2,... such that by combining them you + can get a link name L that post_process_matches bar or baz. Here's a + way to do this. Say bar="Xabc". Then we see if there are connector + names that post_process_match "Xa##", "X#b#", and "X##c". They must + all be there in order for it to be possible to create a link name + "Xabc". A "*" in the criterion part is a little different. In this + case we can simply skip the * (treat it like an upper case letter) + for this purpose. So if bar="X*ab" then we look for "X*#b" and + "X*a#". (The * in this case could be treated the same as another + subscript without breaking it.) Note also that it's only necessary + to find a way to match one of the many criterion links that may be in + the rule. If none can match, then we can delete the disjunct + containing C. + + Here's how we're going to implement this. We'll maintain a multiset + of connector names. We'll represent them in a hash table, where the + hash function uses only the upper case letters of the connector name. + We'll insert all the connectors into the multiset. The multiset will + support the operation of deletion (probably simplest to just + decrement the count). Here's the algorithm. + + Insert all the connectors into M. 
+ + While the previous pass caused a count to go to 0 do: + For each connector C do + For each rule R do + if C is a trigger for R and the criterion links + of the rule cannot be satisfied by the connectors in + M, Then: + We delete C's disjunct. But before we do, + we remove all the connectors of this disjunct + from the multiset. Keep tabs on whether or not + any of the counts went to 0. + + + + Efficiency hacks to be added later: + Note for a given rule can become less and less satisfiable. + That is, rule_satisfiable(r) for a given rule r can change from + TRUE to FALSE, but not vice versa. So once it's FALSE, we can just + remember that. + + Consider the effect of a pass p on the set of rules that are + satisfiable. Suppose this set does not change. Then pass p+1 + will do nothing. This is true even if pass p caused some + disjuncts to be deleted. (This observation will only obviate + the need for the last pass.) + + */ + +static multiset_table * cms_table_new(void) +{ + multiset_table *mt; + int i; + + mt = (multiset_table *) xalloc(sizeof(multiset_table)); + + for (i=0; icms_table[i] = NULL; + } + return mt; +} + +static void cms_table_delete(multiset_table *mt) +{ + Cms * cms, *xcms; + int i; + for (i=0; icms_table[i]; cms != NULL; cms = xcms) + { + xcms = cms->next; + xfree(cms, sizeof(Cms)); + } + } + xfree(mt, sizeof(multiset_table)); +} + +static unsigned int cms_hash(const char * s) +{ + unsigned int i = 5381; + if (islower((int) *s)) s++; /* skip head-dependent indicator */ + while (isupper((int) *s)) + { + i = ((i << 5) + i) + *s; + s++; + } + return (i & (CMS_SIZE-1)); +} + +/** + * This returns TRUE if there is a connector name C in the table + * such that post_process_match(pp_match_name, C) is TRUE + */ +static bool match_in_cms_table(multiset_table *cmt, const char * pp_match_name) +{ + Cms * cms; + for (cms = cmt->cms_table[cms_hash(pp_match_name)]; cms != NULL; cms = cms->next) + { + if (post_process_match(pp_match_name, cms->name)) return 
true; + } + return false; +} + +static Cms * lookup_in_cms_table(multiset_table *cmt, const char * str) +{ + Cms * cms; + for (cms = cmt->cms_table[cms_hash(str)]; cms != NULL; cms = cms->next) + { + if (string_set_cmp(str, cms->name)) return cms; + } + return NULL; +} + +static void insert_in_cms_table(multiset_table *cmt, const char * str) +{ + Cms * cms; + unsigned int h; + cms = lookup_in_cms_table(cmt, str); + if (cms != NULL) { + cms->count++; + } else { + cms = (Cms *) xalloc(sizeof(Cms)); + cms->name = str; /* don't copy the string...just keep a pointer to it. + we won't free these later */ + cms->count = 1; + h = cms_hash(str); + cms->next = cmt->cms_table[h]; + cmt->cms_table[h] = cms; + } +} + +/** + * Delete the given string from the table. Return TRUE if + * this caused a count to go to 0, return FALSE otherwise. + */ +static bool delete_from_cms_table(multiset_table *cmt, const char * str) +{ + Cms * cms = lookup_in_cms_table(cmt, str); + if (cms != NULL && cms->count > 0) + { + cms->count--; + return (cms->count == 0); + } + return false; +} + +static bool rule_satisfiable(multiset_table *cmt, pp_linkset *ls) +{ + unsigned int hashval; + const char * t; + char name[20], *s; + pp_linkset_node *p; + int bad, n_subscripts; + + for (hashval = 0; hashval < ls->hash_table_size; hashval++) + { + for (p = ls->hash_table[hashval]; p!=NULL; p=p->next) + { + /* ok, we've got our hands on one of the criterion links */ + strncpy(name, p->str, sizeof(name)-1); + /* could actually use the string in place because we change it back */ + name[sizeof(name)-1] = '\0'; + /* now we want to see if we can satisfy this criterion link */ + /* with a collection of the links in the cms table */ + + s = name; + if (islower((int)*s)) s++; /* skip head-dependent indicator */ + for (; isupper((int)*s); s++) {} + for (;*s != '\0'; s++) if (*s != '*') *s = '#'; + + s = name; + t = p->str; + if (islower((int)*s)) s++; /* skip head-dependent indicator */ + if (islower((int)*t)) t++; /* 
skip head-dependent indicator */ + for (; isupper((int) *s); s++, t++) {} + + /* s and t remain in lockstep */ + bad = 0; + n_subscripts = 0; + for (;*s != '\0' && bad==0; s++, t++) { + if (*s == '*') continue; + n_subscripts++; + /* after the upper case part, and is not a * so must be a regular subscript */ + *s = *t; + if (!match_in_cms_table(cmt, name)) bad++; + *s = '#'; + } + + if (n_subscripts == 0) { + /* now we handle the special case which occurs if there + were 0 subscripts */ + if (!match_in_cms_table(cmt, name)) bad++; + } + + /* now if bad==0 this criterion link does the job + to satisfy the needs of the trigger link */ + + if (bad == 0) return true; + } + } + return false; +} + +static void delete_unmarked_disjuncts(Sentence sent) +{ + size_t w; + Disjunct *d_head, *d, *dx; + + for (w=0; wlength; w++) { + d_head = NULL; + for (d=sent->word[w].d; d != NULL; d=dx) { + dx = d->next; + if (d->marked) { + d->next = d_head; + d_head = d; + } else { + d->next = NULL; + free_disjuncts(d); + } + } + sent->word[w].d = d_head; + } +} + +static int pp_prune(Sentence sent, Parse_Options opts) +{ + pp_knowledge * knowledge; + size_t i, w; + int total_deleted, N_deleted; + bool change, deleteme; + multiset_table *cmt; + + if (sent->postprocessor == NULL) return 0; + if (!opts->perform_pp_prune) return 0; + + knowledge = sent->postprocessor->knowledge; + + cmt = cms_table_new(); + + for (w = 0; w < sent->length; w++) + { + Disjunct *d; + for (d = sent->word[w].d; d != NULL; d = d->next) + { + char dir; + d->marked = true; + for (dir=0; dir < 2; dir++) + { + Connector *c; + for (c = ((dir) ? 
(d->left) : (d->right)); c != NULL; c = c->next) + { + insert_in_cms_table(cmt, connector_string(c)); + } + } + } + } + + total_deleted = 0; + change = true; + while (change) + { + char dir; + + change = false; + N_deleted = 0; + for (w = 0; w < sent->length; w++) + { + Disjunct *d; + for (d = sent->word[w].d; d != NULL; d = d->next) + { + if (!d->marked) continue; + deleteme = false; + for (dir = 0; dir < 2; dir++) + { + Connector *c; + for (c = ((dir) ? (d->left) : (d->right)); c != NULL; c = c->next) + { + for (i = 0; i < knowledge->n_contains_one_rules; i++) + { + pp_rule* rule = &knowledge->contains_one_rules[i]; /* the ith rule */ + const char * selector = rule->selector; /* selector string for this rule */ + pp_linkset * link_set = rule->link_set; /* the set of criterion links */ + + if (strchr(selector, '*') != NULL) continue; /* If it has a * forget it */ + + if (!post_process_match(selector, connector_string(c))) continue; + + /* + printf("pp_prune: trigger ok. selector = %s c->string = %s\n", selector, c->string); + */ + + /* We know c matches the trigger link of the rule. */ + /* Now check the criterion links */ + + if (!rule_satisfiable(cmt, link_set)) + { + deleteme = true; + rule->use_count++; + } + if (deleteme) break; + } + if (deleteme) break; + } + if (deleteme) break; + } + + if (deleteme) /* now we delete this disjunct */ + { + N_deleted++; + total_deleted++; + d->marked = false; /* mark for deletion later */ + for (dir=0; dir < 2; dir++) + { + Connector *c; + for (c = ((dir) ? 
(d->left) : (d->right)); c != NULL; c = c->next) + { + change |= delete_from_cms_table(cmt, connector_string(c)); + } + } + } + } + } + + lgdebug(D_PRUNE, "Debug: pp_prune pass deleted %d\n", N_deleted); + } + delete_unmarked_disjuncts(sent); + cms_table_delete(cmt); + + if ((0 != N_deleted) && verbosity_level(D_PRUNE)) + { + prt_error("\n\\"); + prt_error("Debug: After pp_pruning:\n\\"); + print_disjunct_counts(sent); + } + + print_time(opts, "pp pruning"); + + return total_deleted; +} + + +/** + * Do the following pruning steps until nothing happens: + * power pp power pp power pp.... + * Make sure you do them both at least once. + */ +void pp_and_power_prune(Sentence sent, Parse_Options opts) +{ + power_prune(sent, opts); + pp_prune(sent, opts); + + return; + + // Not reached. We can actually gain a few percent of + // performance be skipping the loop below. Mostly, it just + // does a lot of work, and pretty much finds nothing. + // And so we skip it. +#ifdef ONLY_IF_YOU_THINK_THIS_IS_WORTH_IT + for (;;) { + if (pp_prune(sent, opts) == 0) break; + if (power_prune(sent, opts) == 0) break; + } +#endif +} diff -Nru link-grammar-5.3.16/link-grammar/parse/prune.h link-grammar-5.5.0/link-grammar/parse/prune.h --- link-grammar-5.3.16/link-grammar/parse/prune.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/parse/prune.h 2017-05-29 08:07:13.000000000 +0000 @@ -0,0 +1,21 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#ifndef _PRUNE_H +#define _PRUNE_H + +#include "link-includes.h" + +int power_prune(Sentence, Parse_Options); +void pp_and_power_prune(Sentence, Parse_Options); + +#endif /* _PRUNE_H */ diff -Nru link-grammar-5.3.16/link-grammar/post-process/constituents.c link-grammar-5.5.0/link-grammar/post-process/constituents.c --- link-grammar-5.3.16/link-grammar/post-process/constituents.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/post-process/constituents.c 2018-01-24 01:57:32.000000000 +0000 @@ -0,0 +1,1335 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#include +#include + +#include "api-structures.h" +#include "dict-common/dict-defines.h" // For RIGHT_WALL_WORD +#include "dict-common/dict-common.h" // For Dictionary_s +#include "dict-common/dict-defines.h" // For MAX_WORD +#include "error.h" +#include "linkage/linkage.h" +#include "post-process/post-process.h" +#include "post-process/pp-structures.h" +#include "string-set.h" +#include "utilities.h" + +#define D_CONST 8 /* debug level for this file */ + +#define OPEN_BRACKET '[' +#define CLOSE_BRACKET ']' + +typedef enum {OPEN_TOK, CLOSE_TOK, WORD_TOK} CType; +typedef enum {NONE, STYPE, PTYPE, QTYPE, QDTYPE} WType; + +typedef struct +{ + const char * type; + const char * start_link; + size_t left; /* leftmost word */ + size_t right; /* rightmost word */ + int canon; + bool valid; + char domain_type; +} constituent_t; + +/* + * Context used to store assorted intermediate data + * when the constituent string is being generated. + */ +typedef struct +{ + String_set * phrase_ss; + WType * wordtype; + constituent_t * constituent; + int conlen; +} con_context_t; + + +typedef struct CNode_s CNode; + +/* Invariant: Leaf if child==NULL */ +struct CNode_s +{ + char * label; + CNode * child; + CNode * next; + int start, end; +}; + +/* ================================================================ */ + +/** + * @param s A Label + * @param t A Label + * @return True if the uppercase parts of s and t are equal, else false. + */ +static bool uppercompare(const char * s, const char * t) +{ + while (isupper(*s) || isupper(*t)) + { + if (*s++ != *t++) return false; + } + return true; +} + +/** + * If a constituent c has a comma at either end, we exclude the + * comma. 
+ */ +static void adjust_for_left_comma(con_context_t * ctxt, Linkage linkage, int c) +{ + int w; + w = ctxt->constituent[c].left; + if (strcmp(linkage->word[w], ",") == 0) + w++; + ctxt->constituent[c].left = w; +} + +static void adjust_for_right_comma(con_context_t *ctxt, Linkage linkage, int c) +{ + int w; + w = ctxt->constituent[c].right; + if ((strcmp(linkage->word[w], ",") == 0) || + (strcmp(linkage->word[w], RIGHT_WALL_WORD) == 0)) + { + w--; + } + ctxt->constituent[c].right = w; +} + +static void print_constituent(con_context_t *ctxt, Linkage linkage, int c) +{ + size_t w; + + err_msg(lg_Debug, " c %2d %4s [%c] (%2zu-%2zu): ", + c, ctxt->constituent[c].type, ctxt->constituent[c].domain_type, + ctxt->constituent[c].left, ctxt->constituent[c].right); + for (w = ctxt->constituent[c].left; w <= ctxt->constituent[c].right; w++) { + err_msg(lg_Debug, "%s ", linkage->word[w]); /**PV**/ + } + err_msg(lg_Debug, "\n"); +} + +/****************************************************** + * These functions do the bulk of the actual + * constituent-generating; they're called once. + *********************************************************/ + +typedef enum +{ + CASE_S=1, + CASE_UNUSED=2, /* XXX not used anywhere... */ + CASE_REL_CLAUSE=3, + CASE_APPOS=4, + CASE_OPENER=5, + CASE_PPOPEN=6, + CASE_SVINV=7, + CASE_PART_MOD=8, + CASE_PART_OPEN=9, + +} case_type; + +/** + * This function looks for constituents of type ctype1. Say it finds + * one, call it c1. It searches for the next larger constituent of + * type ctype2, call it c2. It then generates a new constituent of + * ctype3, containing all the words in c2 but not c1. 
+ */ +static int gen_comp(con_context_t *ctxt, Linkage linkage, + int numcon_total, int numcon_subl, + const char * ctype1, const char * ctype2, + const char * ctype3, case_type x) +{ + size_t w, w2, w3; + int c, c1, c2; + bool done; + c = numcon_total + numcon_subl; + + for (c1=numcon_total; c1constituent[c1].start_link)==0)) + continue; + +#ifdef REVIVE_DEAD_CODE + /* If ctype1 is X, and domain_type is t, it's an infinitive - skip it */ + if ((x==CASE_UNUSED) && (ctxt->constituent[c1].domain_type=='t')) + continue; +#endif /* REVIVE_DEAD_CODE */ + + /* If it's domain-type z, it's a subject-relative clause; + the VP doesn't need an NP */ + if (ctxt->constituent[c1].domain_type=='z') + continue; + + /* If ctype1 is X or VP, and it's not started by an S, don't generate an NP + (Neither of the two previous checks are necessary now, right?) */ +#ifdef REVIVE_DEAD_CODE + /* use this ... if ((x==CASE_S || x==CASE_UNUSED) && */ +#endif /* REVIVE_DEAD_CODE */ + if ((x==CASE_S) && + (((post_process_match("S", ctxt->constituent[c1].start_link) == 0) && + (post_process_match("SX", ctxt->constituent[c1].start_link) == 0) && + (post_process_match("SF", ctxt->constituent[c1].start_link) == 0)) || + (post_process_match("S##w", ctxt->constituent[c1].start_link) != 0))) + continue; + + /* If it's an SBAR (relative clause case), it has to be a relative clause */ + if ((x==CASE_REL_CLAUSE) && + ((post_process_match("Rn", ctxt->constituent[c1].start_link) == 0) && + (post_process_match("R*", ctxt->constituent[c1].start_link) == 0) && + (post_process_match("MX#r", ctxt->constituent[c1].start_link) == 0) && + (post_process_match("Mr", ctxt->constituent[c1].start_link) == 0) && + (post_process_match("MX#d", ctxt->constituent[c1].start_link) == 0))) + continue; + + /* If ctype1 is SBAR (clause opener case), it has to be an f domain */ + if ((x==CASE_OPENER) && (ctxt->constituent[c1].domain_type!='f')) + continue; + + /* If ctype1 is SBAR (pp opener case), it has to be a g domain */ + if 
((x==CASE_PPOPEN) && (ctxt->constituent[c1].domain_type!='g')) + continue; + + /* If ctype1 is NP (paraphrase case), it has to be started by an SI */ + if ((x==CASE_SVINV) && (post_process_match("SI", ctxt->constituent[c1].start_link)==0)) + continue; + + /* If ctype1 is VP (participle modifier case), it has to be + started by an Mv or Mg */ + if ((x==CASE_PART_MOD) && (post_process_match("M", ctxt->constituent[c1].start_link)==0)) + continue; + + /* If ctype1 is VP (participle opener case), it has + to be started by a COp */ + if ((x==CASE_PART_OPEN) && (post_process_match("COp", ctxt->constituent[c1].start_link)==0)) + continue; + + /* Now start at the bounds of c1, and work outwards until you + find a larger constituent of type ctype2 */ + if (!(strcmp(ctxt->constituent[c1].type, ctype1)==0)) + continue; + + if (verbosity_level(D_CONST)) + err_msg(lg_Debug, "Generating complement constituent for c %d of type %s\n\\", + c1, ctype1); + done = false; + for (w2 = ctxt->constituent[c1].left; (done == false) && (w2 != (size_t)-1); w2--) + { + for (w3 = ctxt->constituent[c1].right; w3num_words; w3++) + { + for (c2 = numcon_total; (done == false) && + (c2 < numcon_total + numcon_subl); c2++) { + if (!((ctxt->constituent[c2].left == w2) && + (ctxt->constituent[c2].right == w3)) || (c2==c1)) + continue; + if (!(strcmp(ctxt->constituent[c2].type, ctype2)==0)) + continue; + + /* if the new constituent (c) is to the left + of c1, its right edge should be adjacent to the + left edge of c1 - or as close as possible. 
*/ + if ((x==CASE_OPENER) || (x==CASE_PPOPEN) || (x==CASE_PART_OPEN)) + { + /* This is the case where c is to the + RIGHT of c1 */ + w = ctxt->constituent[c1].right + 1; + if (w > ctxt->constituent[c2].right) + { + done = true; + continue; + } + ctxt->constituent[c].left = w; + ctxt->constituent[c].right = ctxt->constituent[c2].right; + } + else + { + w = ctxt->constituent[c1].left - 1; + if (w < ctxt->constituent[c2].left) { + done = true; + continue; + } + ctxt->constituent[c].right = w; + ctxt->constituent[c].left = ctxt->constituent[c2].left; + } + + adjust_for_left_comma(ctxt, linkage, c1); + adjust_for_right_comma(ctxt, linkage, c1); + + ctxt->constituent[c].type = + string_set_add(ctype3, ctxt->phrase_ss); + ctxt->constituent[c].domain_type = 'x'; + ctxt->constituent[c].start_link = + string_set_add("XX", ctxt->phrase_ss); + if (verbosity_level(D_CONST)) + { + err_msg(lg_Debug, "Larger c found: c %d (%s); ", c2, ctype2); + err_msg(lg_Debug, "Adding constituent:\n\\"); + print_constituent(ctxt, linkage, c); + } + c++; + assert (c < ctxt->conlen, "Too many constituents"); + done = true; + } + } + } + if (verbosity_level(D_CONST)) + { + if (done == false) + err_msg(lg_Debug, "No constituent added, because no larger %s" + " was found\n", ctype2); + else + lg_error_flush(); + } + } + numcon_subl = c - numcon_total; + return numcon_subl; +} + +/** + * Look for a constituent started by an MVs or MVg. + * Find any VP's or ADJP's that contain it (without going + * beyond a larger S or NP). Adjust them so that + * they end right before the m domain starts. 
+ */ +static void adjust_subordinate_clauses(con_context_t *ctxt, Linkage linkage, + int numcon_total, + int numcon_subl) +{ + int c, c2; + size_t w, w2; + bool done; + + for (c=numcon_total; cconstituent[c].start_link) == 1) || + (post_process_match("MVg", ctxt->constituent[c].start_link) == 1)) + { + done = false; + for (w2 = ctxt->constituent[c].left-1; (false == done) && w2 != (size_t) -1; w2--) + { + for (c2 = numcon_total; c2 < numcon_total + numcon_subl; c2++) + { + if (!((ctxt->constituent[c2].left == w2) && + (ctxt->constituent[c2].right >= ctxt->constituent[c].right))) + continue; + if ((strcmp(ctxt->constituent[c2].type, "S") == 0) || + (strcmp(ctxt->constituent[c2].type, "NP") == 0)) { + done = true; + break; + } + if ((ctxt->constituent[c2].domain_type == 'v') || + (ctxt->constituent[c2].domain_type == 'a')) + { + w = ctxt->constituent[c].left - 1; + ctxt->constituent[c2].right = w; + + if (verbosity_level(D_CONST)) + { + err_msg(lg_Debug, "Adjusting constituent %d:\n\\", c2); + print_constituent(ctxt, linkage, c2); + } + } + } + } + if (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0) + ctxt->constituent[c].left++; + } + } +} + +/****************************************************** + * These functions are called once, after constituents + * have been generated, to merge them together and fix up + * some other things. 
+ * + ********************************************************/ + +static int merge_constituents(con_context_t *ctxt, Linkage linkage, int numcon_total) +{ + int c1, c2=0; + + /* First go through and give each constituent a canonical number + (the index number of the lowest-numbered constituent + identical to it) */ + for (c1 = 0; c1 < numcon_total; c1++) + { + ctxt->constituent[c1].valid = true; + ctxt->constituent[c1].canon = c1; + for (c2 = c1 + 1; c2 < numcon_total; c2++) + { + if ((ctxt->constituent[c1].left == ctxt->constituent[c2].left) && + (ctxt->constituent[c1].right == ctxt->constituent[c2].right) && + (strcmp(ctxt->constituent[c1].type, ctxt->constituent[c2].type) == 0)) + { + ctxt->constituent[c2].canon = c1; + } + } + } + + /* Now go through and find duplicates; if a pair is found, + * mark one as invalid. + */ + for (c1 = 0; c1 < numcon_total; c1++) + { + for (c2 = c1 + 1; c2 < numcon_total; c2++) + { + if (ctxt->constituent[c2].canon == ctxt->constituent[c1].canon) + ctxt->constituent[c2].valid = false; + } + } + + return numcon_total; +} + +/** + * Go through all the words. If a word is on the right end of + * an S (or SF or SX), wordtype[w]=STYPE. If it's also on the left end of a + * Pg*b, I, PP, or Pv, wordtype[w]=PTYPE. If it's a question-word + * used in an indirect question, wordtype[w]=QTYPE. If it's a + * question-word determiner, wordtype[w]=QDTYPE. Else wordtype[w]=NONE. + * (This function is called once.) 
+ */ +static void generate_misc_word_info(con_context_t * ctxt, Linkage linkage) +{ + size_t w1, w2, l1, l2; + const char * label1, * label2; + + for (w1 = 0; w1 < linkage->num_words; w1++) + ctxt->wordtype[w1] = NONE; + + for (l1 = 0; l1 < linkage_get_num_links(linkage); l1++) { + w1=linkage_get_link_rword(linkage, l1); + label1 = linkage_get_link_label(linkage, l1); + if ((uppercompare(label1, "S")) || + (uppercompare(label1, "SX")) || + (uppercompare(label1, "SF"))) { + ctxt->wordtype[w1] = STYPE; + for (l2 = 0; l2 < linkage_get_num_links(linkage); l2++) { + w2=linkage_get_link_lword(linkage, l2); + label2 = linkage_get_link_label(linkage, l2); + if ((w1 == w2) && + ((post_process_match("Pg#b", label2)==1) || + (uppercompare(label2, "I")) || + (uppercompare(label2, "PP")) || + (post_process_match("Pv", label2)==1))) { + /* Pvf, Pgf? */ + ctxt->wordtype[w1] = PTYPE; + } + } + } + if (post_process_match("QI#d", label1)==1) { + ctxt->wordtype[w1] = QTYPE; + for (l2 = 0; l2 < linkage_get_num_links(linkage); l2++) { + w2 = linkage_get_link_lword(linkage, l2); + label2 = linkage_get_link_label(linkage, l2); + if ((w1 == w2) && (post_process_match("D##w", label2)==1)) { + ctxt->wordtype[w1] = QDTYPE; + } + } + } + if (post_process_match("Mr", label1)==1) ctxt->wordtype[w1] = QDTYPE; + if (post_process_match("MX#d", label1)==1) ctxt->wordtype[w1] = QDTYPE; + } +} + +static int new_style_conjunctions(con_context_t *ctxt, Linkage linkage, int numcon_total) +{ +#ifdef DEBUG + int c; + for (c = 0; c < numcon_total; c++) + { + constituent_t *ct = &ctxt->constituent[c]; + lgdebug(6, "ola %d valid=%d %s start=%s lr=%zu %zu\n", c, + ct->valid, ct->type, ct->start_link, ct->left, ct->right); + } +#endif + return numcon_total; +} + +static int last_minute_fixes(con_context_t *ctxt, Linkage linkage, int numcon_total) +{ + int c; + bool global_leftend_found, global_rightend_found; + size_t lastword; + + for (c = 0; c < numcon_total; c++) + { + /* In a paraphrase construction ("John 
ran, he said"), + the paraphrasing clause doesn't get + an S. (This is true in Treebank II, not Treebank I) */ + + if (uppercompare(ctxt->constituent[c].start_link, "CP")) + { + ctxt->constituent[c].valid = false; + } + + /* If it's a possessive with an "'s", the NP on the left + should be extended to include the "'s". */ + if ((uppercompare(ctxt->constituent[c].start_link, "YS")) || + (uppercompare(ctxt->constituent[c].start_link, "YP"))) + { + ctxt->constituent[c].right++; + } + + /* If a constituent has starting link MVpn, it's a time + expression like "last week"; label it as a noun phrase + (incorrectly) */ + + if (strcmp(ctxt->constituent[c].start_link, "MVpn") == 0) + { + ctxt->constituent[c].type = string_set_add("NP", ctxt->phrase_ss); + } + if (strcmp(ctxt->constituent[c].start_link, "COn") == 0) + { + ctxt->constituent[c].type = string_set_add("NP", ctxt->phrase_ss); + } + if (strcmp(ctxt->constituent[c].start_link, "Mpn") == 0) + { + ctxt->constituent[c].type = string_set_add("NP", ctxt->phrase_ss); + } + + /* If the constituent is an S started by "but" or "and" at + the beginning of the 67411142sentence, it should be ignored. */ + + if ((strcmp(ctxt->constituent[c].start_link, "Wdc") == 0) && + (ctxt->constituent[c].left == 2)) + { + ctxt->constituent[c].valid = false; + } + + /* For prenominal adjectives, an ADJP constituent is assigned + if it's a hyphenated (Ah) or comparative (Am) adjective; + otherwise no ADJP is assigned, unless the phrase is more + than one word long (e.g. "very big"). The same with certain + types of adverbs. */ + /* That was for Treebank I. For Treebank II, the rule only + seems to apply to prenominal adjectives (of all kinds). + However, it also applies to number expressions ("QP"). 
*/ + + if ((post_process_match("A", ctxt->constituent[c].start_link) == 1) || + (ctxt->constituent[c].domain_type == 'd') || + (ctxt->constituent[c].domain_type == 'h')) { + if (ctxt->constituent[c].right-ctxt->constituent[c].left == 0) + { + ctxt->constituent[c].valid = false; + } + } + + if ((ctxt->constituent[c].domain_type == 'h') && + (strcmp(linkage->word[ctxt->constituent[c].left - 1], "$") == 0)) + { + ctxt->constituent[c].left--; + } + } + + /* If there's a global S constituent that includes everything + except a final terminating punctuation (period or question mark), + extend it by one word. We know its the terminating punctuation, + because it links to the right wall with an RW link. If its + not, then that final link is not there... + */ + for (c = 0; c < numcon_total; c++) + { + if ((ctxt->constituent[c].right == linkage->num_words - 3) && + (ctxt->constituent[c].left == 1) && + (strcmp(ctxt->constituent[c].type, "S") == 0)) + { + size_t ln; + for (ln = 0; ln < linkage->num_links; ln++) + { + if ((linkage->link_array[ln].lw == linkage->num_words - 2) && + (linkage->link_array[ln].rw == linkage->num_words - 1)) + { + ctxt->constituent[c].right++; + break; + } + } + } + } + + /* If there's no S boundary at the very left end of the sentence, + or the very right end, create a new S spanning the entire sentence */ + + lastword = linkage->num_words - 2; + global_leftend_found = false; + global_rightend_found = false; + for (c = 0; c < numcon_total; c++) + { + if ((ctxt->constituent[c].left == 1) && + (strcmp(ctxt->constituent[c].type, "S") == 0) && + ctxt->constituent[c].valid) + { + global_leftend_found = true; + } + } + + for (c = 0; c < numcon_total; c++) + { + if ((ctxt->constituent[c].right >= lastword) && + (strcmp(ctxt->constituent[c].type, "S") == 0) && + ctxt->constituent[c].valid) + { + global_rightend_found = true; + } + } + + if ((global_leftend_found == false) || (global_rightend_found == false)) + { + c = numcon_total; + 
ctxt->constituent[c].left = 1; + ctxt->constituent[c].right = linkage->num_words-1; + ctxt->constituent[c].type = string_set_add("S", ctxt->phrase_ss); + ctxt->constituent[c].valid = true; + ctxt->constituent[c].domain_type = 'x'; + numcon_total++; + if (verbosity_level(D_CONST)) + { + err_msg(lg_Debug, "Adding global sentence constituent:\n\\"); + print_constituent(ctxt, linkage, c); + } + } + + return numcon_total; +} + +static int add_constituent(con_context_t *ctxt, int c, const Linkage linkage, + const Domain *domain, + int l, int r, const char * name) +{ + int nwords = linkage->num_words-2; + c++; + + /* Avoid running off end, to walls. */ + if (l < 1) l=1; + if (r > nwords) r = nwords; + if (l > nwords) l = nwords; + assert(l <= r, "negative constituent length!" ); + + ctxt->constituent[c].type = string_set_add(name, ctxt->phrase_ss); + ctxt->constituent[c].left = l; + ctxt->constituent[c].right = r; + ctxt->constituent[c].domain_type = domain->type; + ctxt->constituent[c].start_link = + linkage_get_link_label(linkage, domain->start_link); + return c; +} + +static const char * cons_of_domain(const Linkage linkage, char domain_type) +{ + switch (domain_type) { + case 'a': + return "ADJP"; + case 'b': + return "SBAR"; + case 'c': + return "VP"; + case 'd': + return "QP"; + case 'e': + return "ADVP"; + case 'f': + return "SBAR"; + case 'g': + return "PP"; + case 'h': + return "QP"; + case 'i': + return "ADVP"; + case 'k': + return "PRT"; + case 'n': + return "NP"; + case 'p': + return "PP"; + case 'q': + return "SINV"; + case 's': + return "S"; + case 't': + return "VP"; + case 'u': + return "ADJP"; + case 'v': + return "VP"; + case 'y': + return "NP"; + case 'z': + return "VP"; + default: + { + err_ctxt ec = { linkage->sent }; + err_msgc(&ec, lg_Error, "Illegal domain: %c\n", domain_type); + return ""; + } + } +} + +static int read_constituents_from_domains(con_context_t *ctxt, Linkage linkage, + int numcon_total) +{ + size_t d, l, w2; + int c, w, c2, 
numcon_subl = 0; + PP_data *pp_data = &linkage->sent->constituent_pp->pp_data; + + for (d = 0, c = numcon_total; d < pp_data->N_domains; d++, c++) + { + size_t leftmost, rightmost, leftlimit; + int rootleft; + List_o_links * dlink; + + Domain domain = pp_data->domain_array[d]; + + // rootright = linkage_get_link_rword(linkage, domain.start_link); + rootleft = linkage_get_link_lword(linkage, domain.start_link); + + if ((domain.type=='c') || + (domain.type=='d') || + (domain.type=='e') || + (domain.type=='f') || + (domain.type=='g') || + (domain.type=='u') || + (domain.type=='y')) + { + leftlimit = 0; + leftmost = linkage_get_link_lword(linkage, domain.start_link); + rightmost = linkage_get_link_lword(linkage, domain.start_link); + } + else + { + leftlimit = linkage_get_link_lword(linkage, domain.start_link) + 1; + leftmost = linkage_get_link_rword(linkage, domain.start_link); + rightmost = linkage_get_link_rword(linkage, domain.start_link); + } + + /* Start by assigning both left and right limits to the + * right word of the start link. This will always be contained + * in the constituent. This will also handle the case + * where the domain contains no links. 
+ */ + for (dlink = domain.lol; dlink != NULL; dlink = dlink->next) + { + l = dlink->link; + + if ((linkage_get_link_lword(linkage, l) < leftmost) && + (linkage_get_link_lword(linkage, l) >= leftlimit)) + { + leftmost = linkage_get_link_lword(linkage, l); + } + + if (linkage_get_link_rword(linkage, l) > rightmost) + { + rightmost = linkage_get_link_rword(linkage, l); + } + } + + c--; + c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, + cons_of_domain(linkage, domain.type)); + + if (domain.type == 'z') + { + c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S"); + } + if (domain.type=='c') + { + c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S"); + } + if ((post_process_match("Ce*", ctxt->constituent[c].start_link)==1) || + (post_process_match("Rn", ctxt->constituent[c].start_link)==1)) + { + c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "SBAR"); + } + if ((post_process_match("R*", ctxt->constituent[c].start_link)==1) || + (post_process_match("MX#r", ctxt->constituent[c].start_link)==1)) + { + w = leftmost; + if (strcmp(linkage->word[w], ",") == 0) w++; + c = add_constituent(ctxt, c, linkage, &domain, w, w, "WHNP"); + } + if (post_process_match("Mj", ctxt->constituent[c].start_link) == 1) + { + w = leftmost; + if (strcmp(linkage->word[w], ",") == 0) w++; + c = add_constituent(ctxt, c, linkage, &domain, w, w+1, "WHPP"); + c = add_constituent(ctxt, c, linkage, &domain, w+1, w+1, "WHNP"); + } + if ((post_process_match("Ss#d", ctxt->constituent[c].start_link)==1) || + (post_process_match("B#d", ctxt->constituent[c].start_link)==1)) + { + c = add_constituent(ctxt, c, linkage, &domain, rootleft, rootleft, "WHNP"); + c = add_constituent(ctxt, c, linkage, &domain, + rootleft, ctxt->constituent[c-1].right, "SBAR"); + } + if (post_process_match("CP", ctxt->constituent[c].start_link)==1) + { + if (strcmp(linkage->word[leftmost], ",") == 0) + ctxt->constituent[c].left++; + c = 
add_constituent(ctxt, c, linkage, &domain, 1, linkage->num_words-1, "S"); + } + if ((post_process_match("MVs", ctxt->constituent[c].start_link)==1) || + (domain.type=='f')) + { + w = ctxt->constituent[c].left; + if (strcmp(linkage->word[w], ",") == 0) + w++; + if (strcmp(linkage->word[w], "when") == 0) + { + c = add_constituent(ctxt, c, linkage, &domain, w, w, "WHADVP"); + } + } + if (domain.type=='t') + { + c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S"); + } + if ((post_process_match("QI", ctxt->constituent[c].start_link) == 1) || + (post_process_match("Mr", ctxt->constituent[c].start_link) == 1) || + (post_process_match("MX#d", ctxt->constituent[c].start_link) == 1)) + { + const char * name = ""; + w = leftmost; + if (strcmp(linkage->word[w], ",") == 0) w++; + if (ctxt->wordtype[w] == NONE) + name = "WHADVP"; + else if (ctxt->wordtype[w] == QTYPE) + name = "WHNP"; + else if (ctxt->wordtype[w] == QDTYPE) + name = "WHNP"; + else + assert(0, "Unexpected word type"); + c = add_constituent(ctxt, c, linkage, &domain, w, w, name); + + if (ctxt->wordtype[w] == QDTYPE) + { + /* Now find the finite verb to the right, start an S */ + /* Limit w2 to sentence length. 
*/ + // for( w2=w+1; w2 < ctxt->r_limit-1; w2++ ) + for (w2 = w+1; w2 < rightmost; w2++) + if ((ctxt->wordtype[w2] == STYPE) || (ctxt->wordtype[w2] == PTYPE)) break; + + /* Adjust the right boundary of previous constituent */ + ctxt->constituent[c].right = w2 - 1; + c = add_constituent(ctxt, c, linkage, &domain, w2, rightmost, "S"); + } + } + + if (ctxt->constituent[c].domain_type == '\0') + { + err_ctxt ec = { linkage->sent }; + err_msgc(&ec, lg_Error, "No domain type assigned to constituent\n"); + } + if (ctxt->constituent[c].start_link == NULL) + { + err_ctxt ec = { linkage->sent }; + err_msgc(&ec, lg_Error, "No type assigned to constituent\n"); + } + } + + numcon_subl = c - numcon_total; + /* numcon_subl = handle_islands(linkage, numcon_total, numcon_subl); */ + + if (verbosity_level(D_CONST)) + { + err_msg(lg_Debug, "Constituents added at first stage:\n\\"); + for (c = numcon_total; c < numcon_total + numcon_subl; c++) + { + /* FIXME: Here it cannot be printed as one debug message because + * a newline is printed at the end. */ + print_constituent(ctxt, linkage, c); + } + } + + /* Opener case - generates S around main clause. + (This must be done first; the S generated will be needed for + later cases.) 
*/ + numcon_subl = + gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "S", "S", CASE_OPENER); + + /* pp opener case */ + numcon_subl = + gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "S", "S", CASE_PPOPEN); + + /* participle opener case */ + numcon_subl = + gen_comp(ctxt, linkage, numcon_total, numcon_subl, "S", "S", "S", CASE_PART_OPEN); + + /* Subject-phrase case; every main VP generates an S */ + numcon_subl = + gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "S", "NP", CASE_S); + + /* Relative clause case; an SBAR generates a complement NP */ + numcon_subl = + gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "NP", "NP", CASE_REL_CLAUSE); + + /* Participle modifier case */ + numcon_subl = + gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "NP", "NP", CASE_PART_MOD); + + /* PP modifying NP */ + numcon_subl = + gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "NP", "NP", CASE_PART_MOD); + + /* Appositive case */ + numcon_subl = + gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "NP", "NP", CASE_APPOS); + + /* S-V inversion case; an NP generates a complement VP */ + numcon_subl = + gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "SINV", "VP", CASE_SVINV); + + adjust_subordinate_clauses(ctxt, linkage, numcon_total, numcon_subl); + for (c = numcon_total; c < numcon_total + numcon_subl; c++) + { + if ((ctxt->constituent[c].domain_type=='p') && + (strcmp(linkage->word[ctxt->constituent[c].left], ",")==0)) + { + ctxt->constituent[c].left++; + } + } + + /* Make sure the constituents are nested. If two constituents + * are not nested: whichever constituent has the furthest left + * boundary, shift that boundary rightwards to the left boundary + * of the other one. 
+ */ + while (true) + { + bool adjustment_made = false; + for (c = numcon_total; c < numcon_total + numcon_subl; c++) + { + for (c2 = numcon_total; c2 < numcon_total + numcon_subl; c2++) + { + if ((ctxt->constituent[c].left < ctxt->constituent[c2].left) && + (ctxt->constituent[c].right < ctxt->constituent[c2].right) && + (ctxt->constituent[c].right >= ctxt->constituent[c2].left)) + { + /* We've found two overlapping constituents. + If one is larger, except the smaller one + includes an extra comma, adjust the smaller one + to exclude the comma */ + + if ((strcmp(linkage->word[ctxt->constituent[c2].right], ",") == 0) || + (strcmp(linkage->word[ctxt->constituent[c2].right], + RIGHT_WALL_WORD) == 0)) + { + if (verbosity_level(D_CONST)) + err_msg(lg_Debug, "Adjusting %d to fix comma overlap\n", c2); + adjust_for_right_comma(ctxt, linkage, c2); + adjustment_made = true; + } + else if (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0) + { + if (verbosity_level(D_CONST)) + err_msg(lg_Debug, "Adjusting c %d to fix comma overlap\n", c); + adjust_for_left_comma(ctxt, linkage, c); + adjustment_made = true; + } + else + { + if (verbosity_level(D_CONST)) + { + err_ctxt ec = { linkage->sent }; + err_msgc(&ec, lg_Warn, + "Warning: the constituents aren't nested! " + "Adjusting them. 
(%d, %d)", c, c2); + } + ctxt->constituent[c].left = ctxt->constituent[c2].left; + } + } + } + } + if (adjustment_made == false) break; + } + + assert (numcon_total + numcon_subl < ctxt->conlen, "Too many constituents"); + return numcon_subl; +} + +static char * +exprint_constituent_structure(con_context_t *ctxt, + Linkage linkage, int numcon_total) +{ + size_t w; + int c; + bool *leftdone = alloca(numcon_total * sizeof(bool)); + bool *rightdone = alloca(numcon_total * sizeof(bool)); + int best, bestright, bestleft; + char s[MAX_WORD]; + dyn_str * cs = dyn_str_new(); + + assert (numcon_total < ctxt->conlen, "Too many constituents (b)"); + + for (c = 0; c < numcon_total; c++) + { + leftdone[c] = false; + rightdone[c] = false; + } + + /* Skip left wall; don't skip right wall, since it may + * have constituent boundaries. */ + for (w = 1; w < linkage->num_words; w++) + { + while (1) + { + best = -1; + bestright = -1; + for (c = 0; c < numcon_total; c++) + { + if ((ctxt->constituent[c].left == w) && + (leftdone[c] == false) && ctxt->constituent[c].valid && + ((int) ctxt->constituent[c].right >= bestright)) + { + best = c; + bestright = ctxt->constituent[c].right; + } + } + if (best == -1) + break; + + leftdone[best] = true; + dyn_strcat(cs, "["); + dyn_strcat(cs, ctxt->constituent[best].type); + dyn_strcat(cs, " "); + } + + /* Don't print out right wall */ + if (w < linkage->num_words - 1) + { + char *p; + strncpy(s, linkage->word[w], MAX_WORD); + s[MAX_WORD-1] = 0; + + /* Constituent processing will crash if the sentence contains + * square brackets, so we have to do something ... replace + * them with curly braces ... this is a terrible hack, but + * will have to do; for now. A better solution would be to + * allow the user to specify some reserved char as the + * bracket symbol, e.g. SOH and EOT or something like that. 
+ */ + p = strchr(s, OPEN_BRACKET); + while (p) + { + *p = '{'; + p = strchr(p, OPEN_BRACKET); + } + + p = strchr(s, CLOSE_BRACKET); + while (p) + { + *p = '}'; + p = strchr(p, CLOSE_BRACKET); + } + +#if 0 /* firstupper check removed in 0c8107a */ + /* Now, if the first character of the word was + originally uppercase, we put it back that way */ + if (linkage->chosen_disjuncts[w]->word[0]->status & WS_FIRSTUPPER) + upcase_utf8_str(s, s, MAX_WORD); +#endif + dyn_strcat(cs, s); + dyn_strcat(cs, " "); + } + + while (1) + { + best = -1; + bestleft = -1; + for(c = 0; c < numcon_total; c++) + { + if ((ctxt->constituent[c].right == w) && + (rightdone[c] == false) && ctxt->constituent[c].valid && + ((int) ctxt->constituent[c].left > bestleft)) { + best = c; + bestleft = ctxt->constituent[c].left; + } + } + if (best == -1) + break; + rightdone[best] = true; + dyn_strcat(cs, ctxt->constituent[best].type); + dyn_strcat(cs, "] "); + } + } + + dyn_strcat(cs, "\n"); + return dyn_str_take(cs); +} + +static char * do_print_flat_constituents(con_context_t *ctxt, Linkage linkage) +{ + int numcon_total= 0, numcon_subl; + char * q; + Sentence sent = linkage->sent; + + ctxt->phrase_ss = string_set_create(); + generate_misc_word_info(ctxt, linkage); + + if (NULL == sent->constituent_pp) /* First time for this sentence */ + sent->constituent_pp = post_process_new(sent->dict->hpsg_knowledge); + + do_post_process(sent->constituent_pp, linkage, linkage->is_sent_long); + + /** No-op. If we wanted to debug domain names, we could do this... 
+ * linkage_free_pp_info(linkage); + * linkage_set_domain_names(sent->constituent_pp, linkage); + */ + numcon_subl = read_constituents_from_domains(ctxt, linkage, numcon_total); + numcon_total += numcon_subl; + assert (numcon_total < ctxt->conlen, "Too many constituents (c)"); + numcon_total = merge_constituents(ctxt, linkage, numcon_total); + assert (numcon_total < ctxt->conlen, "Too many constituents (d)"); + numcon_total = new_style_conjunctions(ctxt, linkage, numcon_total); + assert (numcon_total < ctxt->conlen, "Too many constituents (e)"); + numcon_total = last_minute_fixes(ctxt, linkage, numcon_total); + assert (numcon_total < ctxt->conlen, "Too many constituents (f)"); + q = exprint_constituent_structure(ctxt, linkage, numcon_total); + string_set_delete(ctxt->phrase_ss); + ctxt->phrase_ss = NULL; + + post_process_free_data(&sent->constituent_pp->pp_data); + + return q; +} + +static char * print_flat_constituents(Linkage linkage) +{ + size_t wts = linkage->num_words * sizeof(WType); + size_t cns = (linkage->num_links + linkage->num_words) * sizeof(constituent_t); + + con_context_t ctxt; + memset(&ctxt, 0, sizeof(con_context_t)); + ctxt.wordtype = (WType *) alloca(wts); + memset(ctxt.wordtype, 0, wts); + ctxt.conlen = linkage->num_links + linkage->num_words; + ctxt.constituent = (constituent_t *) alloca(cns); + memset(ctxt.constituent, 0, cns); + + return do_print_flat_constituents(&ctxt, linkage); +} + +static CType token_type (char *token) +{ + if ((token[0] == OPEN_BRACKET) && (strlen(token) > 1)) + return OPEN_TOK; + if ((strlen(token) > 1) && (token[strlen(token) - 1] == CLOSE_BRACKET)) + return CLOSE_TOK; + return WORD_TOK; +} + +static CNode * make_CNode(char *q) +{ + CNode * cn; + cn = (CNode *) malloc(sizeof(CNode)); + cn->label = strdup(q); + cn->child = cn->next = (CNode *) NULL; + cn->next = (CNode *) NULL; + cn->start = cn->end = -1; + return cn; +} + +static CNode * parse_string(CNode * n, char **saveptr) +{ + char *q; + CNode *m, 
*last_child=NULL; + + while ((q = strtok_r(NULL, " ", saveptr))) { + switch (token_type(q)) { + case CLOSE_TOK : + q[strlen(q)-1]='\0'; + assert(strcmp(q, n->label)==0, + "Constituent tree: Labels do not match."); + return n; + break; + case OPEN_TOK: + m = make_CNode(q+1); + m = parse_string(m, saveptr); + break; + case WORD_TOK: + m = make_CNode(q); + break; + default: + assert(0, "Constituent tree: Illegal token type"); + } + if (n->child == NULL) { + last_child = n->child = m; + } + else { + last_child->next = m; + last_child = m; + } + } + assert(0, "Constituent tree: Constituent did not close"); + return NULL; +} + +static void print_tree(dyn_str * cs, int indent, CNode * n, int o1, int o2) +{ + int i, child_offset; + CNode * m; + + if (n == NULL) return; + + if (indent) + for (i = 0; i < o1; ++i) dyn_strcat(cs, " "); + + dyn_strcat(cs, "("); + dyn_strcat(cs, n->label); + dyn_strcat(cs, " "); + child_offset = o2 + strlen(n->label) + 2; + + for (m = n->child; m != NULL; m = m->next) + { + if (m->child == NULL) + { + char * p; + /* If the original string has left or right parens in it, + * the printed string will be messed up by these ... + * so replace them by curly braces. What else can one do? 
+ */ + p = strchr(m->label, '('); + while(p) + { + *p = '{'; + p = strchr(p, '('); + } + + p = strchr(m->label, ')'); + while(p) + { + *p = '}'; + p = strchr(p, ')'); + } + + dyn_strcat(cs, m->label); + if ((m->next != NULL) && (m->next->child == NULL)) + dyn_strcat(cs, " "); + } + else + { + if (m != n->child) + { + if (indent) dyn_strcat(cs, "\n"); + else dyn_strcat(cs, " "); + print_tree(cs, indent, m, child_offset, child_offset); + } + else + { + print_tree(cs, indent, m, 0, child_offset); + } + if ((m->next != NULL) && (m->next->child == NULL)) + { + if (indent) + { + dyn_strcat(cs, "\n"); + for (i = 0; i < child_offset; ++i) + dyn_strcat(cs, " "); + } + else dyn_strcat(cs, " "); + } + } + } + dyn_strcat(cs, ")"); +} + +static int assign_spans(CNode * n, int start) +{ + int num_words=0; + CNode * m=NULL; + if (n==NULL) return 0; + n->start = start; + if (n->child == NULL) { + n->end = start; + return 1; + } + else { + for (m=n->child; m!=NULL; m=m->next) { + num_words += assign_spans(m, start+num_words); + } + n->end = start+num_words-1; + } + return num_words; +} + +static CNode * linkage_constituent_tree(Linkage linkage) +{ + char *p, *q, *saveptr; + CNode * root; + + p = print_flat_constituents(linkage); + + q = strtok_r(p, " ", &saveptr); + assert(token_type(q) == OPEN_TOK, "Illegal beginning of string"); + root = make_CNode(q+1); + root = parse_string(root, &saveptr); + assign_spans(root, 0); + free(p); + return root; +} + +/* Make the compiler shut up about the deprecated functions */ +/* +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +*/ + +static void linkage_free_constituent_tree(CNode * n) +{ + CNode *m, *x; + for (m=n->child; m!=NULL; m=x) { + x=m->next; + linkage_free_constituent_tree(m); + } + free(n->label); + free(n); +} + +/** + * Print out the constituent tree. 
+ * mode 1: treebank-style constituent tree + * mode 2: flat, bracketed tree [A like [B this B] A] + * mode 3: flat, treebank-style tree (A like (B this)) + */ +char * linkage_print_constituent_tree(Linkage linkage, ConstituentDisplayStyle mode) +{ + CNode * root; + + if (!linkage) return NULL; + if (mode == NO_DISPLAY) + { + return NULL; + } + else if (mode == MULTILINE || mode == SINGLE_LINE) + { + dyn_str * cs; + cs = dyn_str_new(); + root = linkage_constituent_tree(linkage); + print_tree(cs, (mode==1), root, 0, 0); + linkage_free_constituent_tree(root); + dyn_strcat(cs, "\n"); + return dyn_str_take(cs); + } + else if (mode == BRACKET_TREE) + { + return print_flat_constituents(linkage); + } + prt_error("Warning: Illegal mode %d for printing constituents\n" + "Allowed values: %d to %d\n", mode, NO_DISPLAY, MAX_STYLES); + return NULL; +} + +void linkage_free_constituent_tree_str(char * s) +{ + free(s); +} diff -Nru link-grammar-5.3.16/link-grammar/post-process/post-process.c link-grammar-5.5.0/link-grammar/post-process/post-process.c --- link-grammar-5.3.16/link-grammar/post-process/post-process.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/post-process/post-process.c 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,1494 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +/* see bottom of file for comments on post processing */ + +#include +#include + +#include "post-process.h" + +#include "api-structures.h" +#include "error.h" +#include "linkage/linkage.h" +#include "linkage/score.h" +#include "pp_knowledge.h" +#include "pp_linkset.h" +#include "pp-structures.h" +#include "resources.h" +#include "string-set.h" + +#define PP_MAX_DOMAINS 128 + +/** + * post_process_match -- compare two link-types. + * + * string comparison in postprocessing. The first parameter is a + * post-processing symbol. The second one is a connector name from a + * link. The upper case parts must match. We imagine that the first + * arg is padded with an infinite sequence of "#" and that the 2nd one + * is padded with "*". "#" matches anything, but "*" is just like an + * ordinary char for matching purposes. + */ + +bool post_process_match(const char *s, const char *t) +{ + if (NULL == t) return false; + if (islower((int)*t)) t++; /* Skip head-dependent indicator */ + while (isupper((int)*s) || isupper((int)*t)) + { + if (*s != *t) return false; + s++; + t++; + } + while (*s != '\0') + { + if (*s != '#') + { + char c; + if (*t == '\0') c = '*'; else c = *t; + if (*s != c) return false; + } + s++; + if (*t != '\0') t++; + } + return true; +} + +/***************** utility routines (not exported) ***********************/ + +/** + * Returns false if the string s does not match anything in + * the array. The array elements are post-processing symbols. + */ +static bool string_in_list(const char * s, const char * a[]) +{ + int i; + for (i=0; a[i] != NULL; i++) + if (post_process_match(a[i], s)) return true; + return false; +} + +/** + * Return the name of the domain associated with the provided starting + * link. Return -1 if link isn't associated with a domain. 
+ */ +static size_t find_domain_name(Postprocessor *pp, const char *link) +{ + size_t i, domain; + StartingLinkAndDomain *sllt = pp->knowledge->starting_link_lookup_table; + for (i=0;;i++) + { + domain = sllt[i].domain; + if (domain == SIZE_MAX) return SIZE_MAX; /* hit the end-of-list sentinel */ + if (post_process_match(sllt[i].starting_link, link)) return domain; + } +} + +/** Returns true if domain d1 is contained in domain d2 */ +static bool contained_in(const Domain * d1, const Domain * d2, + const Linkage sublinkage) +{ + bool *mark = alloca(sublinkage->num_links*sizeof(bool)); + List_o_links * lol; + memset(mark, 0, sublinkage->num_links*(sizeof(bool))); + for (lol=d2->lol; lol != NULL; lol = lol->next) + mark[lol->link] = true; + for (lol=d1->lol; lol != NULL; lol = lol->next) + if (!mark[lol->link]) return false; + return true; +} + +/** Returns the predicate "the given link is in the given domain" */ +static bool link_in_domain(size_t link, const Domain * d) +{ + List_o_links * lol; + for (lol = d->lol; lol != NULL; lol = lol->next) + if (lol->link == link) return true; + return false; +} + +/* #define CHECK_DOMAIN_NESTING */ + +#if defined(CHECK_DOMAIN_NESTING) +/* Although this is no longer used, I'm leaving the code here for future reference --DS 3/98 */ + +/* Returns true if the domains actually form a properly nested structure */ +static bool check_domain_nesting(Postprocessor *pp, int num_links) +{ + size_t id1, id2; + Domain * d1, * d2; + int counts[4]; + char mark[MAX_NUM_LINKS]; + List_o_links * lol; + int i; + PP_data *pp_data = &pp->pp_data; + + for (id1 = 0; id1 < pp_data->N_domains; id1++) + { + d1 = &pp_data->domain_array[id1]; + for (id2 = id1+1; id2 < pp_data->N_domains; id2++) + { + d2 = &pp_data->domain_array[id2]; + + memset(mark, 0, num_links); + for (lol=d2->lol; lol != NULL; lol = lol->next) + mark[lol->link] = 1; + + for (lol=d1->lol; lol != NULL; lol = lol->next) + mark[lol->link] += 2; + + counts[0] = counts[1] = counts[2] = 
counts[3] = 0; + for (i=0; i 0) && (counts[2] > 0) && (counts[3] > 0)) + return false; + } + } + return true; +} +#endif + +/** + * Free the list of links pointed to by lol + * (does not free any strings) + */ +static void free_List_o_links(List_o_links *lol) +{ + List_o_links * xlol; + while (lol != NULL) + { + xlol = lol->next; + free(lol); + lol = xlol; + } +} + +static void free_D_tree_leaves(DTreeLeaf *dtl) +{ + DTreeLeaf * xdtl; + while (dtl != NULL) + { + xdtl = dtl->next; + free(dtl); + dtl = xdtl; + } +} + +static void pp_free_domain_array(PP_data *ppd) +{ + size_t d; + for (d = 0; d < ppd->domlen; d++) + { + free_List_o_links(ppd->domain_array[d].lol); + ppd->domain_array[d].lol = NULL; + free_D_tree_leaves(ppd->domain_array[d].child); + ppd->domain_array[d].child = NULL; + } +} + +/** + * Gets called after every invocation of post_process() + */ +void post_process_free_data(PP_data * ppd) +{ + size_t w; + for (w = 0; w < ppd->wowlen; w++) + { + free_List_o_links(ppd->word_links[w]); + ppd->word_links[w] = NULL; + } + + pp_free_domain_array(ppd); + free_List_o_links(ppd->links_to_ignore); + ppd->links_to_ignore = NULL; + ppd->num_words = 0; + ppd->N_domains = 0; +} + +#ifdef THIS_FUNCTION_IS_NOT_CURRENTLY_USED +static void connectivity_dfs(Postprocessor *pp, Linkage sublinkage, + int w, pp_linkset *ls) +{ + List_o_links *lol; + assert(w < pp_data->num_words, "Bad word index"); + pp_data->visited[w] = true; + for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) + { + if (!pp_data->visited[lol->word] && + !pp_linkset_match(ls, sublinkage->link[lol->link]->name)) + connectivity_dfs(pp, sublinkage, lol->word, ls); + } +} +#endif /* THIS_FUNCTION_IS_NOT_CURRENTLY_USED */ + +const char * linkage_get_violation_name(const Linkage linkage) +{ + return linkage->lifo.pp_violation_msg; +} + +/************************ rule application *******************************/ + +static void clear_visited(PP_data *pp_data) +{ + memset(pp_data->visited, 0, 
pp_data->num_words * sizeof(bool)); +} + +static bool apply_rules(PP_data *pp_data, + bool (applyfn) (PP_data *, Linkage, pp_rule *), + Linkage sublinkage, + pp_rule *rule_array, + const char **msg) +{ + int i; + for (i = 0; (*msg = rule_array[i].msg) != NULL; i++) + { + if (!applyfn(pp_data, sublinkage, &(rule_array[i]))) + { + rule_array[i].use_count ++; + return false; + } + } + return true; +} + +static bool +apply_relevant_rules(Postprocessor *pp, + bool (applyfn)(PP_data *, Linkage, pp_rule *), + Linkage sublinkage, + pp_rule *rule_array, + int *relevant_rules, + const char **msg) +{ + int i, idx; + PP_data *pp_data = &pp->pp_data; + + /* If we didn't accumulate link names for this sentence, we need + * to apply all rules. */ + if (pp_linkset_population(pp->set_of_links_of_sentence) == 0) { + return apply_rules(pp_data, applyfn, sublinkage, rule_array, msg); + } + + /* We did, and we don't. */ + for (i = 0; (idx = relevant_rules[i]) != -1; i++) + { + *msg = rule_array[idx].msg; + if (!applyfn(pp_data, sublinkage, &(rule_array[idx]))) return false; + } + return true; +} + +/** + * returns true if and only if all groups containing the specified link + * contain at least one from the required list. 
(as determined by exact + * string matching) + */ +static bool +apply_contains_one(PP_data *pp_data, Linkage sublinkage, pp_rule *rule) +{ + DTreeLeaf * dtl; + size_t d, count; + + for (d=0; dN_domains; d++) + { + for (dtl = pp_data->domain_array[d].child; + dtl != NULL && + !post_process_match(rule->selector, + sublinkage->link_array[dtl->link].link_name); + dtl = dtl->next) {} + if (dtl != NULL) + { + /* selector link of rule appears in this domain */ + count=0; + for (dtl = pp_data->domain_array[d].child; dtl != NULL; dtl = dtl->next) + { + if (string_in_list(sublinkage->link_array[dtl->link].link_name, + rule->link_array)) + { + count=1; + break; + } + } + if (count == 0) return false; + } + } + return true; +} + + +/** + * Returns true if and only if: + * all groups containing the selector link do not contain anything + * from the link_array contained in the rule. Uses exact string matching. + */ +static bool +apply_contains_none(PP_data *pp_data, Linkage sublinkage, pp_rule *rule) +{ + size_t d; + + for (d=0; dN_domains; d++) + { + DTreeLeaf * dtl; + for (dtl = pp_data->domain_array[d].child; + dtl != NULL && + !post_process_match(rule->selector, + sublinkage->link_array[dtl->link].link_name); + dtl = dtl->next) {} + if (dtl != NULL) + { + /* selector link of rule appears in this domain */ + for (dtl = pp_data->domain_array[d].child; dtl != NULL; dtl = dtl->next) + { + if (string_in_list(sublinkage->link_array[dtl->link].link_name, + rule->link_array)) + return false; + } + } + } + return true; +} + +/** + * Returns true if and only if + * (1) the sentence doesn't contain the selector link for the rule, or + * (2) it does, and it also contains one or more from the rule's link set + */ +static bool +apply_contains_one_globally(PP_data *pp_data, Linkage sublinkage, pp_rule *rule) +{ + size_t i, j, count; + for (i = 0; i < sublinkage->num_links; i++) + { + assert(sublinkage->link_array[i].lw != SIZE_MAX); + if (post_process_match(rule->selector, 
sublinkage->link_array[i].link_name)) break; + } + if (i == sublinkage->num_links) return true; + + /* selector link of rule appears in sentence */ + count = 0; + for (j = 0; j < sublinkage->num_links && count == 0; j++) + { + assert(sublinkage->link_array[j].lw != SIZE_MAX); + if (string_in_list(sublinkage->link_array[j].link_name, rule->link_array)) + { + count = 1; + break; + } + } + if (count == 0) return false; else return true; +} + +/** + * For each link in the linkage that is in the must_form_a_cycle list, + * we want to make sure that that link is in a cycle. We do this + * simply by deleting the link, then seeing if the end points of that + * link are still connected. + */ +static void reachable_without_dfs(PP_data *pp_data, + Linkage sublinkage, size_t a, size_t b, size_t w) +{ + /* This is a depth first search of words reachable from w, excluding + * any direct edge between word a and word b. */ + List_o_links *lol; + assert(w < pp_data->num_words, "Bad word index"); + pp_data->visited[w] = true; + for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) + { + assert(lol->word < pp_data->num_words, "Bad word index"); + if (!pp_data->visited[lol->word] && + !(w == a && lol->word == b) && + !(w == b && lol->word == a)) + { + reachable_without_dfs(pp_data, sublinkage, a, b, lol->word); + } + } +} + +/** + * Returns true if the linkage is connected when ignoring the links + * whose names are in the given list of link names. + * Actually, what it does is this: it returns false if the connectivity + * of the subgraph reachable from word 0 changes as a result of deleting + * these links. 
+ */ +static bool +apply_must_form_a_cycle(PP_data *pp_data, Linkage sublinkage, pp_rule *rule) +{ + List_o_links *lol; + size_t w; + + for (w = 0; w < pp_data->num_words; w++) + { + for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) + { + if (w > lol->word) continue; /* only consider each edge once */ + if (!pp_linkset_match(rule->link_set, sublinkage->link_array[lol->link].link_name)) continue; + + clear_visited(pp_data); + reachable_without_dfs(pp_data, sublinkage, w, lol->word, w); + if (!pp_data->visited[lol->word]) return false; + } + } + + for (lol = pp_data->links_to_ignore; lol != NULL; lol = lol->next) + { + w = sublinkage->link_array[lol->link].lw; + /* (w, lol->word) are the left and right ends of the edge we're considering */ + if (!pp_linkset_match(rule->link_set, sublinkage->link_array[lol->link].link_name)) continue; + + clear_visited(pp_data); + reachable_without_dfs(pp_data, sublinkage, w, lol->word, w); + + assert(lol->word < pp_data->num_words, "Bad word index"); + if (!pp_data->visited[lol->word]) return false; + } + + return true; +} + +/** + * Checks to see that all domains with this name have the property that + * all of the words that touch a link in the domain are not to the left + * of the root word of the domain. + */ +static bool +apply_bounded(PP_data *pp_data, Linkage sublinkage, pp_rule *rule) +{ + size_t d, lw; + List_o_links * lol; + char d_type = rule->domain; + + for (d = 0; d < pp_data->N_domains; d++) + { + if (pp_data->domain_array[d].type != d_type) continue; + lw = sublinkage->link_array[pp_data->domain_array[d].start_link].lw; + for (lol = pp_data->domain_array[d].lol; lol != NULL; lol = lol->next) + { + if (sublinkage->link_array[lol->link].lw < lw) return false; + } + } + return true; +} + +/** + * fill in the pp->pp_data.word_links array with a list of words + * neighboring each word (actually a list of links). This is an + * undirected graph. 
+ */ +static void build_graph(Postprocessor *pp, Linkage sublinkage) +{ + size_t link; + List_o_links * lol; + PP_data *pp_data = &pp->pp_data; + + /* Get more size, if needed */ + if (pp_data->wowlen <= pp_data->num_words) + { + size_t newsz; + pp_data->wowlen += pp_data->num_words; + newsz = pp_data->wowlen * sizeof(List_o_links *); + pp_data->word_links = (List_o_links **) realloc( + pp_data->word_links, newsz); + } + memset(pp_data->word_links, 0, pp_data->wowlen * sizeof(List_o_links *)); + + for (link = 0; link < sublinkage->num_links; link++) + { + assert (sublinkage->link_array[link].lw != SIZE_MAX); + if (NULL == sublinkage->link_array[link].link_name) continue; + if (pp_linkset_match(pp->knowledge->ignore_these_links, + sublinkage->link_array[link].link_name)) + { + lol = (List_o_links *) malloc(sizeof(List_o_links)); + lol->next = pp_data->links_to_ignore; + pp_data->links_to_ignore = lol; + lol->link = link; + lol->word = sublinkage->link_array[link].rw; + continue; + } + + lol = (List_o_links *) malloc(sizeof(List_o_links)); + lol->next = pp_data->word_links[sublinkage->link_array[link].lw]; + pp_data->word_links[sublinkage->link_array[link].lw] = lol; + lol->link = link; + lol->word = sublinkage->link_array[link].rw; + + lol = (List_o_links *) malloc(sizeof(List_o_links)); + lol->next = pp_data->word_links[sublinkage->link_array[link].rw]; + pp_data->word_links[sublinkage->link_array[link].rw] = lol; + lol->link = link; + lol->word = sublinkage->link_array[link].lw; + } +} + +static void setup_domain_array(Postprocessor *pp, + const char *string, int start_link) +{ + PP_data *pp_data = &pp->pp_data; + size_t n = pp_data->N_domains; + + /* Grab more memory if needed */ + if (pp_data->domlen <= n) + { + size_t oldsz, incsz; +#define DOMINC 16 + oldsz = pp_data->domlen * sizeof(Domain); + incsz = DOMINC * sizeof(Domain); + pp_data->domain_array = (Domain *) realloc(pp_data->domain_array, + oldsz + incsz); + memset(&pp_data->domain_array[pp_data->domlen], 
0, incsz); + pp_data->domlen += DOMINC; + } + + pp_data->domain_array[n].string = string; + pp_data->domain_array[n].lol = NULL; + pp_data->domain_array[n].size = 0; + pp_data->domain_array[n].start_link = start_link; + + pp_data->N_domains++; + assert(pp_data->N_domainsN_domains - 1; /* the very last one */ + List_o_links *lol = (List_o_links *) malloc(sizeof(List_o_links)); + + lol->next = pp_data->domain_array[n].lol; + pp_data->domain_array[n].lol = lol; + pp_data->domain_array[n].size++; + lol->link = link; +} + +static void depth_first_search(Postprocessor *pp, Linkage sublinkage, + size_t w, size_t root, size_t start_link) +{ + List_o_links *lol; + PP_data *pp_data = &pp->pp_data; + + assert(w < pp_data->num_words, "Bad word index"); + pp_data->visited[w] = true; + for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) + { + if (lol->word < w && lol->link != start_link) + { + add_link_to_domain(pp_data, lol->link); + } + } + for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) + { + if (!pp_data->visited[lol->word] && (lol->word != root) && + !(lol->word < root && lol->word < w && + pp_linkset_match(pp->knowledge->restricted_links, + sublinkage->link_array[lol->link].link_name))) + { + depth_first_search(pp, sublinkage, lol->word, root, start_link); + } + } +} + +static void bad_depth_first_search(Postprocessor *pp, Linkage sublinkage, + size_t w, size_t root, size_t start_link) +{ + List_o_links * lol; + PP_data *pp_data = &pp->pp_data; + + assert(w < pp_data->num_words, "Bad word index"); + pp_data->visited[w] = true; + for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) + { + if ((lol->word < w) && (lol->link != start_link) && (w != root)) + { + add_link_to_domain(pp_data, lol->link); + } + } + for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) + { + assert(lol->word < pp_data->num_words, "Bad word index"); + if ((!pp_data->visited[lol->word]) && !(w == root && lol->word < w) && + !(lol->word < root && 
lol->word < w && + pp_linkset_match(pp->knowledge->restricted_links, + sublinkage->link_array[lol->link].link_name))) + { + bad_depth_first_search(pp, sublinkage, lol->word, root, start_link); + } + } +} + +static void d_depth_first_search(Postprocessor *pp, Linkage sublinkage, + size_t w, size_t root, size_t right, size_t start_link) +{ + List_o_links * lol; + PP_data *pp_data = &pp->pp_data; + + assert(w < pp_data->num_words, "Bad word index"); + pp_data->visited[w] = true; + for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) + { + if ((lol->word < w) && (lol->link != start_link) && (w != root)) + { + add_link_to_domain(pp_data, lol->link); + } + } + for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) + { + assert(lol->word < pp_data->num_words, "Bad word index"); + if (!pp_data->visited[lol->word] && !(w == root && lol->word >= right) && + !(w == root && lol->word < root) && + !(lol->word < root && lol->word < w && + pp_linkset_match(pp->knowledge->restricted_links, + sublinkage->link_array[lol->link].link_name))) + { + d_depth_first_search(pp,sublinkage,lol->word,root,right,start_link); + } + } +} + +static void left_depth_first_search(Postprocessor *pp, Linkage sublinkage, + size_t w, size_t right, size_t start_link) +{ + List_o_links *lol; + PP_data *pp_data = &pp->pp_data; + + assert(w < pp_data->num_words, "Bad word index"); + pp_data->visited[w] = true; + for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) + { + if (lol->word < w && lol->link != start_link) + { + add_link_to_domain(pp_data, lol->link); + } + } + for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) + { + assert(lol->word < pp_data->num_words, "Bad word index"); + if (!pp_data->visited[lol->word] && (lol->word != right)) + { + depth_first_search(pp, sublinkage, lol->word, right, start_link); + } + } +} + +static int domain_compare(const Domain * d1, const Domain * d2) +{ + return (d1->size - d2->size); /* for sorting the domains by size */ 
+} + +static void build_domains(Postprocessor *pp, Linkage sublinkage) +{ + size_t link, i, d; + const char *s; + PP_data *pp_data = &pp->pp_data; + + pp_data->N_domains = 0; + + for (link = 0; linknum_links; link++) + { + assert (sublinkage->link_array[link].lw != SIZE_MAX); + if (NULL == sublinkage->link_array[link].link_name) continue; + s = sublinkage->link_array[link].link_name; + + if (pp_linkset_match(pp->knowledge->ignore_these_links, s)) continue; + if (pp_linkset_match(pp->knowledge->domain_starter_links, s)) + { + setup_domain_array(pp, s, link); + if (pp_linkset_match(pp->knowledge->domain_contains_links, s)) + add_link_to_domain(pp_data, link); + + clear_visited(pp_data); + depth_first_search(pp, sublinkage, sublinkage->link_array[link].rw, + sublinkage->link_array[link].lw, link); + } + else + if (pp_linkset_match(pp->knowledge->urfl_domain_starter_links, s)) + { + setup_domain_array(pp, s, link); + /* always add the starter link to its urfl domain */ + add_link_to_domain(pp_data, link); + + clear_visited(pp_data); + bad_depth_first_search(pp, sublinkage,sublinkage->link_array[link].rw, + sublinkage->link_array[link].lw, link); + } + else + if (pp_linkset_match(pp->knowledge->urfl_only_domain_starter_links, s)) + { + setup_domain_array(pp, s, link); + /* do not add the starter link to its urfl_only domain */ + clear_visited(pp_data); + d_depth_first_search(pp, sublinkage, sublinkage->link_array[link].lw, + sublinkage->link_array[link].lw, + sublinkage->link_array[link].rw, link); + } + else + if (pp_linkset_match(pp->knowledge->left_domain_starter_links, s)) + { + setup_domain_array(pp, s, link); + /* do not add the starter link to a left domain */ + clear_visited(pp_data); + left_depth_first_search(pp, sublinkage, sublinkage->link_array[link].lw, + sublinkage->link_array[link].rw, link); + } + } + + /* sort the domains by size */ + qsort((void *) pp_data->domain_array, + pp_data->N_domains, + sizeof(Domain), + (int (*)(const void *, const void *)) 
domain_compare); + + /* sanity check: all links in all domains have a legal domain name */ + for (d = 0; d < pp_data->N_domains; d++) + { + i = find_domain_name(pp, pp_data->domain_array[d].string); + if (i == SIZE_MAX) + prt_error("Error: post_process(): Need an entry for %s in LINK_TYPE_TABLE\n", + pp_data->domain_array[d].string); + pp_data->domain_array[d].type = i; + } +} + +static void build_domain_forest(PP_data *pp_data, Linkage sublinkage) +{ + size_t d, d1, link; + DTreeLeaf * dtl; + + if (0 == pp_data->N_domains) return; + + pp_data->domain_array[pp_data->N_domains-1].parent = NULL; + for (d=0; d < pp_data->N_domains-1; d++) + { + for (d1 = d+1; d1 < pp_data->N_domains; d1++) + { + if (contained_in(&pp_data->domain_array[d], &pp_data->domain_array[d1], sublinkage)) + { + pp_data->domain_array[d].parent = &pp_data->domain_array[d1]; + break; + } + } + if (d1 == pp_data->N_domains) + { + /* we know this domain is a root of a new tree */ + pp_data->domain_array[d].parent = NULL; + } + } + + /* The parent links of domain nodes have been established. + * Now do the leaves. 
*/ + for (d = 0; d < pp_data->N_domains; d++) + { + pp_data->domain_array[d].child = NULL; + } + + for (link=0; link < sublinkage->num_links; link++) + { + assert (sublinkage->link_array[link].lw != SIZE_MAX); + for (d=0; dN_domains; d++) + { + if (link_in_domain(link, &pp_data->domain_array[d])) + { + dtl = (DTreeLeaf *) malloc(sizeof(DTreeLeaf)); + dtl->link = link; + dtl->parent = &pp_data->domain_array[d]; + dtl->next = pp_data->domain_array[d].child; + pp_data->domain_array[d].child = dtl; + break; + } + } + } +} + +static int +internal_process(Postprocessor *pp, Linkage sublinkage, const char **msg) +{ + size_t i; + PP_data *pp_data = &pp->pp_data; + + /* quick test: try applying just the relevant global rules */ + if (!apply_relevant_rules(pp, apply_contains_one_globally, + sublinkage, + pp->knowledge->contains_one_rules, + pp->relevant_contains_one_rules, msg)) + { + for (i = 0; i < pp_data->wowlen; i++) + pp_data->word_links[i] = NULL; + pp_data->N_domains = 0; + return -1; + } + + /* build graph; confirm that it's legally connected */ + build_graph(pp, sublinkage); + build_domains(pp, sublinkage); + build_domain_forest(&pp->pp_data, sublinkage); + +#if defined(CHECK_DOMAIN_NESTING) + /* These messages were deemed to not be useful, so + * this code is commented out. See comment above. 
*/ + if (!check_domain_nesting(pp, sublinkage->num_links)) + prt_error("Warning: The domains are not nested.\n"); +#endif + + /* The order below should be optimal for most cases */ + if (!apply_relevant_rules(pp, apply_contains_one, sublinkage, + pp->knowledge->contains_one_rules, + pp->relevant_contains_one_rules, msg)) return 1; + if (!apply_relevant_rules(pp, apply_contains_none, sublinkage, + pp->knowledge->contains_none_rules, + pp->relevant_contains_none_rules, msg)) return 1; + if (!apply_rules(pp_data, apply_must_form_a_cycle, sublinkage, + pp->knowledge->form_a_cycle_rules,msg)) return 1; + if (!apply_rules(pp_data, apply_bounded, sublinkage, + pp->knowledge->bounded_rules, msg)) return 1; + return 0; /* This linkage satisfied all the rules */ +} + + +/** + * Call this (a) after having called post_process_scan_linkage() on all + * generated linkages, but (b) before calling post_process() on any + * particular linkage. Here we mark all rules which we know (from having + * accumulated a set of link names appearing in *any* linkage) that won't + * ever be needed. + */ +static void prune_irrelevant_rules(Postprocessor *pp) +{ + pp_rule *rule; + int coIDX, cnIDX, rcoIDX = 0, rcnIDX = 0; + + /* If we didn't scan any linkages, there's no pruning to be done. 
*/ + if (pp_linkset_population(pp->set_of_links_of_sentence) == 0) return; + + for (coIDX = 0; ; coIDX++) + { + rule = &(pp->knowledge->contains_one_rules[coIDX]); + if (rule->msg == NULL) break; + if (pp_linkset_match_bw(pp->set_of_links_of_sentence, rule->selector)) + { + /* Mark rule as being relevant to this sentence */ + pp->relevant_contains_one_rules[rcoIDX++] = coIDX; + pp_linkset_add(pp->set_of_links_in_an_active_rule, rule->selector); + } + } + pp->relevant_contains_one_rules[rcoIDX] = -1; /* end sentinel */ + + for (cnIDX = 0; ; cnIDX++) + { + rule = &(pp->knowledge->contains_none_rules[cnIDX]); + if (rule->msg == NULL) break; + if (pp_linkset_match_bw(pp->set_of_links_of_sentence, rule->selector)) + { + pp->relevant_contains_none_rules[rcnIDX++] = cnIDX; + pp_linkset_add(pp->set_of_links_in_an_active_rule, rule->selector); + } + } + pp->relevant_contains_none_rules[rcnIDX] = -1; + + if (verbosity_level(5)) + { + err_msg(lg_Debug, "PP: Saw %zd unique link names in all linkages.\n\\", + pp_linkset_population(pp->set_of_links_of_sentence)); + err_msg(lg_Debug, "PP: Using %i 'contains one' rules " + "and %i 'contains none' rules\n", + rcoIDX, rcnIDX); + } +} + + +/***************** definitions of exported functions ***********************/ + +#define PP_INITLEN 60 /* just starting size, it is expanded if needed */ + +static void pp_new_domain_array(PP_data *pp_data) +{ + pp_data->domlen = PP_INITLEN; + pp_data->domain_array = (Domain*) malloc(pp_data->domlen * sizeof(Domain)); + memset(pp_data->domain_array, 0, pp_data->domlen * sizeof(Domain)); +} + +/** + * read rules from path and initialize the appropriate fields in + * a postprocessor structure, a pointer to which is returned. 
+ */ +Postprocessor * post_process_new(pp_knowledge * kno) +{ + Postprocessor *pp; + PP_data *pp_data; + + if (NULL == kno) return NULL; + + pp = (Postprocessor *) malloc (sizeof(Postprocessor)); + pp->knowledge = kno; + pp->string_set = string_set_create(); + pp->set_of_links_of_sentence = pp_linkset_open(1024); + pp->set_of_links_in_an_active_rule = pp_linkset_open(1024); + pp->relevant_contains_one_rules = + (int *) malloc ((pp->knowledge->n_contains_one_rules + 1) + *(sizeof pp->relevant_contains_one_rules[0])); + pp->relevant_contains_none_rules = + (int *) malloc ((pp->knowledge->n_contains_none_rules + 1) + *(sizeof pp->relevant_contains_none_rules[0])); + pp->relevant_contains_one_rules[0] = -1; + pp->relevant_contains_none_rules[0] = -1; + pp->violation = NULL; + pp->n_local_rules_firing = 0; + pp->n_global_rules_firing = 0; + + pp->q_pruned_rules = false; + + pp_data = &pp->pp_data; + pp_data->vlength = PP_INITLEN; + pp_data->visited = (bool*) malloc(pp_data->vlength * sizeof(bool)); + memset(pp_data->visited, 0, pp_data->vlength * sizeof(bool)); + + pp_data->links_to_ignore = NULL; + pp_new_domain_array(pp_data); + + pp_data->wowlen = PP_INITLEN; + pp_data->word_links = (List_o_links **) malloc(pp_data->wowlen * sizeof(List_o_links*)); + memset(pp_data->word_links, 0, pp_data->wowlen * sizeof(List_o_links *)); + + return pp; +} + +void post_process_free(Postprocessor *pp) +{ + PP_data *pp_data; + + /* frees up memory associated with pp, previously allocated by open */ + if (pp == NULL) return; + string_set_delete(pp->string_set); + pp_linkset_close(pp->set_of_links_of_sentence); + pp_linkset_close(pp->set_of_links_in_an_active_rule); + free(pp->relevant_contains_one_rules); + free(pp->relevant_contains_none_rules); + pp->knowledge = NULL; + pp->violation = NULL; + + pp_data = &pp->pp_data; + post_process_free_data(pp_data); + free(pp_data->visited); + free(pp_data->domain_array); + free(pp_data->word_links); + + free(pp); +} + +/** + * During a first 
pass (prior to actual post-processing of the linkages + * of a sentence), call this once for every generated linkage. Here we + * simply maintain a set of "seen" link names for rule pruning, later on. + */ +static void post_process_scan_linkage(Postprocessor *pp, Linkage linkage) +{ + size_t i; + if (pp == NULL) return; + for (i = 0; i < linkage->num_links; i++) + { + assert(linkage->link_array[i].lw != SIZE_MAX); + + pp_linkset_add(pp->set_of_links_of_sentence, + linkage->link_array[i].link_name); + } +} + +static size_t report_rule_use(pp_rule *set) +{ + size_t cnt = 0; + size_t i; + for (i=0; set[i].msg != NULL; i++) + { + err_msg(lg_Debug, "Used: %d rule: %s\n", set[i].use_count, set[i].msg); + cnt++; + } + return cnt; +} + +static size_t report_unused_rule(pp_rule *set) +{ + size_t i; + size_t cnt = 0; + for (i=0; set[i].msg != NULL; i++) + { + if (0 == set[i].use_count) + { + err_msg(lg_Debug, "Unused rule: %s\n", set[i].msg); + cnt++; + } + } + return cnt; +} + +static void report_pp_stats(Postprocessor *pp) +{ + size_t rule_cnt = 0; + size_t unused_cnt = 0; + pp_knowledge * kno; + if (!verbosity_level(9)) return; + + err_msg(lg_Debug, "PP stats: local_rules_firing=%d\n", pp->n_local_rules_firing); + kno = pp->knowledge; + + err_msg(lg_Debug, "\nPP stats: form_a_cycle_rules\n"); + rule_cnt += report_rule_use(kno->form_a_cycle_rules); + + err_msg(lg_Debug, "\nPP stats: contains_one_rules\n"); + rule_cnt += report_rule_use(kno->contains_one_rules); + + err_msg(lg_Debug, "\nPP stats: contains_none_rules\n"); + rule_cnt += report_rule_use(kno->contains_none_rules); + + err_msg(lg_Debug, "\nPP stats: bounded_rules\n"); + rule_cnt += report_rule_use(kno->bounded_rules); + + err_msg(lg_Debug, "\nPP stats: Rules that were not used:\n"); + unused_cnt += report_unused_rule(kno->form_a_cycle_rules); + unused_cnt += report_unused_rule(kno->contains_one_rules); + unused_cnt += report_unused_rule(kno->contains_none_rules); + unused_cnt += 
report_unused_rule(kno->bounded_rules); + + err_msg(lg_Debug, "\nPP stats: %zd of %zd rules unused\n", unused_cnt, rule_cnt); +} + +/** + * NB: linkage->link[i]->l=-1 means that this connector is to be ignored. + */ +void do_post_process(Postprocessor *pp, Linkage sublinkage, bool is_long) +{ + const char *msg; + PP_data *pp_data; + + if (pp == NULL) return; + pp_data = &pp->pp_data; + + // XXX wtf .. why is this not leaking memory ? + pp_data->links_to_ignore = NULL; + + pp_data->num_words = sublinkage->num_words; + + /* Grab more memory if needed */ + if (pp_data->vlength <= pp_data->num_words) + { + size_t newsz; + pp_data->vlength += pp_data->num_words; + newsz = pp_data->vlength * sizeof(bool); + pp_data->visited = (bool *) realloc(pp_data->visited, newsz); + } + clear_visited(pp_data); + + /* For long sentences, we can save some time by pruning the rules + * which can't possibly be used during postprocessing the linkages + * of this sentence. For short sentences, this is pointless. */ + if (is_long && pp->q_pruned_rules == false) + { + prune_irrelevant_rules(pp); + } + pp->q_pruned_rules = true; + + switch (internal_process(pp, sublinkage, &msg)) + { + case -1: + /* some global test failed even before we had to build the domains */ + pp->n_global_rules_firing++; + pp->violation = msg; + report_pp_stats(pp); + return; + case 1: + /* one of the "normal" post processing tests failed */ + pp->n_local_rules_firing++; + pp->violation = msg; + break; + case 0: + /* This linkage is legal according to the post processing rules */ + pp->violation = NULL; + break; + } + + report_pp_stats(pp); +} + +/** + * This does basic post-processing for all linkages. 
+ */ +void post_process_lkgs(Sentence sent, Parse_Options opts) +{ + size_t in; + size_t N_linkages_post_processed = 0; + size_t N_valid_linkages = sent->num_valid_linkages; + size_t N_linkages_alloced = sent->num_linkages_alloced; + bool twopass = sent->length >= opts->twopass_length; + Postprocessor *pp = sent->postprocessor; + + /* Special-case the "amy/ady" morphology handling. */ + /* More generally, it there's no post-processor, do nothing. */ + /* Well, almost nothing. We still want to assign a score. */ + // if (sent->dict->affix_table->anysplit) + if (NULL == pp) + { + sent->num_linkages_post_processed = sent->num_valid_linkages; + for (in=0; in < N_linkages_alloced; in++) + { + Linkage lkg = &sent->lnkages[in]; + linkage_score(lkg, opts); + } + return; + } + + /* (optional) First pass: just visit the linkages */ + /* The purpose of the first pass is to make the post-processing + * more efficient. Because (hopefully) by the time the real work + * is done in the 2nd pass, the relevant rule set has been pruned + * in the first pass. + */ + if (twopass) + { + for (in=0; in < N_linkages_alloced; in++) + { + Linkage lkg = &sent->lnkages[in]; + Linkage_info *lifo = &lkg->lifo; + + if (lifo->discarded || lifo->N_violations) continue; + + post_process_scan_linkage(pp, lkg); + + if ((49 == in%50) && resources_exhausted(opts->resources)) break; + } + } + + /* Second pass: actually perform post-processing */ + for (in=0; in < N_linkages_alloced; in++) + { + Linkage lkg = &sent->lnkages[in]; + Linkage_info *lifo = &lkg->lifo; + + if (lifo->discarded || lifo->N_violations) continue; + + do_post_process(pp, lkg, twopass); + post_process_free_data(&pp->pp_data); + + if (NULL != pp->violation) + { + N_valid_linkages--; + lifo->N_violations++; + + /* Set the message, only if not set (e.g. 
by sane_morphism) */ + if (NULL == lifo->pp_violation_msg) + lifo->pp_violation_msg = pp->violation; + } + N_linkages_post_processed++; + + linkage_score(lkg, opts); + if ((9 == in%10) && resources_exhausted(opts->resources)) break; + } + + /* If the timer expired, then we never finished post-processing. + * Mark the remaining sentences as bad, as otherwise strange + * results get reported. */ + for (; in < N_linkages_alloced; in++) + { + Linkage lkg = &sent->lnkages[in]; + Linkage_info *lifo = &lkg->lifo; + + if (lifo->discarded || lifo->N_violations) continue; + + N_valid_linkages--; + lifo->N_violations++; + + /* Set the message, only if not set (e.g. by sane_morphism) */ + if (NULL == lifo->pp_violation_msg) + lifo->pp_violation_msg = "Timeout during postprocessing"; + } + + print_time(opts, "Postprocessed all linkages"); + + if (verbosity_level(6)) + { + err_msg(lg_Info, "%zu of %zu linkages with no P.P. violations\n", + N_valid_linkages, N_linkages_post_processed); + } + + sent->num_linkages_post_processed = N_linkages_post_processed; + sent->num_valid_linkages = N_valid_linkages; +} + +/* ================ compute the domain names ============= */ +/* + * The code below is used in one place only: when printing the domain + * names. If the domain names are not being printed, then this is a + * complete waste of CPU time. 
+ */ + +static void free_domain_names(PP_domains *ppi) +{ + if (ppi->num_domains > 0) free(ppi->domain_name); + ppi->domain_name = NULL; + ppi->num_domains = 0; +} + +void linkage_free_pp_domains(Linkage lkg) +{ + size_t j; + if (!lkg || !lkg->pp_domains) return; + + for (j = 0; j < lkg->num_links; ++j) + free_domain_names(&lkg->pp_domains[j]); + free(lkg->pp_domains); + lkg->pp_domains = NULL; +} + +typedef struct D_type_list_s D_type_list; +struct D_type_list_s +{ + D_type_list * next; + int type; +}; + +static void free_d_type(D_type_list * dtl) +{ + D_type_list * dtlx; + for (; dtl != NULL; dtl = dtlx) + { + dtlx = dtl->next; + free((void*) dtl); + } +} + +static D_type_list ** build_type_array(PP_data *pp_data, + size_t numlinks) +{ + size_t nbytes = numlinks * sizeof(D_type_list*); + D_type_list** dta = malloc(nbytes); + memset(dta, 0, nbytes); + + for (size_t d = 0; d < pp_data->N_domains; d++) + { + List_o_links * lol; + for (lol = pp_data->domain_array[d].lol; lol != NULL; lol = lol->next) + { + assert(lol->link < numlinks, "Something wrong about link numbering!"); + + D_type_list * dtl; + dtl = (D_type_list *) malloc(sizeof(D_type_list)); + dtl->type = pp_data->domain_array[d].type; + dtl->next = dta[lol->link]; + dta[lol->link] = dtl; + } + } + return dta; +} + +/** + * Store the domain names in the linkage. These are not needed + * unless the user asks the domain names to be printed! 
+ */ +static void linkage_set_domain_names(Postprocessor *postprocessor, + Linkage linkage) +{ + if (NULL == linkage) return; + if (NULL == postprocessor) return; + if (0 == postprocessor->pp_data.N_domains) return; + + /* Copy the post-processing results over into the linkage */ + if (postprocessor->violation != NULL) return; + + D_type_list **dta = build_type_array(&postprocessor->pp_data, + linkage->num_links); + + assert(NULL == linkage->pp_domains, "Not expecting pp_domains here!"); + + linkage->pp_domains = malloc(sizeof(PP_domains) * linkage->num_links); + memset(linkage->pp_domains, 0, sizeof(PP_domains) * linkage->num_links); + + for (size_t j = 0; j < linkage->num_links; ++j) + { + D_type_list * d; + int k = 0; + for (d = dta[j]; d != NULL; d = d->next) k++; + linkage->pp_domains[j].num_domains = k; + if (k > 0) + { + linkage->pp_domains[j].domain_name = + (const char **) malloc(k * sizeof(const char *)); + } + k = 0; + for (d = dta[j]; d != NULL; d = d->next) + { + char buff[] = {d->type, '\0'}; + + linkage->pp_domains[j].domain_name[k] = + string_set_add (buff, postprocessor->string_set); + + k++; + } + } + + /* Done with the d_type_array */ + for (size_t i=0; inum_links; i++) + free_d_type(dta[i]); + free(dta); +} + +/** + * Compute linkage domain names. + * + * This assumes that post-processing has been done once, already; + * however, it re-performs post-processing a second time, because + * the data need to obtain the domain names has been lost. 
+ */ +void compute_domain_names(Linkage lkg) +{ + Postprocessor *pp = lkg->sent->postprocessor; + if (NULL == pp) return; + + Linkage_info *lifo = &lkg->lifo; + if (lifo->discarded || lifo->N_violations) + return; + + // If pp_domains is set, its been computed already + if (NULL != lkg->pp_domains) return; + + do_post_process(pp, lkg, true); + linkage_set_domain_names(pp, lkg); + post_process_free_data(&pp->pp_data); +} + +static inline bool verify_link_index(const Linkage linkage, LinkIdx index) +{ + if (!linkage) return false; + if (index >= linkage->num_links) return false; + return true; +} + +/** XXX this will not return valid data unless compute_domain_names + * has been called first. FIXME? or does this matter? + */ +int linkage_get_link_num_domains(const Linkage linkage, LinkIdx index) +{ + if (NULL == linkage->pp_domains) return -1; + if (!verify_link_index(linkage, index)) return -1; + return linkage->pp_domains[index].num_domains; +} + +/** XXX this will not return valid data unless compute_domain_names + * has been called first. FIXME? or does this matter? + */ +const char ** linkage_get_link_domain_names(const Linkage linkage, LinkIdx index) +{ + if (NULL == linkage->pp_domains) return NULL; + if (!verify_link_index(linkage, index)) return NULL; + return linkage->pp_domains[index].domain_name; +} + + +/* OLD COMMENTS (OUT OF DATE): + This file does the post-processing. + The main routine is "post_process()". It uses the link names only, + and not the connectors. + + A domain is a set of links. Each domain has a defining link. + Only certain types of links serve to define a domain. These + parameters are set by the lists of link names in a separate, + human-readable file referred to herein as the 'knowledge file.' + + The domains are nested: given two domains, either they're disjoint, + or one contains the other, i.e. they're tree structured. The set of links + in a domain (but in no smaller domain) are called the "group" of the + domain. 
Data structures are built to store all this stuff. + The tree structured property is not mathematically guaranteed by + the domain construction algorithm. Davy simply claims that because + of how he built the dictionary, the domains will always be so + structured. The program checks this and gives an error message + if it's violated. + + Define the "root word" of a link (or domain) to be the word at the + left end of the link. The other end of the defining link is called + the "right word". + + The domain corresponding to a link is defined to be the set of links + reachable by starting from the right word, following links and never + using the root word or any word to its left. + + There are some minor exceptions to this. The "restricted_link" lists + those connectors that, even if they point back before the root word, + are included in the domain. Some of the starting links are included + in their domain, these are listed in the "domain_contains_links" list. + + Such was the way it was. Now Davy tells me there should be another type + of domain that's quite different. Let's call these "urfl" domains. + Certain type of connectors start urfl domains. They're listed below. + In a urfl domain, the search includes the root word. It does a separate + search to find urfl domains. + + Restricted links should work just as they do with ordinary domains. If they + come out of the right word, or anything to the right of it (that's + in the domain), they should be included but should not be traced + further. If they come out of the root word, they should not be + included. + */ + +/* + I also, unfortunately, want to propose a new type of domain. These + would include everything that can be reached from the root word of the + link, to the right, that is closer than the right word of the link. + (They would not include the link itself.) 
+ + In the following sentence, then, the "Urfl_Only Domain" of the G link + would include only the "O" link: + + +-----G----+ + +---O--+ +-AI+ + | | | | + hitting dogs is fun.a + + In the following sentence it would include the "O", the "TT", the "I", + the second "O", and the "A". + + +----------------G---------------+ + +-----TT-----+ +-----O-----+ | + +---O---+ +-I+ +---A--+ +-AI+ + | | | | | | | | + telling people to do stupid things is fun.a + + This would allow us to judge the following: + + kicking dogs bores me + *kicking dogs kicks dogs + explaining the program is easy + *explaining the program is running + + (These are distinctions that I thought we would never be able to make, + so I told myself they were semantic rather than syntactic. But with + domains, they should be easy.) + */ + + /* Modifications, 6/96 ALB: + 1) Rules and link sets are relegated to a separate, user-written + file(s), herein referred to as the 'knowledge file' + 2) This information is read by a lexer, in pp_lexer.l (lex code) + whose exported routines are all prefixed by 'pp_lexer' + 3) when postprocessing a sentence, the links of each domain are + placed in a set for quick lookup, ('contains one' and 'contains none') + 4) Functions which were never called have been eliminated: + link_inhabits(), match_in_list(), group_type_contains(), + group_type_contains_one(), group_type_contains_all() + 5) Some 'one-by-one' initializations have been replaced by faster + block memory operations (memset etc.) + 6) The above comments are correct but incomplete! (1/97) + 7) observation: the 'contains one' is, empirically, by far the most + violated rule, so it should come first in applying the rules. + + Modifications, 9/97 ALB: + Deglobalization. Made code consistent with api. 
+ */ diff -Nru link-grammar-5.3.16/link-grammar/post-process/post-process.h link-grammar-5.5.0/link-grammar/post-process/post-process.h --- link-grammar-5.3.16/link-grammar/post-process/post-process.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/post-process/post-process.h 2018-01-22 18:48:21.000000000 +0000 @@ -0,0 +1,45 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ +/********************************************************************** + Calling paradigm: + . call post_process_new() with the handle of a knowledge set. This + returns a handle, used for all subsequent calls to post-process. + . 
Do for each sentence: + - Do for each generated linkage of a sentence: + + call post_process_scan_linkage() + - Do for each generated linkage of a sentence: + + call do_post_process() + - Call post_process_free() +***********************************************************************/ + +#ifndef _POSTPROCESS_H_ +#define _POSTPROCESS_H_ + +#include "api-types.h" +#include "link-includes.h" + +typedef struct PP_data_s PP_data; + +Postprocessor * post_process_new(pp_knowledge *); +void post_process_free(Postprocessor *); + +void post_process_lkgs(Sentence, Parse_Options); + +void do_post_process(Postprocessor *, Linkage, bool); +void post_process_free_data(PP_data * ppd); +bool post_process_match(const char *, const char *); /* utility function */ + +void compute_domain_names(Linkage); +void linkage_free_pp_domains(Linkage); + +#endif diff -Nru link-grammar-5.3.16/link-grammar/post-process/pp_knowledge.c link-grammar-5.5.0/link-grammar/post-process/pp_knowledge.c --- link-grammar-5.3.16/link-grammar/post-process/pp_knowledge.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/post-process/pp_knowledge.c 2018-01-22 20:01:13.000000000 +0000 @@ -0,0 +1,436 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +/*********************************************************************** + pp_knowledge.c + 7/97 + Contains rules and associated information for post processing. 
This + information is supplied in a human-readable file and is parsed by + pp_lexer.h +***********************************************************************/ + +#include + +#include "externs.h" +#include "pp_knowledge.h" +#include "pp_lexer.h" +#include "pp_linkset.h" +#include "string-set.h" +#include "dict-common/file-utils.h" // for dictopen + +#define D_PPK 10 /* verbosity level for this file */ +#define PP_MAX_UNIQUE_LINK_NAMES 1024 /* just needs to be approximate */ + +/****************** non-exported functions ***************************/ + +static bool check_domain_is_legal(pp_knowledge *k, const char *p) +{ + if (0x0 != p[1]) + { + prt_error("Error: File %s: Domain (%s) must be a single character\n", + k->path, p); + return false; + } + return true; +} + +static void initialize_set_of_links_starting_bounded_domain(pp_knowledge *k) +{ + int i,j,d,domain_of_rule; + k->set_of_links_starting_bounded_domain = + pp_linkset_open(PP_MAX_UNIQUE_LINK_NAMES); + for (i=0; k->bounded_rules[i].msg!=0; i++) + { + domain_of_rule = k->bounded_rules[i].domain; + for (j=0; (d=(k->starting_link_lookup_table[j].domain))!=-1; j++) + if (d==domain_of_rule) + pp_linkset_add(k->set_of_links_starting_bounded_domain, + k->starting_link_lookup_table[j].starting_link); + } +} + +/** + * Read table of [link, domain type]. + * This tells us what domain type each link belongs to. + * This lookup table *must* be defined in the knowledge file. 
+ */ +static bool read_starting_link_table(pp_knowledge *k) +{ + const char *p; + const char label[] = "STARTING_LINK_TYPE_TABLE"; + size_t i, even; + int n_tokens; + + if (!pp_lexer_set_label(k->lt, label)) + { + prt_error("Error: File %s: Couldn't find starting link table %s\n", + k->path, label); + return false; + } + + n_tokens = pp_lexer_count_tokens_of_label(k->lt); + if (-1 == n_tokens) return false; + even = n_tokens % 2; + if(0 != even) + { + prt_error("Error: Link table must have format [ ]+\n"); + return false; + } + + k->nStartingLinks = n_tokens/2; + k->starting_link_lookup_table = (StartingLinkAndDomain*) + malloc((1+k->nStartingLinks)*sizeof(StartingLinkAndDomain)); + for (i=0; inStartingLinks; i++) + { + /* read the starting link itself */ + k->starting_link_lookup_table[i].starting_link = + string_set_add(pp_lexer_get_next_token_of_label(k->lt),k->string_set); + + /* read the domain type of the link */ + p = pp_lexer_get_next_token_of_label(k->lt); + if (!check_domain_is_legal(k, p)) return false; + k->starting_link_lookup_table[i].domain = (int) p[0]; + } + + /* end sentinel */ + k->starting_link_lookup_table[k->nStartingLinks].domain = -1; + return true; +} + +static pp_linkset LINK_SET_ERROR; /* sentinel value for error */ + +static pp_linkset *read_link_set(pp_knowledge *k, + const char *label, String_set *ss) +{ + /* read link set, marked by label in knowledge file, into a set of links + whose handle is returned. Return NULL if link set not defined in file, + in which case the set is taken to be empty. 
*/ + int n_strings,i; + pp_linkset *ls; + if (!pp_lexer_set_label(k->lt, label)) + { + if (verbosity_level(+D_PPK)) + prt_error("Warning: File %s: Link set %s not defined: assuming empty\n", + k->path, label); + n_strings = 0; + } + else + { + n_strings = pp_lexer_count_tokens_of_label(k->lt); + if (-1 == n_strings) return &LINK_SET_ERROR; + } + ls = pp_linkset_open(n_strings); + for (i=0; ilt),ss)); + return ls; +} + +static bool read_link_sets(pp_knowledge *k) +{ + String_set *ss = k->string_set; /* shorthand */ + k->domain_starter_links =read_link_set(k,"DOMAIN_STARTER_LINKS",ss); + if (&LINK_SET_ERROR == k->domain_starter_links) return false; + + k->urfl_domain_starter_links=read_link_set(k,"URFL_DOMAIN_STARTER_LINKS",ss); + if (&LINK_SET_ERROR == k->urfl_domain_starter_links) return false; + + k->domain_contains_links =read_link_set(k,"DOMAIN_CONTAINS_LINKS",ss); + if (&LINK_SET_ERROR == k->domain_contains_links) return false; + + k->ignore_these_links =read_link_set(k,"IGNORE_THESE_LINKS",ss); + if (&LINK_SET_ERROR == k->ignore_these_links) return false; + + k->restricted_links =read_link_set(k,"RESTRICTED_LINKS",ss); + if (&LINK_SET_ERROR == k->domain_starter_links) return false; + + k->must_form_a_cycle_links =read_link_set(k,"MUST_FORM_A_CYCLE_LINKS",ss); + if (&LINK_SET_ERROR == k->must_form_a_cycle_links) return false; + + k->urfl_only_domain_starter_links= + read_link_set(k,"URFL_ONLY_DOMAIN_STARTER_LINKS",ss); + if (&LINK_SET_ERROR == k->urfl_only_domain_starter_links) return false; + + k->left_domain_starter_links=read_link_set(k,"LEFT_DOMAIN_STARTER_LINKS",ss); + if (&LINK_SET_ERROR == k->left_domain_starter_links) return false; + + return true; +} + +static void free_link_sets(pp_knowledge *k) +{ + pp_linkset_close(k->domain_starter_links); + pp_linkset_close(k->urfl_domain_starter_links); + pp_linkset_close(k->domain_contains_links); + pp_linkset_close(k->ignore_these_links); + pp_linkset_close(k->restricted_links); + 
pp_linkset_close(k->must_form_a_cycle_links); + pp_linkset_close(k->urfl_only_domain_starter_links); + pp_linkset_close(k->left_domain_starter_links); +} + +static bool read_form_a_cycle_rules(pp_knowledge *k, const char *label) +{ + size_t n_commas, n_tokens; + size_t r, i; + pp_linkset *lsHandle; + const char **tokens; + if (!pp_lexer_set_label(k->lt, label)) { + k->n_form_a_cycle_rules = 0; + if (verbosity_level(+D_PPK)) + prt_error("Warning: File %s: Not using any 'form a cycle' rules\n", + k->path); + } + else { + n_commas = pp_lexer_count_commas_of_label(k->lt); + k->n_form_a_cycle_rules = (n_commas + 1)/2; + } + k->form_a_cycle_rules= + (pp_rule*) malloc ((1+k->n_form_a_cycle_rules)*sizeof(pp_rule)); + for (r=0; rn_form_a_cycle_rules; r++) + { + /* read link set */ + tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); + if (n_tokens <= 0) + { + prt_error("Error: File %s: Syntax error\n", k->path); + return false; + } + lsHandle = pp_linkset_open(n_tokens); + for (i=0; istring_set)); + k->form_a_cycle_rules[r].link_set = lsHandle; + + /* read error message */ + tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); + if (n_tokens > 1) + { + prt_error("Error: File %s: Invalid syntax (rule %zu of %s)\n", + k->path, r+1,label); + return false; + } + k->form_a_cycle_rules[r].msg = string_set_add(tokens[0], k->string_set); + k->form_a_cycle_rules[r].use_count = 0; + } + + /* sentinel entry */ + k->form_a_cycle_rules[k->n_form_a_cycle_rules].msg = 0; + k->form_a_cycle_rules[k->n_form_a_cycle_rules].use_count = 0; + + return true; +} + +static bool read_bounded_rules(pp_knowledge *k, const char *label) +{ + const char **tokens; + size_t n_commas, n_tokens; + size_t r; + if (!pp_lexer_set_label(k->lt, label)) { + k->n_bounded_rules = 0; + if (verbosity_level(+D_PPK)) + prt_error("Warning: File %s: Not using any 'bounded' rules\n", k->path); + } + else { + n_commas = pp_lexer_count_commas_of_label(k->lt); + k->n_bounded_rules = 
(n_commas + 1)/2; + } + k->bounded_rules = (pp_rule*) malloc ((1+k->n_bounded_rules)*sizeof(pp_rule)); + for (r=0; rn_bounded_rules; r++) + { + /* read domain */ + tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); + if (n_tokens!=1) + { + prt_error("Error: File %s: Invalid syntax: rule %zu of %s\n", + k->path, r+1,label); + return false; + } + k->bounded_rules[r].domain = (int) tokens[0][0]; + + /* read error message */ + tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); + if (n_tokens!=1) + { + prt_error("Error: File %s: Invalid syntax: rule %zu of %s\n", + k->path, r+1,label); + return false; + } + k->bounded_rules[r].msg = string_set_add(tokens[0], k->string_set); + k->bounded_rules[r].use_count = 0; + } + + /* sentinel entry */ + k->bounded_rules[k->n_bounded_rules].msg = 0; + k->bounded_rules[k->n_bounded_rules].use_count = 0; + + return true; +} + +static bool read_contains_rules(pp_knowledge *k, const char *label, + pp_rule **rules, size_t *nRules) +{ + /* Reading the 'contains_one_rules' and reading the + 'contains_none_rules' into their respective arrays */ + size_t n_tokens, i, r; + int n_commas; + const char *p; + const char **tokens; + if (!pp_lexer_set_label(k->lt, label)) { + *nRules = 0; + if (verbosity_level(+D_PPK)) + prt_error("Warning: File %s: Not using any %s rules\n", k->path, label); + } + else { + n_commas = pp_lexer_count_commas_of_label(k->lt); + if (-1 == n_commas) return false; + *nRules = (n_commas + 1)/3; + } + *rules = (pp_rule*) malloc ((1+*nRules)*sizeof(pp_rule)); + for (r=0; r<*nRules; r++) + { + /* first read link */ + tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); + if (n_tokens > 1) + { + prt_error("Error: File %s: Invalid syntax in %s (rule %zu)\n", + k->path, label, r+1); + return false; + } + + (*rules)[r].selector = string_set_add(tokens[0], k->string_set); + + /* read link set */ + tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); + 
(*rules)[r].link_set = pp_linkset_open(n_tokens); + (*rules)[r].link_set_size = n_tokens; + (*rules)[r].link_array = (const char **) malloc((1+n_tokens)*sizeof(const char*)); + for (i=0; istring_set); + pp_linkset_add((*rules)[r].link_set, p); + (*rules)[r].link_array[i] = p; + } + (*rules)[r].link_array[i]=0; /* NULL-terminator */ + + /* read error message */ + tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); + if (n_tokens > 1) + { + prt_error("Error: File %s: Invalid syntax in %s (rule %zu)\n", + k->path, label, r+1); + return false; + } + + (*rules)[r].msg = string_set_add(tokens[0], k->string_set); + (*rules)[r].use_count = 0; + } + + /* sentinel entry */ + (*rules)[*nRules].msg = 0; + (*rules)[*nRules].use_count = 0; + + return true; +} + + +static bool read_rules(pp_knowledge *k) +{ + if (!read_form_a_cycle_rules(k, "FORM_A_CYCLE_RULES")) return false; + if (!read_bounded_rules(k, "BOUNDED_RULES")) return false; + if (!read_contains_rules(k, "CONTAINS_ONE_RULES" , + &(k->contains_one_rules), &(k->n_contains_one_rules))) + return false; + if (!read_contains_rules(k, "CONTAINS_NONE_RULES", + &(k->contains_none_rules), &(k->n_contains_none_rules))) + return false; + return true; +} + +static void free_rules(pp_knowledge *k) +{ + size_t r; + pp_rule *rule; + if (NULL != k->contains_one_rules) + { + for (r=0; k->contains_one_rules[r].msg!=0; r++) + { + rule = &(k->contains_one_rules[r]); /* shorthand */ + free(rule->link_array); + pp_linkset_close(rule->link_set); + } + for (r=0; k->contains_none_rules[r].msg!=0; r++) + { + rule = &(k->contains_none_rules[r]); /* shorthand */ + free(rule->link_array); + pp_linkset_close(rule->link_set); + } + } + + for (r = 0; r < k->n_form_a_cycle_rules; r++) + pp_linkset_close(k->form_a_cycle_rules[r].link_set); + free(k->bounded_rules); + free(k->form_a_cycle_rules); + free(k->contains_one_rules); + free(k->contains_none_rules); +} + +/********************* exported functions ***************************/ 
+ +pp_knowledge *pp_knowledge_open(const char *path) +{ + /* read knowledge from disk into pp_knowledge */ + FILE *f = dictopen(path, "r"); + if (NULL == f) + { + prt_error("Error: Couldn't find post-process knowledge file %s\n", path); + return NULL; + } + pp_knowledge *k = (pp_knowledge *) malloc (sizeof(pp_knowledge)); + *k = (pp_knowledge){0}; + k->lt = pp_lexer_open(f); + fclose(f); + if (NULL == k->lt) goto failure; + k->string_set = string_set_create(); + k->path = string_set_add(path, k->string_set); + if (!read_starting_link_table(k)) goto failure; + + if (!read_link_sets(k)) goto failure; + if (!read_rules(k)) goto failure; + initialize_set_of_links_starting_bounded_domain(k); + + /* If the knowledge file was empty, do nothing at all. */ + if (0 == k->nStartingLinks) + { + pp_knowledge_close(k); + return NULL; + } + + return k; + +failure: + prt_error("Error: Unable to open knowledge file %s.\n", path); + pp_knowledge_close(k); + return NULL; +} + +void pp_knowledge_close(pp_knowledge *k) +{ + if (!k) return; + /* clear the memory taken up by k */ + free(k->starting_link_lookup_table); + free_link_sets(k); + free_rules(k); + pp_linkset_close(k->set_of_links_starting_bounded_domain); + string_set_delete(k->string_set); + if (NULL != k->lt) pp_lexer_close(k->lt); + free(k); +} diff -Nru link-grammar-5.3.16/link-grammar/post-process/pp_knowledge.h link-grammar-5.5.0/link-grammar/post-process/pp_knowledge.h --- link-grammar-5.3.16/link-grammar/post-process/pp_knowledge.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/post-process/pp_knowledge.h 2017-05-27 12:19:39.000000000 +0000 @@ -0,0 +1,16 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. 
*/ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include "api-types.h" + +pp_knowledge *pp_knowledge_open(const char *path); +void pp_knowledge_close(pp_knowledge *knowledge); diff -Nru link-grammar-5.3.16/link-grammar/post-process/pp_lexer.c link-grammar-5.5.0/link-grammar/post-process/pp_lexer.c --- link-grammar-5.3.16/link-grammar/post-process/pp_lexer.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/post-process/pp_lexer.c 2018-04-23 19:56:34.000000000 +0000 @@ -0,0 +1,2284 @@ + +#line 3 "../../link-grammar/post-process/pp_lexer.c" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 6 +#define YY_FLEX_SUBMINOR_VERSION 1 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include +#include +#include +#include + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have . Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. 
+ */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +/* TODO: this is always defined, so inline it */ +#define yyconst const + +#if defined(__GNUC__) && __GNUC__ >= 3 +#define yynoreturn __attribute__((__noreturn__)) +#else +#define yynoreturn +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an unsigned + * integer for use as an array index. If the signed char is negative, + * we want to instead treat it as an 8-bit unsigned char, hence the + * double cast. + */ +#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c) + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. 
*/ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN yyg->yy_start = 1 + 2 * + +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START ((yyg->yy_start - 1) / 2) +#define YYSTATE YY_START + +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) + +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE yyrestart(yyin ,yyscanner ) + +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. 
+ */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + #define YY_LINENO_REWIND_TO(ptr) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = yyg->yy_hold_char; \ + YY_RESTORE_YY_MORE_OFFSET \ + yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) + +#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner ) + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + int yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. 
+ */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \ + ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \ + : NULL) + +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. 
+ */ +#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] + +void yyrestart (FILE *input_file ,yyscan_t yyscanner ); +void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +YY_BUFFER_STATE yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner ); +void yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +void yypop_buffer_state (yyscan_t yyscanner ); + +static void yyensure_buffer_stack (yyscan_t yyscanner ); +static void yy_load_buffer_state (yyscan_t yyscanner ); +static void yy_init_buffer (YY_BUFFER_STATE b,FILE *file ,yyscan_t yyscanner ); + +#define YY_FLUSH_BUFFER yy_flush_buffer(YY_CURRENT_BUFFER ,yyscanner) + +YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner ); + +void *yyalloc (yy_size_t ,yyscan_t yyscanner ); +void *yyrealloc (void *,yy_size_t ,yyscan_t yyscanner ); +void yyfree (void * ,yyscan_t yyscanner ); + +#define yy_new_buffer yy_create_buffer + +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + yyensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } + +#define yy_set_bol(at_bol) \ + { \ + if ( ! 
YY_CURRENT_BUFFER ){\ + yyensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } + +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +typedef unsigned char YY_CHAR; + +typedef int yy_state_type; + +#define yytext_ptr yytext_r + +static yy_state_type yy_get_previous_state (yyscan_t yyscanner ); +static yy_state_type yy_try_NUL_trans (yy_state_type current_state ,yyscan_t yyscanner); +static int yy_get_next_buffer (yyscan_t yyscanner ); +static void yynoreturn yy_fatal_error (yyconst char* msg ,yyscan_t yyscanner ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + yyg->yytext_ptr = yy_bp; \ + yyleng = (int) (yy_cp - yy_bp); \ + yyg->yy_hold_char = *yy_cp; \ + *yy_cp = '\0'; \ + yyg->yy_c_buf_p = yy_cp; + +#define YY_NUM_RULES 8 +#define YY_END_OF_BUFFER 9 +/* This struct is not used in this scanner, + but its presence is necessary. 
*/ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static yyconst flex_int16_t yy_accept[28] = + { 0, + 2, 2, 2, 2, 9, 5, 2, 2, 7, 7, + 5, 2, 5, 7, 5, 3, 2, 0, 4, 2, + 0, 5, 0, 1, 3, 6, 0 + } ; + +static yyconst YY_CHAR yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 1, 5, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 6, 7, 1, + 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static yyconst YY_CHAR yy_meta[9] = + { 0, + 1, 2, 2, 3, 4, 5, 1, 1 + } ; + +static yyconst flex_uint16_t yy_base[32] = + { 0, + 0, 7, 14, 21, 18, 11, 0, 0, 0, 66, + 10, 27, 34, 42, 7, 66, 0, 7, 4, 0, + 0, 0, 0, 66, 0, 66, 66, 50, 54, 57, + 60 + } ; + +static yyconst flex_int16_t yy_def[32] = + { 0, + 27, 1, 1, 3, 27, 28, 29, 29, 30, 27, + 31, 27, 27, 13, 28, 27, 29, 30, 31, 12, + 14, 13, 14, 27, 14, 27, 0, 27, 27, 27, + 27 + } ; + +static yyconst flex_uint16_t yy_nxt[75] = + { 0, + 6, 7, 8, 7, 9, 10, 6, 11, 12, 16, + 12, 26, 16, 13, 10, 16, 16, 27, 10, 27, + 10, 10, 12, 27, 12, 27, 27, 14, 20, 17, + 20, 27, 27, 21, 22, 23, 24, 23, 23, 25, + 22, 22, 23, 27, 27, 27, 27, 23, 23, 23, + 15, 27, 27, 27, 15, 17, 17, 18, 27, 18, + 19, 27, 27, 27, 19, 5, 27, 27, 27, 27, + 27, 27, 27, 27 + } ; + +static yyconst flex_int16_t yy_chk[75] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 19, + 2, 18, 15, 2, 
3, 11, 6, 5, 3, 0, + 3, 3, 4, 0, 4, 0, 0, 4, 12, 12, + 12, 0, 0, 12, 13, 13, 13, 13, 13, 13, + 13, 13, 14, 0, 0, 0, 0, 14, 14, 14, + 28, 0, 0, 0, 28, 29, 29, 30, 0, 30, + 31, 0, 0, 0, 31, 27, 27, 27, 27, 27, + 27, 27, 27, 27 + } ; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +#line 1 "../../link-grammar/post-process/pp_lexer.l" +#line 2 "../../link-grammar/post-process/pp_lexer.l" +/************************************************************************** + Lex specification for post-process knowledge file + + 6/96 ALB + Possible areas of improvement: + . allow sets to be referred to anywhere in file, not just + after they are defined. (but then you need to be careful + about infinite loops involving recursively defined sets) + +**************************************************************************/ + +#include "pp_lexer.h" +#include "pp-structures.h" +#include "string-set.h" +#include "utilities.h" + +#define YY_NO_INPUT +#define YY_NO_UNPUT + +/* forward references for non-exported functions */ +static void setup(PPLexTable *lt); +static bool set_label(PPLexTable *lt, const char *label); +static bool add_string_to_label(PPLexTable *lt, const char *str); +static bool add_set_of_strings_to_label(PPLexTable *lt,const char *label_of_set); +static int get_index_of_label(PPLexTable *lt, const char *label); + +/* ptr to lex table we're currently filling in */ +#define YY_EXTRA_TYPE PPLexTable* + +#line 478 "../../link-grammar/post-process/pp_lexer.c" + +#define INITIAL 0 +#define INCLUDE 1 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. 
+ */ +#include +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +/* Holds the entire state of the reentrant scanner. */ +struct yyguts_t + { + + /* User-defined. Not touched by flex. */ + YY_EXTRA_TYPE yyextra_r; + + /* The rest are the same as the globals declared in the non-reentrant scanner. */ + FILE *yyin_r, *yyout_r; + size_t yy_buffer_stack_top; /**< index of top of stack. */ + size_t yy_buffer_stack_max; /**< capacity of stack. */ + YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */ + char yy_hold_char; + int yy_n_chars; + int yyleng_r; + char *yy_c_buf_p; + int yy_init; + int yy_start; + int yy_did_buffer_switch_on_eof; + int yy_start_stack_ptr; + int yy_start_stack_depth; + int *yy_start_stack; + yy_state_type yy_last_accepting_state; + char* yy_last_accepting_cpos; + + int yylineno_r; + int yy_flex_debug_r; + + char *yytext_r; + int yy_more_flag; + int yy_more_len; + + }; /* end struct yyguts_t */ + +static int yy_init_globals (yyscan_t yyscanner ); + +int yylex_init (yyscan_t* scanner); + +int yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. 
*/ + +int yylex_destroy (yyscan_t yyscanner ); + +int yyget_debug (yyscan_t yyscanner ); + +void yyset_debug (int debug_flag ,yyscan_t yyscanner ); + +YY_EXTRA_TYPE yyget_extra (yyscan_t yyscanner ); + +void yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner ); + +FILE *yyget_in (yyscan_t yyscanner ); + +void yyset_in (FILE * _in_str ,yyscan_t yyscanner ); + +FILE *yyget_out (yyscan_t yyscanner ); + +void yyset_out (FILE * _out_str ,yyscan_t yyscanner ); + + int yyget_leng (yyscan_t yyscanner ); + +char *yyget_text (yyscan_t yyscanner ); + +int yyget_lineno (yyscan_t yyscanner ); + +void yyset_lineno (int _line_number ,yyscan_t yyscanner ); + +int yyget_column (yyscan_t yyscanner ); + +void yyset_column (int _column_no ,yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap (yyscan_t yyscanner ); +#else +extern int yywrap (yyscan_t yyscanner ); +#endif +#endif + +#ifndef YY_NO_UNPUT + + static void yyunput (int c,char *buf_ptr ,yyscan_t yyscanner); + +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT + +#ifdef __cplusplus +static int yyinput (yyscan_t yyscanner ); +#else +static int input (yyscan_t yyscanner ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). 
+ */ +#define ECHO do { if (fwrite( yytext, (size_t) yyleng, 1, yyout )) {} } while (0) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". + */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + size_t n; \ + for ( n = 0; n < max_size && \ + (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = (int) fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (yyscan_t yyscanner); + +#define YY_DECL int yylex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. 
*/ +#ifndef YY_BREAK +#define YY_BREAK /*LINTED*/break; +#endif + +#define YY_RULE_SETUP \ + if ( yyleng > 0 ) \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = \ + (yytext[yyleng - 1] == '\n'); \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + yy_state_type yy_current_state; + char *yy_cp, *yy_bp; + int yy_act; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( !yyg->yy_init ) + { + yyg->yy_init = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! yyg->yy_start ) + yyg->yy_start = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + yyensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); + } + + yy_load_buffer_state(yyscanner ); + } + + { +#line 43 "../../link-grammar/post-process/pp_lexer.l" + + +#line 746 "../../link-grammar/post-process/pp_lexer.c" + + while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */ + { + yy_cp = yyg->yy_c_buf_p; + + /* Support of yytext. */ + *yy_cp = yyg->yy_hold_char; + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. 
+ */ + yy_bp = yy_cp; + + yy_current_state = yyg->yy_start; + yy_current_state += YY_AT_BOL(); +yy_match: + do + { + YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 28 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (flex_int16_t) yy_c]; + ++yy_cp; + } + while ( yy_base[yy_current_state] != 66 ); + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + if ( yy_act == 0 ) + { /* have to back up */ + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + yy_act = yy_accept[yy_current_state]; + } + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. */ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = yyg->yy_hold_char; + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + +case 1: +*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */ +yyg->yy_c_buf_p = yy_cp -= 1; +YY_DO_BEFORE_ACTION; /* set up yytext again */ +YY_RULE_SETUP +#line 45 "../../link-grammar/post-process/pp_lexer.l" +; + YY_BREAK +case 2: +/* rule 2 can match eol */ +YY_RULE_SETUP +#line 46 "../../link-grammar/post-process/pp_lexer.l" +; + YY_BREAK +case 3: +YY_RULE_SETUP +#line 48 "../../link-grammar/post-process/pp_lexer.l" +{ set_label(yyextra, yytext); } + YY_BREAK +case 4: +YY_RULE_SETUP +#line 49 "../../link-grammar/post-process/pp_lexer.l" +{ add_set_of_strings_to_label(yyextra, &(yytext[1])); } + YY_BREAK +case 5: +YY_RULE_SETUP +#line 50 "../../link-grammar/post-process/pp_lexer.l" +{ add_string_to_label(yyextra, 
yytext); } + YY_BREAK +case 6: +YY_RULE_SETUP +#line 51 "../../link-grammar/post-process/pp_lexer.l" +{ add_string_to_label(yyextra, yytext); } + YY_BREAK +case 7: +YY_RULE_SETUP +#line 53 "../../link-grammar/post-process/pp_lexer.l" +{ prt_error("pp_lexer: unable to parse knowledge file (line %i).\n", yylineno); } + YY_BREAK +case 8: +YY_RULE_SETUP +#line 56 "../../link-grammar/post-process/pp_lexer.l" +ECHO; + YY_BREAK +#line 848 "../../link-grammar/post-process/pp_lexer.c" +case YY_STATE_EOF(INITIAL): +case YY_STATE_EOF(INCLUDE): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = yyg->yy_hold_char; + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + { /* This was really a NUL. 
*/ + yy_state_type yy_next_state; + + yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner); + + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++yyg->yy_c_buf_p; + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = yyg->yy_c_buf_p; + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_END_OF_FILE: + { + yyg->yy_did_buffer_switch_on_eof = 0; + + if ( yywrap(yyscanner ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! 
yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = + yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + yyg->yy_c_buf_p = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars]; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ + } /* end of user's declarations */ +} /* end of yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + char *source = yyg->yytext_ptr; + int number_to_move, i; + int ret_val; + + if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. 
*/ + number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr - 1); + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0; + + else + { + int num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE; + + int yy_c_buf_p_offset = + (int) (yyg->yy_c_buf_p - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = NULL; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. 
*/ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + yyg->yy_n_chars, num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + if ( yyg->yy_n_chars == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + yyrestart(yyin ,yyscanner); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if ((yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + int new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + } + + yyg->yy_n_chars += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR; + + yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (yyscan_t yyscanner) +{ + yy_state_type yy_current_state; + char *yy_cp; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_current_state = yyg->yy_start; + yy_current_state += YY_AT_BOL(); + + for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp ) + { + YY_CHAR yy_c = (*yy_cp ? 
yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 28 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (flex_int16_t) yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner) +{ + int yy_is_jam; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */ + char *yy_cp = yyg->yy_c_buf_p; + + YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 28 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (flex_int16_t) yy_c]; + yy_is_jam = (yy_current_state == 27); + + (void)yyg; + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_UNPUT + + static void yyunput (int c, char * yy_bp , yyscan_t yyscanner) +{ + char *yy_cp; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_cp = yyg->yy_c_buf_p; + + /* undo effects of setting up yytext */ + *yy_cp = yyg->yy_hold_char; + + if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) + { /* need to shift things up to make room */ + /* +2 for EOB chars. 
*/ + int number_to_move = yyg->yy_n_chars + 2; + char *dest = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[ + YY_CURRENT_BUFFER_LVALUE->yy_buf_size + 2]; + char *source = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]; + + while ( source > YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + *--dest = *--source; + + yy_cp += (int) (dest - source); + yy_bp += (int) (dest - source); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = + yyg->yy_n_chars = (int) YY_CURRENT_BUFFER_LVALUE->yy_buf_size; + + if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) + YY_FATAL_ERROR( "flex scanner push-back overflow" ); + } + + *--yy_cp = (char) c; + + yyg->yytext_ptr = yy_bp; + yyg->yy_hold_char = *yy_cp; + yyg->yy_c_buf_p = yy_cp; +} + +#endif + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (yyscan_t yyscanner) +#else + static int input (yyscan_t yyscanner) +#endif + +{ + int c; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + *yyg->yy_c_buf_p = yyg->yy_hold_char; + + if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + /* This was really a NUL. */ + *yyg->yy_c_buf_p = '\0'; + + else + { /* need more input */ + int offset = yyg->yy_c_buf_p - yyg->yytext_ptr; + ++yyg->yy_c_buf_p; + + switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + yyrestart(yyin ,yyscanner); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( yywrap(yyscanner ) ) + return 0; + + if ( ! 
yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(yyscanner); +#else + return input(yyscanner); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = yyg->yytext_ptr + offset; + break; + } + } + } + + c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */ + *yyg->yy_c_buf_p = '\0'; /* preserve yytext */ + yyg->yy_hold_char = *++yyg->yy_c_buf_p; + + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = (c == '\n'); + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * @param yyscanner The scanner object. + * @note This function does not reset the start condition to @c INITIAL . + */ + void yyrestart (FILE * input_file , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! YY_CURRENT_BUFFER ){ + yyensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); + } + + yy_init_buffer(YY_CURRENT_BUFFER,input_file ,yyscanner); + yy_load_buffer_state(yyscanner ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * @param yyscanner The scanner object. + */ + void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* TODO. We should be able to replace this entire function body + * with + * yypop_buffer_state(); + * yypush_buffer_state(new_buffer); + */ + yyensure_buffer_stack (yyscanner); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. 
*/ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + yy_load_buffer_state(yyscanner ); + + /* We don't actually know whether we did this switch during + * EOF (yywrap()) processing, but the only time this flag + * is looked at is after yywrap() is called, so it's safe + * to go ahead and always set it. + */ + yyg->yy_did_buffer_switch_on_eof = 1; +} + +static void yy_load_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + yyg->yy_hold_char = *yyg->yy_c_buf_p; +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * @param yyscanner The scanner object. + * @return the allocated buffer state. + */ + YY_BUFFER_STATE yy_create_buffer (FILE * file, int size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ,yyscanner ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_buf_size = (yy_size_t)size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2 ,yyscanner ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + yy_init_buffer(b,file ,yyscanner); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with yy_create_buffer() + * @param yyscanner The scanner object. 
+ */ + void yy_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + yyfree((void *) b->yy_ch_buf ,yyscanner ); + + yyfree((void *) b ,yyscanner ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a yyrestart() or at EOF. + */ + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file , yyscan_t yyscanner) + +{ + int oerrno = errno; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_flush_buffer(b ,yyscanner); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then yy_init_buffer was _probably_ + * called from yyrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = file ? (isatty( fileno(file) ) > 0) : 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * @param yyscanner The scanner object. + */ + void yy_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. 
+ */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + yy_load_buffer_state(yyscanner ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * @param yyscanner The scanner object. + */ +void yypush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (new_buffer == NULL) + return; + + yyensure_buffer_stack(yyscanner); + + /* This block is copied from yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + yyg->yy_buffer_stack_top++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from yy_switch_to_buffer. */ + yy_load_buffer_state(yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * @param yyscanner The scanner object. + */ +void yypop_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (!YY_CURRENT_BUFFER) + return; + + yy_delete_buffer(YY_CURRENT_BUFFER ,yyscanner); + YY_CURRENT_BUFFER_LVALUE = NULL; + if (yyg->yy_buffer_stack_top > 0) + --yyg->yy_buffer_stack_top; + + if (YY_CURRENT_BUFFER) { + yy_load_buffer_state(yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. 
+ */ +static void yyensure_buffer_stack (yyscan_t yyscanner) +{ + int num_to_alloc; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (!yyg->yy_buffer_stack) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; /* After all that talk, this was set to 1 anyways... */ + yyg->yy_buffer_stack = (struct yy_buffer_state**)yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + yyg->yy_buffer_stack_max = num_to_alloc; + yyg->yy_buffer_stack_top = 0; + return; + } + + if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + yy_size_t grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = yyg->yy_buffer_stack_max + grow_size; + yyg->yy_buffer_stack = (struct yy_buffer_state**)yyrealloc + (yyg->yy_buffer_stack, + num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*)); + yyg->yy_buffer_stack_max = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. 
+ */ +YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return NULL; + + b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ,yyscanner ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" ); + + b->yy_buf_size = size - 2; /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = NULL; + b->yy_n_chars = b->yy_buf_size; + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + yy_switch_to_buffer(b ,yyscanner ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to yylex() will + * scan from a @e copy of @a str. + * @param yystr a NUL-terminated string to scan + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * yy_scan_bytes() instead. + */ +YY_BUFFER_STATE yy_scan_string (yyconst char * yystr , yyscan_t yyscanner) +{ + + return yy_scan_bytes(yystr,(int) strlen(yystr) ,yyscanner); +} + +/** Setup the input buffer state to scan the given bytes. The next call to yylex() will + * scan from a @e copy of @a bytes. + * @param yybytes the byte buffer to scan + * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes. + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_bytes (yyconst char * yybytes, int _yybytes_len , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + int i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = (yy_size_t) (_yybytes_len + 2); + buf = (char *) yyalloc(n ,yyscanner ); + if ( ! 
buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" ); + + for ( i = 0; i < _yybytes_len; ++i ) + buf[i] = yybytes[i]; + + buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; + + b = yy_scan_buffer(buf,n ,yyscanner); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. + */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yynoreturn yy_fatal_error (yyconst char* msg , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + (void) fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + yytext[yyleng] = yyg->yy_hold_char; \ + yyg->yy_c_buf_p = yytext + yyless_macro_arg; \ + yyg->yy_hold_char = *yyg->yy_c_buf_p; \ + *yyg->yy_c_buf_p = '\0'; \ + yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the user-defined data for this scanner. + * @param yyscanner The scanner object. + */ +YY_EXTRA_TYPE yyget_extra (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyextra; +} + +/** Get the current line number. + * @param yyscanner The scanner object. + */ +int yyget_lineno (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! YY_CURRENT_BUFFER) + return 0; + + return yylineno; +} + +/** Get the current column number. + * @param yyscanner The scanner object. + */ +int yyget_column (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! YY_CURRENT_BUFFER) + return 0; + + return yycolumn; +} + +/** Get the input stream. 
+ * @param yyscanner The scanner object. + */ +FILE *yyget_in (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyin; +} + +/** Get the output stream. + * @param yyscanner The scanner object. + */ +FILE *yyget_out (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyout; +} + +/** Get the length of the current token. + * @param yyscanner The scanner object. + */ +int yyget_leng (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyleng; +} + +/** Get the current token. + * @param yyscanner The scanner object. + */ + +char *yyget_text (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yytext; +} + +/** Set the user-defined data. This data is never touched by the scanner. + * @param user_defined The data to be associated with this scanner. + * @param yyscanner The scanner object. + */ +void yyset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyextra = user_defined ; +} + +/** Set the current line number. + * @param _line_number line number + * @param yyscanner The scanner object. + */ +void yyset_lineno (int _line_number , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* lineno is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + YY_FATAL_ERROR( "yyset_lineno called with no buffer" ); + + yylineno = _line_number; +} + +/** Set the current column. + * @param _column_no column number + * @param yyscanner The scanner object. + */ +void yyset_column (int _column_no , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* column is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + YY_FATAL_ERROR( "yyset_column called with no buffer" ); + + yycolumn = _column_no; +} + +/** Set the input stream. This does not discard the current + * input buffer. 
+ * @param _in_str A readable stream. + * @param yyscanner The scanner object. + * @see yy_switch_to_buffer + */ +void yyset_in (FILE * _in_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyin = _in_str ; +} + +void yyset_out (FILE * _out_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyout = _out_str ; +} + +int yyget_debug (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yy_flex_debug; +} + +void yyset_debug (int _bdebug , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yy_flex_debug = _bdebug ; +} + +/* Accessor methods for yylval and yylloc */ + +/* User-visible API */ + +/* yylex_init is special because it creates the scanner itself, so it is + * the ONLY reentrant function that doesn't take the scanner as the last argument. + * That's why we explicitly handle the declaration, instead of using our macros. + */ + +int yylex_init(yyscan_t* ptr_yy_globals) + +{ + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), NULL ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + return yy_init_globals ( *ptr_yy_globals ); +} + +/* yylex_init_extra has the same functionality as yylex_init, but follows the + * convention of taking the scanner as the last argument. Note however, that + * this is a *pointer* to a scanner, as it will be allocated by this call (and + * is the reason, too, why this function also must handle its own declaration). + * The user defined value in the first argument will be available to yyalloc in + * the yyextra field. 
+ */ + +int yylex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_globals ) + +{ + struct yyguts_t dummy_yyguts; + + yyset_extra (yy_user_defined, &dummy_yyguts); + + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), &dummy_yyguts ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in + yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + yyset_extra (yy_user_defined, *ptr_yy_globals); + + return yy_init_globals ( *ptr_yy_globals ); +} + +static int yy_init_globals (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from yylex_destroy(), so don't allocate here. + */ + + yyg->yy_buffer_stack = NULL; + yyg->yy_buffer_stack_top = 0; + yyg->yy_buffer_stack_max = 0; + yyg->yy_c_buf_p = NULL; + yyg->yy_init = 0; + yyg->yy_start = 0; + + yyg->yy_start_stack_ptr = 0; + yyg->yy_start_stack_depth = 0; + yyg->yy_start_stack = NULL; + +/* Defined in main.c */ +#ifdef YY_STDINIT + yyin = stdin; + yyout = stdout; +#else + yyin = NULL; + yyout = NULL; +#endif + + /* For future reference: Set errno on error, since we are called by + * yylex_init() + */ + return 0; +} + +/* yylex_destroy is for both reentrant and non-reentrant scanners. */ +int yylex_destroy (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + yy_delete_buffer(YY_CURRENT_BUFFER ,yyscanner ); + YY_CURRENT_BUFFER_LVALUE = NULL; + yypop_buffer_state(yyscanner); + } + + /* Destroy the stack itself. */ + yyfree(yyg->yy_buffer_stack ,yyscanner); + yyg->yy_buffer_stack = NULL; + + /* Destroy the start condition stack. 
*/ + yyfree(yyg->yy_start_stack ,yyscanner ); + yyg->yy_start_stack = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * yylex() is called, initialization will occur. */ + yy_init_globals( yyscanner); + + /* Destroy the main struct (reentrant only). */ + yyfree ( yyscanner , yyscanner ); + yyscanner = NULL; + return 0; +} + +/* + * Internal utility routines. + */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, yyconst char * s2, int n , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + + int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * s , yyscan_t yyscanner) +{ + int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *yyalloc (yy_size_t size , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + return malloc(size); +} + +void *yyrealloc (void * ptr, yy_size_t size , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. 
+ */ + return realloc(ptr, size); +} + +void yyfree (void * ptr , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + free( (char *) ptr ); /* see yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +#line 56 "../../link-grammar/post-process/pp_lexer.l" + + + +/************************ exported functions ******************************/ + +void pp_lexer_close(PPLexTable *lt) +{ + int i; + pp_label_node *node,*next; + for (i=0; inodes_of_label[i]; + while (node) + { + next = node->next; + free(node); + node=next; + } + } + string_set_delete(lt->string_set); + if (lt->tokens) free(lt->tokens); + yylex_destroy(lt->scanner); + free(lt); +} + +PPLexTable *pp_lexer_open(FILE *f) +{ + PPLexTable *lt; + bool yylex_ok = false; + + assert(f, "pp_lexer_open: passed a NULL file pointer"); + + lt = (PPLexTable*) malloc (sizeof(PPLexTable)); + setup(lt); + yylex_init(<->scanner); + yyset_extra(lt,lt->scanner); + + struct yyguts_t * yyg = (struct yyguts_t*) lt->scanner; + yyin = f; /* redirect lex to look at the specified file */ + + if (YY_NULL == yylex(lt->scanner)) yylex_ok = true; + lt->idx_of_active_label=-1; + + if (!yylex_ok) + { + pp_lexer_close(lt); + return NULL; + } + return lt; +} + +int pp_lexer_set_label(PPLexTable *lt, const char *label) +{ + /* set lexer state to first node of this label */ + lt->idx_of_active_label = get_index_of_label(lt, label); + if (lt->idx_of_active_label==-1) return 0; /* label not found */ + lt->current_node_of_active_label=lt->nodes_of_label[lt->idx_of_active_label]; + return 1; +} + +int pp_lexer_count_tokens_of_label(PPLexTable *lt) +{ + /* counts all tokens, even the commas */ + int n; + pp_label_node *p; + if (lt->idx_of_active_label == -1) + { + prt_error("Error: pp_lexer: current label is invalid\n"); + return -1; + } + for (n=0, p=lt->nodes_of_label[lt->idx_of_active_label]; p;p=p->next, n++){} + return n; +} + + +/** Retrieves next token of set label, or NULL if list 
exhausted. */ +const char *pp_lexer_get_next_token_of_label(PPLexTable *lt) +{ + if (lt->current_node_of_active_label == NULL) return NULL; + const char *p = lt->current_node_of_active_label->str; + lt->current_node_of_active_label = lt->current_node_of_active_label->next; + return p; +} + +int pp_lexer_count_commas_of_label(PPLexTable *lt) +{ + int n; + pp_label_node *p; + if (lt->idx_of_active_label == -1) + { + prt_error("Error: pp_lexer: current label is invalid\n"); + return -1; + } + for (n=0,p=lt->nodes_of_label[lt->idx_of_active_label];p!=NULL;p=p->next) + if (!strcmp(p->str, ",")) n++; + return n; +} + +const char **pp_lexer_get_next_group_of_tokens_of_label(PPLexTable *lt, size_t *n_tokens) +{ + /* all tokens until next comma, null-terminated */ + int n; + pp_label_node *p; + + p = lt->current_node_of_active_label; + for (n=0; p!=NULL && strcmp(p->str,","); n++, p=p->next) {} + if (n > lt->extents) { + lt->extents = n; + lt->tokens = (const char **) realloc (lt->tokens, n * sizeof(const char*)); + } + + p = lt->current_node_of_active_label; + for (n=0; p!=NULL && strcmp(p->str,","); n++, p=p->next) + lt->tokens[n] = string_set_add(p->str, lt->string_set); + + /* advance "current node of label" state */ + lt->current_node_of_active_label = p; + if (p!=NULL) lt->current_node_of_active_label = p->next; + + *n_tokens = n; + return lt->tokens; +} + +int yywrap(void* foo) +{ + /* must return 1 for end of input, 0 otherwise */ + return 1; +} + +/********************** non-exported functions ************************/ + +static void setup(PPLexTable *lt) +{ + int i; + for (i=0; inodes_of_label[i] = NULL; + lt->last_node_of_label[i] = NULL; + lt->labels[i] = NULL; + } + lt->string_set = string_set_create(); + lt->tokens = NULL; + lt->extents = 0; +} + +static bool set_label(PPLexTable *lt, const char *label) +{ + int i; + char *c; + char *label_sans_colon; + + /* check for and then slice off the trailing colon */ + label_sans_colon = strdupa(label); + 
c=&(label_sans_colon[strlen(label_sans_colon)-1]); + if (*c != ':') + { + prt_error("Error: Label %s must end with :\n", label); + return false; + } + *c = 0; + + /* have we seen this label already? If so, abort */ + for (i=0;lt->labels[i]!=NULL && strcmp(lt->labels[i],label_sans_colon);i++) {} + if (lt->labels[i]!=NULL) + { + prt_error("Error: pp_lexer: label %s multiply defined!\n", label_sans_colon); + return false; + } + + /* new label. Store it */ + if (i == PP_LEXER_MAX_LABELS-1) + { + prt_error("Error: pp_lexer: too many labels. Raise PP_LEXER_MAX_LABELS\n"); + return false; + } + lt->labels[i] = string_set_add(label_sans_colon, lt->string_set); + lt->idx_of_active_label = i; + + return true; +} + +static bool check_string(const char *str) +{ + if (strlen(str)>1 && strchr(str, ',')!=NULL) + { + prt_error("Error: pp_lexer: string %s contains a comma, which is a no-no.\n", str); + return false; + } + return true; +} + + +static bool add_string_to_label(PPLexTable *lt, const char *str) +{ + struct yyguts_t * yyg = (struct yyguts_t*) lt->scanner; + + /* add the single string str to the set of strings associated with label */ + pp_label_node *new_node; + + if (lt->idx_of_active_label == -1) + { + prt_error("Error: pp_lexer: invalid syntax (line %i)\n", yylineno); + return false; + } + + /* make sure string is legal */ + if (!check_string(str)) return false; + + /* create a new node in (as yet to be determined) linked list of strings */ + new_node = (pp_label_node *) malloc (sizeof(pp_label_node)); + new_node->str = string_set_add(str, lt->string_set); + new_node->next = NULL; + + /* stick newly-created node at the *end* of the appropriate linked list */ + if (lt->last_node_of_label[lt->idx_of_active_label]==NULL) + { + /* first entry on linked list */ + lt->nodes_of_label[lt->idx_of_active_label] = new_node; + lt->last_node_of_label[lt->idx_of_active_label] = new_node; + } + else + { + /* non-first entry on linked list */ + 
lt->last_node_of_label[lt->idx_of_active_label]->next = new_node; + lt->last_node_of_label[lt->idx_of_active_label] = new_node; + } + + return true; +} + +static bool add_set_of_strings_to_label(PPLexTable *lt,const char *label_of_set) +{ + struct yyguts_t * yyg = (struct yyguts_t*) lt->scanner; + + /* add the set of strings, defined earlier by label_of_set, to the set of + strings associated with the current label */ + pp_label_node *p; + int idx_of_label_of_set; + if (lt->idx_of_active_label==-1) + { + prt_error("Error: pp_lexer: invalid syntax (line %i)\n", yylineno); + return false; + } + if ((idx_of_label_of_set = get_index_of_label(lt, label_of_set))==-1) + { + prt_error("Error: pp_lexer: label %s must be defined before it's referred to (line %i)\n", + label_of_set, yylineno); + return false; + } + for (p=lt->nodes_of_label[idx_of_label_of_set]; p!=NULL; p=p->next) + add_string_to_label(lt, p->str); + + return true; +} + +static int get_index_of_label(PPLexTable *lt, const char *label) +{ + int i; + for (i=0; lt->labels[i]!=NULL; i++) + if (!strcmp(lt->labels[i], label)) return i; + return -1; +} + diff -Nru link-grammar-5.3.16/link-grammar/post-process/pp_lexer.h link-grammar-5.5.0/link-grammar/post-process/pp_lexer.h --- link-grammar-5.3.16/link-grammar/post-process/pp_lexer.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/post-process/pp_lexer.h 2018-01-11 08:47:32.000000000 +0000 @@ -0,0 +1,43 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#include +#include "pp-structures.h" + +typedef struct pp_label_node_s +{ + /* linked list of strings associated with a label in the table */ + const char *str; + struct pp_label_node_s *next; +} pp_label_node; /* next=NULL: end of list */ + +#define PP_LEXER_MAX_LABELS 512 +struct PPLexTable_s +{ + void *scanner; /* Reentrant lexer structure */ + String_set *string_set; + const char *labels[PP_LEXER_MAX_LABELS]; /* array of labels */ + pp_label_node *nodes_of_label[PP_LEXER_MAX_LABELS]; /*str. for each label*/ + pp_label_node *last_node_of_label[PP_LEXER_MAX_LABELS]; /* efficiency */ + pp_label_node *current_node_of_active_label;/* state: curr node of label */ + int idx_of_active_label; /* read state: current label */ + const char **tokens; + int extents; +}; + +PPLexTable *pp_lexer_open (FILE *f); +void pp_lexer_close (PPLexTable *lt); +int pp_lexer_set_label (PPLexTable *lt, const char *label); +int pp_lexer_count_tokens_of_label (PPLexTable *lt); +const char *pp_lexer_get_next_token_of_label(PPLexTable *lt); +int pp_lexer_count_commas_of_label (PPLexTable *lt); +const char **pp_lexer_get_next_group_of_tokens_of_label(PPLexTable *lt, size_t *n_toks); diff -Nru link-grammar-5.3.16/link-grammar/post-process/pp_lexer.l link-grammar-5.5.0/link-grammar/post-process/pp_lexer.l --- link-grammar-5.3.16/link-grammar/post-process/pp_lexer.l 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/post-process/pp_lexer.l 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,317 @@ +%{ +/************************************************************************** + Lex specification for post-process knowledge file + + 6/96 ALB + Possible areas of improvement: + . allow sets to be referred to anywhere in file, not just + after they are defined. 
(but then you need to be careful + about infinite loops involving recursively defined sets) + +**************************************************************************/ + +#include "pp_lexer.h" +#include "pp-structures.h" +#include "string-set.h" +#include "utilities.h" + +#define YY_NO_INPUT +#define YY_NO_UNPUT + +/* forward references for non-exported functions */ +static void setup(PPLexTable *lt); +static bool set_label(PPLexTable *lt, const char *label); +static bool add_string_to_label(PPLexTable *lt, const char *str); +static bool add_set_of_strings_to_label(PPLexTable *lt,const char *label_of_set); +static int get_index_of_label(PPLexTable *lt, const char *label); + +/* ptr to lex table we're currently filling in */ +#define YY_EXTRA_TYPE PPLexTable* +%} + +%option reentrant + +%s INCLUDE + +LABEL [^ \t\"\n:]+: +NONLABELSET @[^ \t\n\":]+ +NONLABEL [^ \t\"\n:]+ +NONLABELQUOTED \"[^\t\"\n:]+\" +WHITE [ \t\n]* +COMMENT ^[\ \t]*;.*$ + +%% + +{COMMENT} ; +{WHITE} ; + +{LABEL} { set_label(yyextra, yytext); } +{NONLABELSET} { add_set_of_strings_to_label(yyextra, &(yytext[1])); } +{NONLABEL} { add_string_to_label(yyextra, yytext); } +{NONLABELQUOTED} { add_string_to_label(yyextra, yytext); } + +. 
{ prt_error("pp_lexer: unable to parse knowledge file (line %i).\n", yylineno); } + + +%% + +/************************ exported functions ******************************/ + +void pp_lexer_close(PPLexTable *lt) +{ + int i; + pp_label_node *node,*next; + for (i=0; inodes_of_label[i]; + while (node) + { + next = node->next; + free(node); + node=next; + } + } + string_set_delete(lt->string_set); + if (lt->tokens) free(lt->tokens); + yylex_destroy(lt->scanner); + free(lt); +} + +PPLexTable *pp_lexer_open(FILE *f) +{ + PPLexTable *lt; + bool yylex_ok = false; + + assert(f, "pp_lexer_open: passed a NULL file pointer"); + + lt = (PPLexTable*) malloc (sizeof(PPLexTable)); + setup(lt); + yylex_init(<->scanner); + yyset_extra(lt, lt->scanner); + + struct yyguts_t * yyg = (struct yyguts_t*) lt->scanner; + yyin = f; /* redirect lex to look at the specified file */ + + if (YY_NULL == yylex(lt->scanner)) yylex_ok = true; + lt->idx_of_active_label=-1; + + if (!yylex_ok) + { + pp_lexer_close(lt); + return NULL; + } + return lt; +} + +int pp_lexer_set_label(PPLexTable *lt, const char *label) +{ + /* set lexer state to first node of this label */ + lt->idx_of_active_label = get_index_of_label(lt, label); + if (lt->idx_of_active_label==-1) return 0; /* label not found */ + lt->current_node_of_active_label=lt->nodes_of_label[lt->idx_of_active_label]; + return 1; +} + +int pp_lexer_count_tokens_of_label(PPLexTable *lt) +{ + /* counts all tokens, even the commas */ + int n; + pp_label_node *p; + if (lt->idx_of_active_label == -1) + { + prt_error("Error: pp_lexer: current label is invalid\n"); + return -1; + } + for (n=0, p=lt->nodes_of_label[lt->idx_of_active_label]; p;p=p->next, n++){} + return n; +} + + +/** Retrieves next token of set label, or NULL if list exhausted. 
*/ +const char *pp_lexer_get_next_token_of_label(PPLexTable *lt) +{ + if (lt->current_node_of_active_label == NULL) return NULL; + const char *p = lt->current_node_of_active_label->str; + lt->current_node_of_active_label = lt->current_node_of_active_label->next; + return p; +} + +int pp_lexer_count_commas_of_label(PPLexTable *lt) +{ + int n; + pp_label_node *p; + if (lt->idx_of_active_label == -1) + { + prt_error("Error: pp_lexer: current label is invalid\n"); + return -1; + } + for (n=0,p=lt->nodes_of_label[lt->idx_of_active_label];p!=NULL;p=p->next) + if (!strcmp(p->str, ",")) n++; + return n; +} + +const char **pp_lexer_get_next_group_of_tokens_of_label(PPLexTable *lt, size_t *n_tokens) +{ + /* all tokens until next comma, null-terminated */ + int n; + pp_label_node *p; + + p = lt->current_node_of_active_label; + for (n=0; p!=NULL && strcmp(p->str,","); n++, p=p->next) {} + if (n > lt->extents) { + lt->extents = n; + lt->tokens = (const char **) realloc (lt->tokens, n * sizeof(const char*)); + } + + p = lt->current_node_of_active_label; + for (n=0; p!=NULL && strcmp(p->str,","); n++, p=p->next) + lt->tokens[n] = string_set_add(p->str, lt->string_set); + + /* advance "current node of label" state */ + lt->current_node_of_active_label = p; + if (p!=NULL) lt->current_node_of_active_label = p->next; + + *n_tokens = n; + return lt->tokens; +} + +int yywrap(void* foo) +{ + /* must return 1 for end of input, 0 otherwise */ + return 1; +} + +/********************** non-exported functions ************************/ + +static void setup(PPLexTable *lt) +{ + int i; + for (i=0; inodes_of_label[i] = NULL; + lt->last_node_of_label[i] = NULL; + lt->labels[i] = NULL; + } + lt->string_set = string_set_create(); + lt->tokens = NULL; + lt->extents = 0; +} + +static bool set_label(PPLexTable *lt, const char *label) +{ + int i; + char *c; + char *label_sans_colon; + + /* check for and then slice off the trailing colon */ + label_sans_colon = strdupa(label); + 
c=&(label_sans_colon[strlen(label_sans_colon)-1]); + if (*c != ':') + { + prt_error("Error: Label %s must end with :\n", label); + return false; + } + *c = 0; + + /* have we seen this label already? If so, abort */ + for (i=0;lt->labels[i]!=NULL && strcmp(lt->labels[i],label_sans_colon);i++) {} + if (lt->labels[i]!=NULL) + { + prt_error("Error: pp_lexer: label %s multiply defined!\n", label_sans_colon); + return false; + } + + /* new label. Store it */ + if (i == PP_LEXER_MAX_LABELS-1) + { + prt_error("Error: pp_lexer: too many labels. Raise PP_LEXER_MAX_LABELS\n"); + return false; + } + lt->labels[i] = string_set_add(label_sans_colon, lt->string_set); + lt->idx_of_active_label = i; + + return true; +} + +static bool check_string(const char *str) +{ + if (strlen(str)>1 && strchr(str, ',')!=NULL) + { + prt_error("Error: pp_lexer: string %s contains a comma, which is a no-no.\n", str); + return false; + } + return true; +} + + +static bool add_string_to_label(PPLexTable *lt, const char *str) +{ + struct yyguts_t * yyg = (struct yyguts_t*) lt->scanner; + + /* add the single string str to the set of strings associated with label */ + pp_label_node *new_node; + + if (lt->idx_of_active_label == -1) + { + prt_error("Error: pp_lexer: invalid syntax (line %i)\n", yylineno); + return false; + } + + /* make sure string is legal */ + if (!check_string(str)) return false; + + /* create a new node in (as yet to be determined) linked list of strings */ + new_node = (pp_label_node *) malloc (sizeof(pp_label_node)); + new_node->str = string_set_add(str, lt->string_set); + new_node->next = NULL; + + /* stick newly-created node at the *end* of the appropriate linked list */ + if (lt->last_node_of_label[lt->idx_of_active_label]==NULL) + { + /* first entry on linked list */ + lt->nodes_of_label[lt->idx_of_active_label] = new_node; + lt->last_node_of_label[lt->idx_of_active_label] = new_node; + } + else + { + /* non-first entry on linked list */ + 
lt->last_node_of_label[lt->idx_of_active_label]->next = new_node; + lt->last_node_of_label[lt->idx_of_active_label] = new_node; + } + + return true; +} + +static bool add_set_of_strings_to_label(PPLexTable *lt,const char *label_of_set) +{ + struct yyguts_t * yyg = (struct yyguts_t*) lt->scanner; + + /* add the set of strings, defined earlier by label_of_set, to the set of + strings associated with the current label */ + pp_label_node *p; + int idx_of_label_of_set; + if (lt->idx_of_active_label==-1) + { + prt_error("Error: pp_lexer: invalid syntax (line %i)\n", yylineno); + return false; + } + if ((idx_of_label_of_set = get_index_of_label(lt, label_of_set))==-1) + { + prt_error("Error: pp_lexer: label %s must be defined before it's referred to (line %i)\n", + label_of_set, yylineno); + return false; + } + for (p=lt->nodes_of_label[idx_of_label_of_set]; p!=NULL; p=p->next) + add_string_to_label(lt, p->str); + + return true; +} + +static int get_index_of_label(PPLexTable *lt, const char *label) +{ + int i; + for (i=0; lt->labels[i]!=NULL; i++) + if (!strcmp(lt->labels[i], label)) return i; + return -1; +} diff -Nru link-grammar-5.3.16/link-grammar/post-process/pp_linkset.c link-grammar-5.5.0/link-grammar/post-process/pp_linkset.c --- link-grammar-5.3.16/link-grammar/post-process/pp_linkset.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/post-process/pp_linkset.c 2018-02-15 03:03:12.000000000 +0000 @@ -0,0 +1,162 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +/*********************************************************************** +pp_linkset.c +maintains sets of pointers to link names +Similar to string-set, except that the comparison and hashing functions are +tailored for links. More importantly, all we store here is pointers. It's up +to the caller to ensure that the pointers always point to something useful. +**********************************************************************/ + +#include + +#include "post-process.h" +#include "pp_linkset.h" +#include "utilities.h" + +#define LINKSET_SPARSENESS 2 +#define LINKSET_SEED_VALUE 37 + +static void clear_hash_table(pp_linkset *ls) +{ + memset(ls->hash_table,0,ls->hash_table_size*sizeof(pp_linkset_node *)); +} + +static void initialize(pp_linkset *ls, int size) +{ + ls->hash_table_size = size*LINKSET_SPARSENESS; + ls->population = 0; + ls->hash_table = + (pp_linkset_node**) malloc (ls->hash_table_size*sizeof(pp_linkset_node *)); + clear_hash_table(ls); +} + +/* FIXME: Use connector enumeration to save computing the hash. 
*/ +static unsigned int compute_hash(pp_linkset *ls, const char *str) +{ + /* hash is computed from capitalized prefix only */ + unsigned int i, hashval; + hashval = LINKSET_SEED_VALUE; + i = 0; + if (islower((int)str[0])) i++; /* skip head-dependent indicator */ + for (; isupper((int)str[i]); i++) + hashval = str[i] + 31*hashval; + hashval %= ls->hash_table_size; + return hashval; +} + +static pp_linkset_node *add_internal(pp_linkset *ls, const char *str) +{ + pp_linkset_node *p, *n; + unsigned int hashval; + + /* look for str (exactly) in linkset */ + hashval = compute_hash(ls, str); + for (p=ls->hash_table[hashval]; p!=0; p=p->next) + if (!strcmp(p->str,str)) return NULL; /* already present */ + + /* create a new node for u; stick it at head of linked list */ + n = (pp_linkset_node *) malloc (sizeof(pp_linkset_node)); + n->next = ls->hash_table[hashval]; + n->str = str; + ls->hash_table[hashval] = n; + return n; +} + +pp_linkset *pp_linkset_open(int size) +{ + pp_linkset *ls; + if (size==0) return NULL; + ls = (pp_linkset *) malloc (sizeof(pp_linkset)); + initialize(ls, size); + return ls; +} + +void pp_linkset_close(pp_linkset *ls) +{ + if (ls == NULL) return; + pp_linkset_clear(ls); /* free memory taken by linked lists */ + free(ls->hash_table); + free(ls); +} + +void pp_linkset_clear(pp_linkset *ls) +{ + /* clear dangling linked lists, but retain hash table itself */ + unsigned int i; + pp_linkset_node *p; + if (ls == NULL) return; + for (i=0; ihash_table_size; i++) + { + p = ls->hash_table[i]; + while (p) + { + pp_linkset_node *q = p; + p = p->next; + free(q); + } + } + clear_hash_table(ls); + ls->population = 0; +} + +/** + * returns 0 if already there, 1 if new. Stores only the pointer + */ +bool pp_linkset_add(pp_linkset *ls, const char *str) +{ + assert(ls != NULL, "pp_linkset internal error: Trying to add to a null set"); + + if (add_internal(ls, str) == NULL) return false; + ls->population++; + return true; +} + +/** + * Set query. 
Returns 1 if str pp-matches something in the set, 0 otherwise + */ +bool pp_linkset_match(pp_linkset *ls, const char *str) +{ + int hashval; + pp_linkset_node *p; + if (ls == NULL) return false; + hashval = compute_hash(ls, str); + p = ls->hash_table[hashval]; + while (p != 0) + { + if (post_process_match(p->str, str)) return true; + p = p->next; + } + return false; +} + +bool pp_linkset_match_bw(pp_linkset *ls, const char *str) +{ + unsigned int hashval; + pp_linkset_node *p; + if (ls == NULL) return false; + hashval = compute_hash(ls, str); + p = ls->hash_table[hashval]; + while (p != 0) + { + if (post_process_match(str, p->str)) return true; + p = p->next; + } + return false; +} + +size_t pp_linkset_population(pp_linkset *ls) +{ + return (ls == NULL) ? 0 : ls->population; +} + diff -Nru link-grammar-5.3.16/link-grammar/post-process/pp_linkset.h link-grammar-5.5.0/link-grammar/post-process/pp_linkset.h --- link-grammar-5.3.16/link-grammar/post-process/pp_linkset.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/post-process/pp_linkset.h 2017-05-28 11:14:43.000000000 +0000 @@ -0,0 +1,22 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#include +#include "pp-structures.h" + +pp_linkset *pp_linkset_open(int size); +void pp_linkset_close (pp_linkset *ls); +void pp_linkset_clear (pp_linkset *ls); +bool pp_linkset_add (pp_linkset *ls, const char *str); +bool pp_linkset_match (pp_linkset *ls, const char *str); +bool pp_linkset_match_bw (pp_linkset *ls, const char *str); +size_t pp_linkset_population(pp_linkset *ls); diff -Nru link-grammar-5.3.16/link-grammar/post-process/pp-structures.h link-grammar-5.5.0/link-grammar/post-process/pp-structures.h --- link-grammar-5.3.16/link-grammar/post-process/pp-structures.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/post-process/pp-structures.h 2018-01-22 18:48:21.000000000 +0000 @@ -0,0 +1,165 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#ifndef _PP_STRUCTURES_H_ +#define _PP_STRUCTURES_H_ + +#include +#include "api-types.h" +#include "post-process.h" + +typedef struct Domain_s Domain; +typedef struct DTreeLeaf_s DTreeLeaf; +typedef struct List_o_links_struct List_o_links; + +struct Domain_s +{ + const char * string; + List_o_links * lol; + DTreeLeaf * child; + Domain * parent; + size_t size; + size_t start_link; /* the link that started this domain */ + char type; /* one letter name */ +}; + + +struct DTreeLeaf_s +{ + Domain * parent; + DTreeLeaf * next; + int link; +}; + +struct PP_data_s +{ + List_o_links ** word_links; + size_t wowlen; + size_t N_domains; + Domain * domain_array; /* The domains, sorted by size */ + size_t domlen; /* Allocated size of domain_array */ + size_t num_words; /* Number of words in linkage */ + List_o_links * links_to_ignore; + + bool *visited; /* For the depth-first search */ + size_t vlength; /* Length of visited array */ +}; + +/* A new Postprocessor struct is alloc'ed for each sentence. It contains + * sentence-specific post-processing information. + */ +struct Postprocessor_s +{ + pp_knowledge * knowledge; /* Internal rep'n of the actual rules */ + int n_global_rules_firing; /* this & the next are diagnostic */ + int n_local_rules_firing; + pp_linkset *set_of_links_of_sentence; /* seen in *any* linkage of sent */ + pp_linkset *set_of_links_in_an_active_rule;/*used in *some* linkage of sent*/ + int *relevant_contains_one_rules; /* -1-terminated list of indices */ + int *relevant_contains_none_rules; + bool q_pruned_rules; /* don't prune rules more than once in p.p. */ + String_set *string_set; /* Link names seen for sentence */ + + /* Per-linkage state; this data must be reset prior to processing + * each new linkage. 
*/ + const char *violation; + PP_data pp_data; +}; + +struct PP_domains_s +{ + size_t num_domains; + const char ** domain_name; +}; + +/* -------------------------------------------------------------- */ +/* Other. "private", internal-use-only post-processing structures */ + +/* Davy added these */ +struct List_o_links_struct +{ + size_t link; /* the link number */ + size_t word; /* the word at the other end of this link */ + List_o_links * next; +}; + +/* from pp_linkset.c */ +typedef struct pp_linkset_node_s +{ + const char *str; + struct pp_linkset_node_s *next; +} pp_linkset_node; + +struct pp_linkset_s +{ + unsigned int hash_table_size; + unsigned int population; + pp_linkset_node **hash_table; /* data actually lives here */ +}; + +typedef struct StartingLinkAndDomain_s StartingLinkAndDomain; +struct StartingLinkAndDomain_s +{ + const char *starting_link; + int domain; /* domain which the link belongs to (-1: terminator)*/ +}; + +typedef struct pp_rule_s +{ + /* Holds a single post-processing rule. Since rules come in many + flavors, not all fields of the following are always relevant */ + const char *selector; /* name of link to which rule applies */ + pp_linkset *link_set; /* handle to set of links relevant to rule */ + int link_set_size; /* size of this set */ + int domain; /* type of domain to which rule applies */ + const char **link_array; /* array holding the spelled-out names */ + const char *msg; /* explanation (NULL=end sentinel in array)*/ + int use_count; /* Number of times rule has been applied */ +} pp_rule; + +typedef struct PPLexTable_s PPLexTable; +struct pp_knowledge_s +{ + PPLexTable *lt; /* Internal rep'n of sets of strings from knowledge file */ + const char *path; /* Name of file we loaded from */ + + /* handles to sets of links specified in knowledge file. These constitute + auxiliary data, necessary to implement the rules, below. See comments + in post-process.c for a description of these. 
*/ + pp_linkset *domain_starter_links; + pp_linkset *urfl_domain_starter_links; + pp_linkset *urfl_only_domain_starter_links; + pp_linkset *domain_contains_links; + pp_linkset *must_form_a_cycle_links; + pp_linkset *restricted_links; + pp_linkset *ignore_these_links; + pp_linkset *left_domain_starter_links; + + /* arrays of rules specified in knowledge file */ + pp_rule *form_a_cycle_rules; + pp_rule *contains_one_rules; + pp_rule *contains_none_rules; + pp_rule *bounded_rules; + + size_t n_form_a_cycle_rules; + size_t n_contains_one_rules; + size_t n_contains_none_rules; + size_t n_bounded_rules; + + size_t nStartingLinks; + pp_linkset *set_of_links_starting_bounded_domain; + StartingLinkAndDomain *starting_link_lookup_table; + String_set *string_set; +}; + +#endif diff -Nru link-grammar-5.3.16/link-grammar/post-process.c link-grammar-5.5.0/link-grammar/post-process.c --- link-grammar-5.3.16/link-grammar/post-process.c 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/post-process.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1419 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2014 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -/* see bottom of file for comments on post processing */ - -#include -#include - -#include "post-process.h" - -#include "api-structures.h" -#include "error.h" -#include "externs.h" -#include "pp_knowledge.h" -#include "pp_linkset.h" -#include "structures.h" -#include "string-set.h" - -#define PP_MAX_DOMAINS 128 - -/** - * post_process_match -- compare two link-types. - * - * string comparison in postprocessing. The first parameter is a - * post-processing symbol. The second one is a connector name from a - * link. The upper case parts must match. We imagine that the first - * arg is padded with an infinite sequence of "#" and that the 2nd one - * is padded with "*". "#" matches anything, but "*" is just like an - * ordinary char for matching purposes. - */ - -bool post_process_match(const char *s, const char *t) -{ - if (NULL == t) return false; - if (islower((int)*t)) t++; /* Skip head-dependent indicator */ - while (isupper((int)*s) || isupper((int)*t)) - { - if (*s != *t) return false; - s++; - t++; - } - while (*s != '\0') - { - if (*s != '#') - { - char c; - if (*t == '\0') c = '*'; else c = *t; - if (*s != c) return false; - } - s++; - if (*t != '\0') t++; - } - return true; -} - -/***************** utility routines (not exported) ***********************/ - -/** - * Returns false if the string s does not match anything in - * the array. The array elements are post-processing symbols. - */ -static int string_in_list(const char * s, const char * a[]) -{ - int i; - for (i=0; a[i] != NULL; i++) - if (post_process_match(a[i], s)) return true; - return false; -} - -/** - * Return the name of the domain associated with the provided starting - * link. Return -1 if link isn't associated with a domain. 
- */ -static size_t find_domain_name(Postprocessor *pp, const char *link) -{ - size_t i, domain; - StartingLinkAndDomain *sllt = pp->knowledge->starting_link_lookup_table; - for (i=0;;i++) - { - domain = sllt[i].domain; - if (domain == SIZE_MAX) return SIZE_MAX; /* hit the end-of-list sentinel */ - if (post_process_match(sllt[i].starting_link, link)) return domain; - } -} - -/** Returns true if domain d1 is contained in domain d2 */ -static int contained_in(const Domain * d1, const Domain * d2, - const Linkage sublinkage) -{ - bool *mark = alloca(sublinkage->num_links*sizeof(bool)); - List_o_links * lol; - memset(mark, 0, sublinkage->num_links*(sizeof(bool))); - for (lol=d2->lol; lol != NULL; lol = lol->next) - mark[lol->link] = true; - for (lol=d1->lol; lol != NULL; lol = lol->next) - if (!mark[lol->link]) return false; - return true; -} - -/** Returns the predicate "the given link is in the given domain" */ -static bool link_in_domain(size_t link, const Domain * d) -{ - List_o_links * lol; - for (lol = d->lol; lol != NULL; lol = lol->next) - if (lol->link == link) return true; - return false; -} - -/* #define CHECK_DOMAIN_NESTING */ - -#if defined(CHECK_DOMAIN_NESTING) -/* Although this is no longer used, I'm leaving the code here for future reference --DS 3/98 */ - -/* Returns true if the domains actually form a properly nested structure */ -static bool check_domain_nesting(Postprocessor *pp, int num_links) -{ - size_t id1, id2; - Domain * d1, * d2; - int counts[4]; - char mark[MAX_NUM_LINKS]; - List_o_links * lol; - int i; - PP_data *pp_data = &pp->pp_data; - - for (id1 = 0; id1 < pp_data->N_domains; id1++) - { - d1 = &pp_data->domain_array[id1]; - for (id2 = id1+1; id2 < pp_data->N_domains; id2++) - { - d2 = &pp_data->domain_array[id2]; - - memset(mark, 0, num_links); - for (lol=d2->lol; lol != NULL; lol = lol->next) - mark[lol->link] = 1; - - for (lol=d1->lol; lol != NULL; lol = lol->next) - mark[lol->link] += 2; - - counts[0] = counts[1] = counts[2] = 
counts[3] = 0; - for (i=0; i 0) && (counts[2] > 0) && (counts[3] > 0)) - return false; - } - } - return true; -} -#endif - -/** - * Free the list of links pointed to by lol - * (does not free any strings) - */ -static void free_List_o_links(List_o_links *lol) -{ - List_o_links * xlol; - while (lol != NULL) - { - xlol = lol->next; - free(lol); - lol = xlol; - } -} - -static void free_D_tree_leaves(DTreeLeaf *dtl) -{ - DTreeLeaf * xdtl; - while (dtl != NULL) - { - xdtl = dtl->next; - free(dtl); - dtl = xdtl; - } -} - -void pp_free_domain_array(PP_data *ppd) -{ - size_t d; - for (d = 0; d < ppd->domlen; d++) - { - free_List_o_links(ppd->domain_array[d].lol); - ppd->domain_array[d].lol = NULL; - free_D_tree_leaves(ppd->domain_array[d].child); - ppd->domain_array[d].child = NULL; - } -} - -/** - * Gets called after every invocation of post_process() - */ -void post_process_free_data(PP_data * ppd) -{ - size_t w; - for (w = 0; w < ppd->wowlen; w++) - { - free_List_o_links(ppd->word_links[w]); - ppd->word_links[w] = NULL; - } - - pp_free_domain_array(ppd); - free_List_o_links(ppd->links_to_ignore); - ppd->links_to_ignore = NULL; - ppd->num_words = 0; - ppd->N_domains = 0; -} - -#ifdef THIS_FUNCTION_IS_NOT_CURRENTLY_USED -static void connectivity_dfs(Postprocessor *pp, Linkage sublinkage, - int w, pp_linkset *ls) -{ - List_o_links *lol; - assert(w < pp_data->num_words, "Bad word index"); - pp_data->visited[w] = true; - for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) - { - if (!pp_data->visited[lol->word] && - !pp_linkset_match(ls, sublinkage->link[lol->link]->name)) - connectivity_dfs(pp, sublinkage, lol->word, ls); - } -} -#endif /* THIS_FUNCTION_IS_NOT_CURRENTLY_USED */ - -static void chk_d_type(PP_node* ppn, size_t idx) -{ - if (ppn->dtsz <= idx) - { - ppn->dtsz += idx + 5; - ppn->d_type_array = realloc(ppn->d_type_array, - ppn->dtsz * sizeof(D_type_list*)); - } -} - -/** - * This is used in one place only: to set up the domain type array, - * which is 
needed in only one place ever: when printing the domain - * names. If the domain names are not being printed, then this is - * a complete waste of CPU time. - * - * XXX refactor so that this is not done, unless the names are printed. - */ -void build_type_array(Postprocessor *pp) -{ - D_type_list * dtl; - size_t d; - List_o_links * lol; - PP_data *pp_data = &pp->pp_data; - - for (d = 0; d < pp_data->N_domains; d++) - { - for (lol = pp_data->domain_array[d].lol; lol != NULL; lol = lol->next) - { - chk_d_type(pp->pp_node, lol->link); - dtl = (D_type_list *) malloc(sizeof(D_type_list)); - dtl->next = pp->pp_node->d_type_array[lol->link]; - dtl->type = pp_data->domain_array[d].type; - pp->pp_node->d_type_array[lol->link] = dtl; - } - } -} - -static void free_d_type(D_type_list * dtl) -{ - D_type_list * dtlx; - for (; dtl != NULL; dtl = dtlx) - { - dtlx = dtl->next; - free((void*) dtl); - } -} - -/** free the pp node from last time */ -static void free_pp_node(Postprocessor *pp) -{ - size_t i; - PP_node *ppn = pp->pp_node; - pp->pp_node = NULL; - if (ppn == NULL) return; - - for (i=0; idtsz; i++) - { - free_d_type(ppn->d_type_array[i]); - } - free(ppn->d_type_array); - free((void*) ppn); -} - -/** set up a fresh pp_node for later use */ -static void alloc_pp_node(Postprocessor *pp) -{ - size_t dz; - PP_node *ppn = (PP_node *) malloc(sizeof(PP_node)); - - /* highly unlikely that the number of links will ever exceed this */ - ppn->dtsz = 2 * pp->pp_data.num_words; - dz = ppn->dtsz * sizeof(D_type_list*); - ppn->d_type_array = (D_type_list **) malloc (dz); - memset(ppn->d_type_array, 0, dz); - - pp->pp_node = ppn; -} - -static void clear_pp_node(Postprocessor *pp) -{ - size_t i; - PP_node *ppn = pp->pp_node; - if (NULL == ppn) { alloc_pp_node(pp); ppn = pp->pp_node; } - - ppn->violation = NULL; - for (i=0; idtsz; i++) - { - free_d_type(ppn->d_type_array[i]); - ppn->d_type_array[i] = NULL; - } -} - -/* ================ compute the domain names ============= */ -/** - * 
Store the domain names in the linkage. These are not needed - * unless the user asks the domain names to be printed! - */ -void linkage_set_domain_names(Postprocessor *postprocessor, Linkage linkage) -{ - PP_node * pp; - size_t j, k; - D_type_list * d; - - if (NULL == linkage) return; - if (NULL == postprocessor) return; - - linkage->pp_info = (PP_info *) exalloc(sizeof(PP_info) * linkage->num_links); - memset(linkage->pp_info, 0, sizeof(PP_info) * linkage->num_links); - - /* Copy the post-processing results over into the linkage */ - pp = postprocessor->pp_node; - if (pp->violation != NULL) - return; - - for (j = 0; j < linkage->num_links; ++j) - { - k = 0; - for (d = pp->d_type_array[j]; d != NULL; d = d->next) k++; - linkage->pp_info[j].num_domains = k; - if (k > 0) - { - linkage->pp_info[j].domain_name = (const char **) exalloc(sizeof(const char *)*k); - } - k = 0; - for (d = pp->d_type_array[j]; d != NULL; d = d->next) - { - char buff[] = {d->type, '\0'}; - - linkage->pp_info[j].domain_name[k] = - string_set_add (buff, postprocessor->string_set); - - k++; - } - } -} - -static inline bool verify_link_index(const Linkage linkage, LinkIdx index) -{ - if (!linkage) return false; - if (index >= linkage->num_links) return false; - return true; -} - -int linkage_get_link_num_domains(const Linkage linkage, LinkIdx index) -{ - if (!verify_link_index(linkage, index)) return -1; - return linkage->pp_info[index].num_domains; -} - -const char ** linkage_get_link_domain_names(const Linkage linkage, LinkIdx index) -{ - if (!verify_link_index(linkage, index)) return NULL; - return linkage->pp_info[index].domain_name; -} - -const char * linkage_get_violation_name(const Linkage linkage) -{ - return linkage->lifo.pp_violation_msg; -} - -void exfree_domain_names(PP_info *ppi) -{ - if (ppi->num_domains > 0) - exfree((void *) ppi->domain_name, sizeof(const char *) * ppi->num_domains); - ppi->domain_name = NULL; - ppi->num_domains = 0; -} - -void linkage_free_pp_info(Linkage lkg) -{ 
- size_t j; - if (!lkg || !lkg->pp_info) return; - - for (j = 0; j < lkg->num_links; ++j) - exfree_domain_names(&lkg->pp_info[j]); - exfree(lkg->pp_info, sizeof(PP_info) * lkg->num_links); - lkg->pp_info = NULL; -} - -/************************ rule application *******************************/ - -static void clear_visited(PP_data *pp_data) -{ - memset(pp_data->visited, 0, pp_data->num_words * sizeof(bool)); -} - -static bool apply_rules(PP_data *pp_data, - bool (applyfn) (PP_data *, Linkage, pp_rule *), - Linkage sublinkage, - pp_rule *rule_array, - const char **msg) -{ - int i; - for (i = 0; (*msg = rule_array[i].msg) != NULL; i++) - { - if (!applyfn(pp_data, sublinkage, &(rule_array[i]))) - { - rule_array[i].use_count ++; - return false; - } - } - return true; -} - -static bool -apply_relevant_rules(Postprocessor *pp, - bool (applyfn)(PP_data *, Linkage, pp_rule *), - Linkage sublinkage, - pp_rule *rule_array, - int *relevant_rules, - const char **msg) -{ - int i, idx; - PP_data *pp_data = &pp->pp_data; - - /* If we didn't accumulate link names for this sentence, we need - * to apply all rules. */ - if (pp_linkset_population(pp->set_of_links_of_sentence) == 0) { - return apply_rules(pp_data, applyfn, sublinkage, rule_array, msg); - } - - /* We did, and we don't. */ - for (i = 0; (idx = relevant_rules[i]) != -1; i++) - { - *msg = rule_array[idx].msg; - if (!applyfn(pp_data, sublinkage, &(rule_array[idx]))) return false; - } - return true; -} - -/** - * returns true if and only if all groups containing the specified link - * contain at least one from the required list. 
(as determined by exact - * string matching) - */ -static bool -apply_contains_one(PP_data *pp_data, Linkage sublinkage, pp_rule *rule) -{ - DTreeLeaf * dtl; - size_t d, count; - - for (d=0; dN_domains; d++) - { - for (dtl = pp_data->domain_array[d].child; - dtl != NULL && - !post_process_match(rule->selector, - sublinkage->link_array[dtl->link].link_name); - dtl = dtl->next) {} - if (dtl != NULL) - { - /* selector link of rule appears in this domain */ - count=0; - for (dtl = pp_data->domain_array[d].child; dtl != NULL; dtl = dtl->next) - { - if (string_in_list(sublinkage->link_array[dtl->link].link_name, - rule->link_array)) - { - count=1; - break; - } - } - if (count == 0) return false; - } - } - return true; -} - - -/** - * Returns true if and only if: - * all groups containing the selector link do not contain anything - * from the link_array contained in the rule. Uses exact string matching. - */ -static bool -apply_contains_none(PP_data *pp_data, Linkage sublinkage, pp_rule *rule) -{ - size_t d; - - for (d=0; dN_domains; d++) - { - DTreeLeaf * dtl; - for (dtl = pp_data->domain_array[d].child; - dtl != NULL && - !post_process_match(rule->selector, - sublinkage->link_array[dtl->link].link_name); - dtl = dtl->next) {} - if (dtl != NULL) - { - /* selector link of rule appears in this domain */ - for (dtl = pp_data->domain_array[d].child; dtl != NULL; dtl = dtl->next) - { - if (string_in_list(sublinkage->link_array[dtl->link].link_name, - rule->link_array)) - return false; - } - } - } - return true; -} - -/** - * Returns true if and only if - * (1) the sentence doesn't contain the selector link for the rule, or - * (2) it does, and it also contains one or more from the rule's link set - */ -static bool -apply_contains_one_globally(PP_data *pp_data, Linkage sublinkage, pp_rule *rule) -{ - size_t i, j, count; - for (i = 0; i < sublinkage->num_links; i++) - { - assert(sublinkage->link_array[i].lw != SIZE_MAX); - if (post_process_match(rule->selector, 
sublinkage->link_array[i].link_name)) break; - } - if (i == sublinkage->num_links) return true; - - /* selector link of rule appears in sentence */ - count = 0; - for (j = 0; j < sublinkage->num_links && count == 0; j++) - { - assert(sublinkage->link_array[j].lw != SIZE_MAX); - if (string_in_list(sublinkage->link_array[j].link_name, rule->link_array)) - { - count = 1; - break; - } - } - if (count == 0) return false; else return true; -} - -/** - * For each link in the linkage that is in the must_form_a_cycle list, - * we want to make sure that that link is in a cycle. We do this - * simply by deleting the link, then seeing if the end points of that - * link are still connected. - */ -static void reachable_without_dfs(PP_data *pp_data, - Linkage sublinkage, size_t a, size_t b, size_t w) -{ - /* This is a depth first search of words reachable from w, excluding - * any direct edge between word a and word b. */ - List_o_links *lol; - assert(w < pp_data->num_words, "Bad word index"); - pp_data->visited[w] = true; - for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) - { - assert(lol->word < pp_data->num_words, "Bad word index"); - if (!pp_data->visited[lol->word] && - !(w == a && lol->word == b) && - !(w == b && lol->word == a)) - { - reachable_without_dfs(pp_data, sublinkage, a, b, lol->word); - } - } -} - -/** - * Returns true if the linkage is connected when ignoring the links - * whose names are in the given list of link names. - * Actually, what it does is this: it returns false if the connectivity - * of the subgraph reachable from word 0 changes as a result of deleting - * these links. 
- */ -static bool -apply_must_form_a_cycle(PP_data *pp_data, Linkage sublinkage, pp_rule *rule) -{ - List_o_links *lol; - size_t w; - - for (w = 0; w < pp_data->num_words; w++) - { - for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) - { - if (w > lol->word) continue; /* only consider each edge once */ - if (!pp_linkset_match(rule->link_set, sublinkage->link_array[lol->link].link_name)) continue; - - clear_visited(pp_data); - reachable_without_dfs(pp_data, sublinkage, w, lol->word, w); - if (!pp_data->visited[lol->word]) return false; - } - } - - for (lol = pp_data->links_to_ignore; lol != NULL; lol = lol->next) - { - w = sublinkage->link_array[lol->link].lw; - /* (w, lol->word) are the left and right ends of the edge we're considering */ - if (!pp_linkset_match(rule->link_set, sublinkage->link_array[lol->link].link_name)) continue; - - clear_visited(pp_data); - reachable_without_dfs(pp_data, sublinkage, w, lol->word, w); - - assert(lol->word < pp_data->num_words, "Bad word index"); - if (!pp_data->visited[lol->word]) return false; - } - - return true; -} - -/** - * Checks to see that all domains with this name have the property that - * all of the words that touch a link in the domain are not to the left - * of the root word of the domain. - */ -static bool -apply_bounded(PP_data *pp_data, Linkage sublinkage, pp_rule *rule) -{ - size_t d, lw; - List_o_links * lol; - char d_type = rule->domain; - - for (d = 0; d < pp_data->N_domains; d++) - { - if (pp_data->domain_array[d].type != d_type) continue; - lw = sublinkage->link_array[pp_data->domain_array[d].start_link].lw; - for (lol = pp_data->domain_array[d].lol; lol != NULL; lol = lol->next) - { - if (sublinkage->link_array[lol->link].lw < lw) return false; - } - } - return true; -} - -/** - * fill in the pp->pp_data.word_links array with a list of words - * neighboring each word (actually a list of links). This is an - * undirected graph. 
- */ -static void build_graph(Postprocessor *pp, Linkage sublinkage) -{ - size_t link; - List_o_links * lol; - PP_data *pp_data = &pp->pp_data; - - /* Get more size, if needed */ - if (pp_data->wowlen <= pp_data->num_words) - { - size_t newsz; - pp_data->wowlen += pp_data->num_words; - newsz = pp_data->wowlen * sizeof(List_o_links *); - pp_data->word_links = (List_o_links **) realloc( - pp_data->word_links, newsz); - } - memset(pp_data->word_links, 0, pp_data->wowlen * sizeof(List_o_links *)); - - for (link = 0; link < sublinkage->num_links; link++) - { - assert (sublinkage->link_array[link].lw != SIZE_MAX); - if (NULL == sublinkage->link_array[link].link_name) continue; - if (pp_linkset_match(pp->knowledge->ignore_these_links, - sublinkage->link_array[link].link_name)) - { - lol = (List_o_links *) malloc(sizeof(List_o_links)); - lol->next = pp_data->links_to_ignore; - pp_data->links_to_ignore = lol; - lol->link = link; - lol->word = sublinkage->link_array[link].rw; - continue; - } - - lol = (List_o_links *) malloc(sizeof(List_o_links)); - lol->next = pp_data->word_links[sublinkage->link_array[link].lw]; - pp_data->word_links[sublinkage->link_array[link].lw] = lol; - lol->link = link; - lol->word = sublinkage->link_array[link].rw; - - lol = (List_o_links *) malloc(sizeof(List_o_links)); - lol->next = pp_data->word_links[sublinkage->link_array[link].rw]; - pp_data->word_links[sublinkage->link_array[link].rw] = lol; - lol->link = link; - lol->word = sublinkage->link_array[link].lw; - } -} - -static void setup_domain_array(Postprocessor *pp, - const char *string, int start_link) -{ - PP_data *pp_data = &pp->pp_data; - size_t n = pp_data->N_domains; - - /* Grab more memory if needed */ - if (pp_data->domlen <= n) - { - size_t oldsz, incsz; -#define DOMINC 16 - oldsz = pp_data->domlen * sizeof(Domain); - incsz = DOMINC * sizeof(Domain); - pp_data->domain_array = (Domain *) realloc(pp_data->domain_array, - oldsz + incsz); - memset(&pp_data->domain_array[pp_data->domlen], 
0, incsz); - pp_data->domlen += DOMINC; - } - - pp_data->domain_array[n].string = string; - pp_data->domain_array[n].lol = NULL; - pp_data->domain_array[n].size = 0; - pp_data->domain_array[n].start_link = start_link; - - pp_data->N_domains++; - assert(pp_data->N_domainsN_domains - 1; /* the very last one */ - List_o_links *lol = (List_o_links *) malloc(sizeof(List_o_links)); - - lol->next = pp_data->domain_array[n].lol; - pp_data->domain_array[n].lol = lol; - pp_data->domain_array[n].size++; - lol->link = link; -} - -static void depth_first_search(Postprocessor *pp, Linkage sublinkage, - size_t w, size_t root, size_t start_link) -{ - List_o_links *lol; - PP_data *pp_data = &pp->pp_data; - - assert(w < pp_data->num_words, "Bad word index"); - pp_data->visited[w] = true; - for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) - { - if (lol->word < w && lol->link != start_link) - { - add_link_to_domain(pp_data, lol->link); - } - } - for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) - { - if (!pp_data->visited[lol->word] && (lol->word != root) && - !(lol->word < root && lol->word < w && - pp_linkset_match(pp->knowledge->restricted_links, - sublinkage->link_array[lol->link].link_name))) - { - depth_first_search(pp, sublinkage, lol->word, root, start_link); - } - } -} - -static void bad_depth_first_search(Postprocessor *pp, Linkage sublinkage, - size_t w, size_t root, size_t start_link) -{ - List_o_links * lol; - PP_data *pp_data = &pp->pp_data; - - assert(w < pp_data->num_words, "Bad word index"); - pp_data->visited[w] = true; - for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) - { - if ((lol->word < w) && (lol->link != start_link) && (w != root)) - { - add_link_to_domain(pp_data, lol->link); - } - } - for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) - { - assert(lol->word < pp_data->num_words, "Bad word index"); - if ((!pp_data->visited[lol->word]) && !(w == root && lol->word < w) && - !(lol->word < root && 
lol->word < w && - pp_linkset_match(pp->knowledge->restricted_links, - sublinkage->link_array[lol->link].link_name))) - { - bad_depth_first_search(pp, sublinkage, lol->word, root, start_link); - } - } -} - -static void d_depth_first_search(Postprocessor *pp, Linkage sublinkage, - size_t w, size_t root, size_t right, size_t start_link) -{ - List_o_links * lol; - PP_data *pp_data = &pp->pp_data; - - assert(w < pp_data->num_words, "Bad word index"); - pp_data->visited[w] = true; - for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) - { - if ((lol->word < w) && (lol->link != start_link) && (w != root)) - { - add_link_to_domain(pp_data, lol->link); - } - } - for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) - { - assert(lol->word < pp_data->num_words, "Bad word index"); - if (!pp_data->visited[lol->word] && !(w == root && lol->word >= right) && - !(w == root && lol->word < root) && - !(lol->word < root && lol->word < w && - pp_linkset_match(pp->knowledge->restricted_links, - sublinkage->link_array[lol->link].link_name))) - { - d_depth_first_search(pp,sublinkage,lol->word,root,right,start_link); - } - } -} - -static void left_depth_first_search(Postprocessor *pp, Linkage sublinkage, - size_t w, size_t right, size_t start_link) -{ - List_o_links *lol; - PP_data *pp_data = &pp->pp_data; - - assert(w < pp_data->num_words, "Bad word index"); - pp_data->visited[w] = true; - for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) - { - if (lol->word < w && lol->link != start_link) - { - add_link_to_domain(pp_data, lol->link); - } - } - for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next) - { - assert(lol->word < pp_data->num_words, "Bad word index"); - if (!pp_data->visited[lol->word] && (lol->word != right)) - { - depth_first_search(pp, sublinkage, lol->word, right, start_link); - } - } -} - -static int domain_compare(const Domain * d1, const Domain * d2) -{ - return (d1->size - d2->size); /* for sorting the domains by size */ 
-} - -static void build_domains(Postprocessor *pp, Linkage sublinkage) -{ - size_t link, i, d; - const char *s; - PP_data *pp_data = &pp->pp_data; - - pp_data->N_domains = 0; - - for (link = 0; linknum_links; link++) - { - assert (sublinkage->link_array[link].lw != SIZE_MAX); - if (NULL == sublinkage->link_array[link].link_name) continue; - s = sublinkage->link_array[link].link_name; - - if (pp_linkset_match(pp->knowledge->ignore_these_links, s)) continue; - if (pp_linkset_match(pp->knowledge->domain_starter_links, s)) - { - setup_domain_array(pp, s, link); - if (pp_linkset_match(pp->knowledge->domain_contains_links, s)) - add_link_to_domain(pp_data, link); - - clear_visited(pp_data); - depth_first_search(pp, sublinkage, sublinkage->link_array[link].rw, - sublinkage->link_array[link].lw, link); - } - else - if (pp_linkset_match(pp->knowledge->urfl_domain_starter_links, s)) - { - setup_domain_array(pp, s, link); - /* always add the starter link to its urfl domain */ - add_link_to_domain(pp_data, link); - - clear_visited(pp_data); - bad_depth_first_search(pp, sublinkage,sublinkage->link_array[link].rw, - sublinkage->link_array[link].lw, link); - } - else - if (pp_linkset_match(pp->knowledge->urfl_only_domain_starter_links, s)) - { - setup_domain_array(pp, s, link); - /* do not add the starter link to its urfl_only domain */ - clear_visited(pp_data); - d_depth_first_search(pp, sublinkage, sublinkage->link_array[link].lw, - sublinkage->link_array[link].lw, - sublinkage->link_array[link].rw, link); - } - else - if (pp_linkset_match(pp->knowledge->left_domain_starter_links, s)) - { - setup_domain_array(pp, s, link); - /* do not add the starter link to a left domain */ - clear_visited(pp_data); - left_depth_first_search(pp, sublinkage, sublinkage->link_array[link].lw, - sublinkage->link_array[link].rw, link); - } - } - - /* sort the domains by size */ - qsort((void *) pp_data->domain_array, - pp_data->N_domains, - sizeof(Domain), - (int (*)(const void *, const void *)) 
domain_compare); - - /* sanity check: all links in all domains have a legal domain name */ - for (d = 0; d < pp_data->N_domains; d++) - { - i = find_domain_name(pp, pp_data->domain_array[d].string); - if (i == SIZE_MAX) - prt_error("Error: post_process(): Need an entry for %s in LINK_TYPE_TABLE\n", - pp_data->domain_array[d].string); - pp_data->domain_array[d].type = i; - } -} - -static void build_domain_forest(PP_data *pp_data, Linkage sublinkage) -{ - size_t d, d1, link; - DTreeLeaf * dtl; - - if (pp_data->N_domains > 0) - { - pp_data->domain_array[pp_data->N_domains-1].parent = NULL; - for (d=0; d < pp_data->N_domains-1; d++) - { - for (d1 = d+1; d1 < pp_data->N_domains; d1++) - { - if (contained_in(&pp_data->domain_array[d], &pp_data->domain_array[d1], sublinkage)) - { - pp_data->domain_array[d].parent = &pp_data->domain_array[d1]; - break; - } - } - if (d1 == pp_data->N_domains) - { - /* we know this domain is a root of a new tree */ - pp_data->domain_array[d].parent = NULL; - } - } - } - /* The parent links of domain nodes have been established. - * Now do the leaves. 
*/ - for (d = 0; d < pp_data->N_domains; d++) - { - pp_data->domain_array[d].child = NULL; - } - for (link=0; link < sublinkage->num_links; link++) - { - assert (sublinkage->link_array[link].lw != SIZE_MAX); - for (d=0; dN_domains; d++) - { - if (link_in_domain(link, &pp_data->domain_array[d])) - { - dtl = (DTreeLeaf *) malloc(sizeof(DTreeLeaf)); - dtl->link = link; - dtl->parent = &pp_data->domain_array[d]; - dtl->next = pp_data->domain_array[d].child; - pp_data->domain_array[d].child = dtl; - break; - } - } - } -} - -static int -internal_process(Postprocessor *pp, Linkage sublinkage, const char **msg) -{ - size_t i; - PP_data *pp_data = &pp->pp_data; - - /* quick test: try applying just the relevant global rules */ - if (!apply_relevant_rules(pp, apply_contains_one_globally, - sublinkage, - pp->knowledge->contains_one_rules, - pp->relevant_contains_one_rules, msg)) - { - for (i = 0; i < pp_data->wowlen; i++) - pp_data->word_links[i] = NULL; - pp_data->N_domains = 0; - return -1; - } - - /* build graph; confirm that it's legally connected */ - build_graph(pp, sublinkage); - build_domains(pp, sublinkage); - build_domain_forest(&pp->pp_data, sublinkage); - -#if defined(CHECK_DOMAIN_NESTING) - /* These messages were deemed to not be useful, so - * this code is commented out. See comment above. 
*/ - if (!check_domain_nesting(pp, sublinkage->num_links)) - prt_error("Warning: The domains are not nested.\n"); -#endif - - /* The order below should be optimal for most cases */ - if (!apply_relevant_rules(pp, apply_contains_one, sublinkage, - pp->knowledge->contains_one_rules, - pp->relevant_contains_one_rules, msg)) return 1; - if (!apply_relevant_rules(pp, apply_contains_none, sublinkage, - pp->knowledge->contains_none_rules, - pp->relevant_contains_none_rules, msg)) return 1; - if (!apply_rules(pp_data, apply_must_form_a_cycle, sublinkage, - pp->knowledge->form_a_cycle_rules,msg)) return 1; - if (!apply_rules(pp_data, apply_bounded, sublinkage, - pp->knowledge->bounded_rules, msg)) return 1; - return 0; /* This linkage satisfied all the rules */ -} - - -/** - * Call this (a) after having called post_process_scan_linkage() on all - * generated linkages, but (b) before calling post_process() on any - * particular linkage. Here we mark all rules which we know (from having - * accumulated a set of link names appearing in *any* linkage) that won't - * ever be needed. - */ -static void prune_irrelevant_rules(Postprocessor *pp) -{ - pp_rule *rule; - int coIDX, cnIDX, rcoIDX = 0, rcnIDX = 0; - - /* If we didn't scan any linkages, there's no pruning to be done. 
*/ - if (pp_linkset_population(pp->set_of_links_of_sentence) == 0) return; - - for (coIDX = 0; ; coIDX++) - { - rule = &(pp->knowledge->contains_one_rules[coIDX]); - if (rule->msg == NULL) break; - if (pp_linkset_match_bw(pp->set_of_links_of_sentence, rule->selector)) - { - /* Mark rule as being relevant to this sentence */ - pp->relevant_contains_one_rules[rcoIDX++] = coIDX; - pp_linkset_add(pp->set_of_links_in_an_active_rule, rule->selector); - } - } - pp->relevant_contains_one_rules[rcoIDX] = -1; /* end sentinel */ - - for (cnIDX = 0; ; cnIDX++) - { - rule = &(pp->knowledge->contains_none_rules[cnIDX]); - if (rule->msg == NULL) break; - if (pp_linkset_match_bw(pp->set_of_links_of_sentence, rule->selector)) - { - pp->relevant_contains_none_rules[rcnIDX++] = cnIDX; - pp_linkset_add(pp->set_of_links_in_an_active_rule, rule->selector); - } - } - pp->relevant_contains_none_rules[rcnIDX] = -1; - - if (verbosity_level(5)) - { - err_msg(lg_Debug, "PP: Saw %zd unique link names in all linkages.\n\\", - pp_linkset_population(pp->set_of_links_of_sentence)); - err_msg(lg_Debug, "PP: Using %i 'contains one' rules " - "and %i 'contains none' rules\n", - rcoIDX, rcnIDX); - } -} - - -/***************** definitions of exported functions ***********************/ - -#define PP_INITLEN 60 /* just starting size, it is expanded if needed */ - -void pp_new_domain_array(PP_data *pp_data) -{ - pp_data->domlen = PP_INITLEN; - pp_data->domain_array = (Domain*) malloc(pp_data->domlen * sizeof(Domain)); - memset(pp_data->domain_array, 0, pp_data->domlen * sizeof(Domain)); -} - -/** - * read rules from path and initialize the appropriate fields in - * a postprocessor structure, a pointer to which is returned. 
- */ -Postprocessor * post_process_new(pp_knowledge * kno) -{ - Postprocessor *pp; - PP_data *pp_data; - - pp = (Postprocessor *) malloc (sizeof(Postprocessor)); - pp->knowledge = kno; - pp->string_set = string_set_create(); - pp->set_of_links_of_sentence = pp_linkset_open(1024); - pp->set_of_links_in_an_active_rule = pp_linkset_open(1024); - pp->relevant_contains_one_rules = - (int *) malloc ((pp->knowledge->n_contains_one_rules + 1) - *(sizeof pp->relevant_contains_one_rules[0])); - pp->relevant_contains_none_rules = - (int *) malloc ((pp->knowledge->n_contains_none_rules + 1) - *(sizeof pp->relevant_contains_none_rules[0])); - pp->relevant_contains_one_rules[0] = -1; - pp->relevant_contains_none_rules[0] = -1; - pp->pp_node = NULL; - pp->n_local_rules_firing = 0; - pp->n_global_rules_firing = 0; - - pp->q_pruned_rules = false; - - pp_data = &pp->pp_data; - pp_data->vlength = PP_INITLEN; - pp_data->visited = (bool*) malloc(pp_data->vlength * sizeof(bool)); - memset(pp_data->visited, 0, pp_data->vlength * sizeof(bool)); - - pp_data->links_to_ignore = NULL; - pp_new_domain_array(pp_data); - - pp_data->wowlen = PP_INITLEN; - pp_data->word_links = (List_o_links **) malloc(pp_data->wowlen * sizeof(List_o_links*)); - memset(pp_data->word_links, 0, pp_data->wowlen * sizeof(List_o_links *)); - - return pp; -} - -void post_process_free(Postprocessor *pp) -{ - PP_data *pp_data; - - /* frees up memory associated with pp, previously allocated by open */ - if (pp == NULL) return; - string_set_delete(pp->string_set); - pp_linkset_close(pp->set_of_links_of_sentence); - pp_linkset_close(pp->set_of_links_in_an_active_rule); - free(pp->relevant_contains_one_rules); - free(pp->relevant_contains_none_rules); - pp->knowledge = NULL; - free_pp_node(pp); - - - pp_data = &pp->pp_data; - post_process_free_data(pp_data); - free(pp_data->visited); - free(pp_data->domain_array); - free(pp_data->word_links); - - free(pp); -} - -/** - * During a first pass (prior to actual post-processing of 
the linkages - * of a sentence), call this once for every generated linkage. Here we - * simply maintain a set of "seen" link names for rule pruning, later on. - */ -void post_process_scan_linkage(Postprocessor *pp, Linkage linkage) -{ - size_t i; - if (pp == NULL) return; - for (i = 0; i < linkage->num_links; i++) - { - assert(linkage->link_array[i].lw != SIZE_MAX); - - pp_linkset_add(pp->set_of_links_of_sentence, - linkage->link_array[i].link_name); - } -} - -static size_t report_rule_use(pp_rule *set) -{ - size_t cnt = 0; - size_t i; - for (i=0; set[i].msg != NULL; i++) - { - err_msg(lg_Debug, "Used: %d rule: %s\n", set[i].use_count, set[i].msg); - cnt++; - } - return cnt; -} - -static size_t report_unused_rule(pp_rule *set) -{ - size_t i; - size_t cnt = 0; - for (i=0; set[i].msg != NULL; i++) - { - if (0 == set[i].use_count) - { - err_msg(lg_Debug, "Unused rule: %s\n", set[i].msg); - cnt++; - } - } - return cnt; -} - -static void report_pp_stats(Postprocessor *pp) -{ - size_t rule_cnt = 0; - size_t unused_cnt = 0; - pp_knowledge * kno; - if (!verbosity_level(9)) return; - - err_msg(lg_Debug, "PP stats: local_rules_firing=%d\n", pp->n_local_rules_firing); - kno = pp->knowledge; - - err_msg(lg_Debug, "\nPP stats: form_a_cycle_rules\n"); - rule_cnt += report_rule_use(kno->form_a_cycle_rules); - - err_msg(lg_Debug, "\nPP stats: contains_one_rules\n"); - rule_cnt += report_rule_use(kno->contains_one_rules); - - err_msg(lg_Debug, "\nPP stats: contains_none_rules\n"); - rule_cnt += report_rule_use(kno->contains_none_rules); - - err_msg(lg_Debug, "\nPP stats: bounded_rules\n"); - rule_cnt += report_rule_use(kno->bounded_rules); - - err_msg(lg_Debug, "\nPP stats: Rules that were not used:\n"); - unused_cnt += report_unused_rule(kno->form_a_cycle_rules); - unused_cnt += report_unused_rule(kno->contains_one_rules); - unused_cnt += report_unused_rule(kno->contains_none_rules); - unused_cnt += report_unused_rule(kno->bounded_rules); - - err_msg(lg_Debug, "\nPP stats: %zd of 
%zd rules unused\n", unused_cnt, rule_cnt); -} - -/** - * Takes a linkage and returns: - * . for each link, the domain structure of that link - * . a list of the violation strings - * NB: linkage->link[i]->l=-1 means that this connector is to be ignored. - */ -PP_node *do_post_process(Postprocessor *pp, Linkage sublinkage, bool is_long) -{ - const char *msg; - PP_data *pp_data; - - if (pp == NULL) return NULL; - pp_data = &pp->pp_data; - - // XXX wtf .. why is this not leaking memory ? - pp_data->links_to_ignore = NULL; - - pp_data->num_words = sublinkage->num_words; - - /* Grab more memory if needed */ - if (pp_data->vlength <= pp_data->num_words) - { - size_t newsz; - pp_data->vlength += pp_data->num_words; - newsz = pp_data->vlength * sizeof(bool); - pp_data->visited = (bool *) realloc(pp_data->visited, newsz); - } - clear_visited(pp_data); - - /* In the name of responsible memory management, we retain a copy of the - * returned data structure pp_node as a field in pp, so that we can clear - * it out after every call, without relying on the user to do so. */ - clear_pp_node(pp); - - /* For long sentences, we can save some time by pruning the rules - * which can't possibly be used during postprocessing the linkages - * of this sentence. For short sentences, this is pointless. 
*/ - if (is_long && pp->q_pruned_rules == false) - { - prune_irrelevant_rules(pp); - } - pp->q_pruned_rules = true; - - switch (internal_process(pp, sublinkage, &msg)) - { - case -1: - /* some global test failed even before we had to build the domains */ - pp->n_global_rules_firing++; - pp->pp_node->violation = msg; - report_pp_stats(pp); - return pp->pp_node; - break; - case 1: - /* one of the "normal" post processing tests failed */ - pp->n_local_rules_firing++; - pp->pp_node->violation = msg; - break; - case 0: - /* This linkage is legal according to the post processing rules */ - pp->pp_node->violation = NULL; - break; - } - - report_pp_stats(pp); - - return pp->pp_node; -} - -/* OLD COMMENTS (OUT OF DATE): - This file does the post-processing. - The main routine is "post_process()". It uses the link names only, - and not the connectors. - - A domain is a set of links. Each domain has a defining link. - Only certain types of links serve to define a domain. These - parameters are set by the lists of link names in a separate, - human-readable file referred to herein as the 'knowledge file.' - - The domains are nested: given two domains, either they're disjoint, - or one contains the other, i.e. they're tree structured. The set of links - in a domain (but in no smaller domain) are called the "group" of the - domain. Data structures are built to store all this stuff. - The tree structured property is not mathematically guaranteed by - the domain construction algorithm. Davy simply claims that because - of how he built the dictionary, the domains will always be so - structured. The program checks this and gives an error message - if it's violated. - - Define the "root word" of a link (or domain) to be the word at the - left end of the link. The other end of the defining link is called - the "right word". 
- - The domain corresponding to a link is defined to be the set of links - reachable by starting from the right word, following links and never - using the root word or any word to its left. - - There are some minor exceptions to this. The "restricted_link" lists - those connectors that, even if they point back before the root word, - are included in the domain. Some of the starting links are included - in their domain, these are listed in the "domain_contains_links" list. - - Such was the way it was. Now Davy tells me there should be another type - of domain that's quite different. Let's call these "urfl" domains. - Certain type of connectors start urfl domains. They're listed below. - In a urfl domain, the search includes the root word. It does a separate - search to find urfl domains. - - Restricted links should work just as they do with ordinary domains. If they - come out of the right word, or anything to the right of it (that's - in the domain), they should be included but should not be traced - further. If they come out of the root word, they should not be - included. - */ - -/* - I also, unfortunately, want to propose a new type of domain. These - would include everything that can be reached from the root word of the - link, to the right, that is closer than the right word of the link. - (They would not include the link itself.) - - In the following sentence, then, the "Urfl_Only Domain" of the G link - would include only the "O" link: - - +-----G----+ - +---O--+ +-AI+ - | | | | - hitting dogs is fun.a - - In the following sentence it would include the "O", the "TT", the "I", - the second "O", and the "A". 
- - +----------------G---------------+ - +-----TT-----+ +-----O-----+ | - +---O---+ +-I+ +---A--+ +-AI+ - | | | | | | | | - telling people to do stupid things is fun.a - - This would allow us to judge the following: - - kicking dogs bores me - *kicking dogs kicks dogs - explaining the program is easy - *explaining the program is running - - (These are distinctions that I thought we would never be able to make, - so I told myself they were semantic rather than syntactic. But with - domains, they should be easy.) - */ - - /* Modifications, 6/96 ALB: - 1) Rules and link sets are relegated to a separate, user-written - file(s), herein referred to as the 'knowledge file' - 2) This information is read by a lexer, in pp_lexer.l (lex code) - whose exported routines are all prefixed by 'pp_lexer' - 3) when postprocessing a sentence, the links of each domain are - placed in a set for quick lookup, ('contains one' and 'contains none') - 4) Functions which were never called have been eliminated: - link_inhabits(), match_in_list(), group_type_contains(), - group_type_contains_one(), group_type_contains_all() - 5) Some 'one-by-one' initializations have been replaced by faster - block memory operations (memset etc.) - 6) The above comments are correct but incomplete! (1/97) - 7) observation: the 'contains one' is, empirically, by far the most - violated rule, so it should come first in applying the rules. - - Modifications, 9/97 ALB: - Deglobalization. Made code consistent with api. 
- */ - -/* ------ Deprecated functions, remove these someday ------ */ -PostProcessor * post_process_open(const char *path) -{ - pp_knowledge *kno = pp_knowledge_open(path); - if (NULL == kno) return NULL; - return post_process_new(kno); -} - -void post_process_close(PostProcessor *pp) -{ - pp_knowledge_close(pp->knowledge); - post_process_free(pp); -} - -void linkage_post_process(Linkage lkg, Postprocessor * pp) {} diff -Nru link-grammar-5.3.16/link-grammar/post-process.h link-grammar-5.5.0/link-grammar/post-process.h --- link-grammar-5.3.16/link-grammar/post-process.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/post-process.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,50 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2014 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ -/********************************************************************** - Calling paradigm: - . call post_process_new() with the handle of a knowledge set. This - returns a handle, used for all subsequent calls to post-process. - . 
Do for each sentence: - - Do for each generated linkage of a sentence: - + call post_process_scan_linkage() - - Do for each generated linkage of a sentence: - + call do_post_process() - - Call post_process_free() -***********************************************************************/ - -#ifndef _POSTPROCESS_H_ -#define _POSTPROCESS_H_ - -#include "api-types.h" -#include "structures.h" - -void pp_new_domain_array(PP_data *); -void pp_free_domain_array(PP_data *); - -Postprocessor * post_process_new(pp_knowledge *); -void post_process_free(Postprocessor *); - -void post_process_free_data(PP_data * ppd); -void post_process_scan_linkage(Postprocessor *, Linkage); -PP_node *do_post_process(Postprocessor *, Linkage, bool); -bool post_process_match(const char *, const char *); /* utility function */ - -bool sane_linkage_morphism(Sentence, Linkage, Parse_Options); - -void linkage_free_pp_info(Linkage); - -void build_type_array(Postprocessor*); -void linkage_set_domain_names(Postprocessor*, Linkage); -void exfree_domain_names(PP_info *); - -#endif diff -Nru link-grammar-5.3.16/link-grammar/pp_knowledge.c link-grammar-5.5.0/link-grammar/pp_knowledge.c --- link-grammar-5.3.16/link-grammar/pp_knowledge.c 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/pp_knowledge.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,428 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -/*********************************************************************** - pp_knowledge.c - 7/97 - Contains rules and associated information for post processing. This - information is supplied in a human-readable file and is parsed by - pp_lexer.h -***********************************************************************/ - -#include "externs.h" -#include "pp_knowledge.h" -#include "pp_lexer.h" -#include "pp_linkset.h" -#include "string-set.h" - -#define D_PPK 10 /* verbosity level for this file */ -#define PP_MAX_UNIQUE_LINK_NAMES 1024 /* just needs to be approximate */ - -/****************** non-exported functions ***************************/ - -static bool check_domain_is_legal(pp_knowledge *k, const char *p) -{ - if (0x0 != p[1]) - { - prt_error("Error: File %s: Domain (%s) must be a single character\n", - k->path, p); - return false; - } - return true; -} - -static void initialize_set_of_links_starting_bounded_domain(pp_knowledge *k) -{ - int i,j,d,domain_of_rule; - k->set_of_links_starting_bounded_domain = - pp_linkset_open(PP_MAX_UNIQUE_LINK_NAMES); - for (i=0; k->bounded_rules[i].msg!=0; i++) - { - domain_of_rule = k->bounded_rules[i].domain; - for (j=0; (d=(k->starting_link_lookup_table[j].domain))!=-1; j++) - if (d==domain_of_rule) - pp_linkset_add(k->set_of_links_starting_bounded_domain, - k->starting_link_lookup_table[j].starting_link); - } -} - -/** - * Read table of [link, domain type]. - * This tells us what domain type each link belongs to. - * This lookup table *must* be defined in the knowledge file. 
- */ -static bool read_starting_link_table(pp_knowledge *k) -{ - const char *p; - const char label[] = "STARTING_LINK_TYPE_TABLE"; - size_t i, even; - int n_tokens; - - if (!pp_lexer_set_label(k->lt, label)) - { - prt_error("Error: File %s: Couldn't find starting link table %s\n", - k->path, label); - return false; - } - - n_tokens = pp_lexer_count_tokens_of_label(k->lt); - if (-1 == n_tokens) return false; - even = n_tokens % 2; - if(0 != even) - { - prt_error("Error: Link table must have format [ ]+\n"); - return false; - } - - k->nStartingLinks = n_tokens/2; - k->starting_link_lookup_table = (StartingLinkAndDomain*) - xalloc((1+k->nStartingLinks)*sizeof(StartingLinkAndDomain)); - for (i=0; inStartingLinks; i++) - { - /* read the starting link itself */ - k->starting_link_lookup_table[i].starting_link = - string_set_add(pp_lexer_get_next_token_of_label(k->lt),k->string_set); - - /* read the domain type of the link */ - p = pp_lexer_get_next_token_of_label(k->lt); - if (!check_domain_is_legal(k, p)) return false; - k->starting_link_lookup_table[i].domain = (int) p[0]; - } - - /* end sentinel */ - k->starting_link_lookup_table[k->nStartingLinks].domain = -1; - return true; -} - -static pp_linkset LINK_SET_ERROR; /* sentinel value for error */ - -static pp_linkset *read_link_set(pp_knowledge *k, - const char *label, String_set *ss) -{ - /* read link set, marked by label in knowledge file, into a set of links - whose handle is returned. Return NULL if link set not defined in file, - in which case the set is taken to be empty. 
*/ - int n_strings,i; - pp_linkset *ls; - if (!pp_lexer_set_label(k->lt, label)) - { - if (verbosity_level(+D_PPK)) - prt_error("Warning: File %s: Link set %s not defined: assuming empty\n", - k->path, label); - n_strings = 0; - } - else - { - n_strings = pp_lexer_count_tokens_of_label(k->lt); - if (-1 == n_strings) return &LINK_SET_ERROR; - } - ls = pp_linkset_open(n_strings); - for (i=0; ilt),ss)); - return ls; -} - -static bool read_link_sets(pp_knowledge *k) -{ - String_set *ss = k->string_set; /* shorthand */ - k->domain_starter_links =read_link_set(k,"DOMAIN_STARTER_LINKS",ss); - if (&LINK_SET_ERROR == k->domain_starter_links) return false; - - k->urfl_domain_starter_links=read_link_set(k,"URFL_DOMAIN_STARTER_LINKS",ss); - if (&LINK_SET_ERROR == k->urfl_domain_starter_links) return false; - - k->domain_contains_links =read_link_set(k,"DOMAIN_CONTAINS_LINKS",ss); - if (&LINK_SET_ERROR == k->domain_contains_links) return false; - - k->ignore_these_links =read_link_set(k,"IGNORE_THESE_LINKS",ss); - if (&LINK_SET_ERROR == k->ignore_these_links) return false; - - k->restricted_links =read_link_set(k,"RESTRICTED_LINKS",ss); - if (&LINK_SET_ERROR == k->domain_starter_links) return false; - - k->must_form_a_cycle_links =read_link_set(k,"MUST_FORM_A_CYCLE_LINKS",ss); - if (&LINK_SET_ERROR == k->must_form_a_cycle_links) return false; - - k->urfl_only_domain_starter_links= - read_link_set(k,"URFL_ONLY_DOMAIN_STARTER_LINKS",ss); - if (&LINK_SET_ERROR == k->urfl_only_domain_starter_links) return false; - - k->left_domain_starter_links=read_link_set(k,"LEFT_DOMAIN_STARTER_LINKS",ss); - if (&LINK_SET_ERROR == k->left_domain_starter_links) return false; - - return true; -} - -static void free_link_sets(pp_knowledge *k) -{ - pp_linkset_close(k->domain_starter_links); - pp_linkset_close(k->urfl_domain_starter_links); - pp_linkset_close(k->domain_contains_links); - pp_linkset_close(k->ignore_these_links); - pp_linkset_close(k->restricted_links); - 
pp_linkset_close(k->must_form_a_cycle_links); - pp_linkset_close(k->urfl_only_domain_starter_links); - pp_linkset_close(k->left_domain_starter_links); -} - -static bool read_form_a_cycle_rules(pp_knowledge *k, const char *label) -{ - size_t n_commas, n_tokens; - size_t r, i; - pp_linkset *lsHandle; - const char **tokens; - if (!pp_lexer_set_label(k->lt, label)) { - k->n_form_a_cycle_rules = 0; - if (verbosity_level(+D_PPK)) - prt_error("Warning: File %s: Not using any 'form a cycle' rules\n", - k->path); - } - else { - n_commas = pp_lexer_count_commas_of_label(k->lt); - k->n_form_a_cycle_rules = (n_commas + 1)/2; - } - k->form_a_cycle_rules= - (pp_rule*) xalloc ((1+k->n_form_a_cycle_rules)*sizeof(pp_rule)); - for (r=0; rn_form_a_cycle_rules; r++) - { - /* read link set */ - tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); - if (n_tokens <= 0) - { - prt_error("Error: File %s: Syntax error\n", k->path); - return false; - } - lsHandle = pp_linkset_open(n_tokens); - for (i=0; istring_set)); - k->form_a_cycle_rules[r].link_set = lsHandle; - - /* read error message */ - tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); - if (n_tokens > 1) - { - prt_error("Error: File %s: Invalid syntax (rule %zu of %s)\n", - k->path, r+1,label); - return false; - } - k->form_a_cycle_rules[r].msg = string_set_add(tokens[0], k->string_set); - k->form_a_cycle_rules[r].use_count = 0; - } - - /* sentinel entry */ - k->form_a_cycle_rules[k->n_form_a_cycle_rules].msg = 0; - k->form_a_cycle_rules[k->n_form_a_cycle_rules].use_count = 0; - - return true; -} - -static bool read_bounded_rules(pp_knowledge *k, const char *label) -{ - const char **tokens; - size_t n_commas, n_tokens; - size_t r; - if (!pp_lexer_set_label(k->lt, label)) { - k->n_bounded_rules = 0; - if (verbosity_level(+D_PPK)) - prt_error("Warning: File %s: Not using any 'bounded' rules\n", k->path); - } - else { - n_commas = pp_lexer_count_commas_of_label(k->lt); - k->n_bounded_rules = 
(n_commas + 1)/2; - } - k->bounded_rules = (pp_rule*) xalloc ((1+k->n_bounded_rules)*sizeof(pp_rule)); - for (r=0; rn_bounded_rules; r++) - { - /* read domain */ - tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); - if (n_tokens!=1) - { - prt_error("Error: File %s: Invalid syntax: rule %zu of %s\n", - k->path, r+1,label); - return false; - } - k->bounded_rules[r].domain = (int) tokens[0][0]; - - /* read error message */ - tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); - if (n_tokens!=1) - { - prt_error("Error: File %s: Invalid syntax: rule %zu of %s\n", - k->path, r+1,label); - return false; - } - k->bounded_rules[r].msg = string_set_add(tokens[0], k->string_set); - k->bounded_rules[r].use_count = 0; - } - - /* sentinel entry */ - k->bounded_rules[k->n_bounded_rules].msg = 0; - k->bounded_rules[k->n_bounded_rules].use_count = 0; - - return true; -} - -static bool read_contains_rules(pp_knowledge *k, const char *label, - pp_rule **rules, size_t *nRules) -{ - /* Reading the 'contains_one_rules' and reading the - 'contains_none_rules' into their respective arrays */ - size_t n_tokens, i, r; - int n_commas; - const char *p; - const char **tokens; - if (!pp_lexer_set_label(k->lt, label)) { - *nRules = 0; - if (verbosity_level(+D_PPK)) - prt_error("Warning: File %s: Not using any %s rules\n", k->path, label); - } - else { - n_commas = pp_lexer_count_commas_of_label(k->lt); - if (-1 == n_commas) return false; - *nRules = (n_commas + 1)/3; - } - *rules = (pp_rule*) xalloc ((1+*nRules)*sizeof(pp_rule)); - for (r=0; r<*nRules; r++) - { - /* first read link */ - tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); - if (n_tokens > 1) - { - prt_error("Error: File %s: Invalid syntax in %s (rule %zu)\n", - k->path, label, r+1); - return false; - } - - (*rules)[r].selector = string_set_add(tokens[0], k->string_set); - - /* read link set */ - tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); - 
(*rules)[r].link_set = pp_linkset_open(n_tokens); - (*rules)[r].link_set_size = n_tokens; - (*rules)[r].link_array = (const char **) xalloc((1+n_tokens)*sizeof(const char*)); - for (i=0; istring_set); - pp_linkset_add((*rules)[r].link_set, p); - (*rules)[r].link_array[i] = p; - } - (*rules)[r].link_array[i]=0; /* NULL-terminator */ - - /* read error message */ - tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens); - if (n_tokens > 1) - { - prt_error("Error: File %s: Invalid syntax in %s (rule %zu)\n", - k->path, label, r+1); - return false; - } - - (*rules)[r].msg = string_set_add(tokens[0], k->string_set); - (*rules)[r].use_count = 0; - } - - /* sentinel entry */ - (*rules)[*nRules].msg = 0; - (*rules)[*nRules].use_count = 0; - - return true; -} - - -static bool read_rules(pp_knowledge *k) -{ - if (!read_form_a_cycle_rules(k, "FORM_A_CYCLE_RULES")) return false; - if (!read_bounded_rules(k, "BOUNDED_RULES")) return false; - if (!read_contains_rules(k, "CONTAINS_ONE_RULES" , - &(k->contains_one_rules), &(k->n_contains_one_rules))) - return false; - if (!read_contains_rules(k, "CONTAINS_NONE_RULES", - &(k->contains_none_rules), &(k->n_contains_none_rules))) - return false; - return true; -} - -static void free_rules(pp_knowledge *k) -{ - size_t r; - size_t rs = sizeof(pp_rule); - pp_rule *rule; - if (NULL != k->contains_one_rules) - { - for (r=0; k->contains_one_rules[r].msg!=0; r++) - { - rule = &(k->contains_one_rules[r]); /* shorthand */ - xfree((void*) rule->link_array, (1+rule->link_set_size)*sizeof(char*)); - pp_linkset_close(rule->link_set); - } - for (r=0; k->contains_none_rules[r].msg!=0; r++) - { - rule = &(k->contains_none_rules[r]); /* shorthand */ - xfree((void *)rule->link_array, (1+rule->link_set_size)*sizeof(char*)); - pp_linkset_close(rule->link_set); - } - } - - for (r = 0; r < k->n_form_a_cycle_rules; r++) - pp_linkset_close(k->form_a_cycle_rules[r].link_set); - xfree((void*)k->bounded_rules, rs*(1+k->n_bounded_rules)); - 
xfree((void*)k->form_a_cycle_rules, rs*(1+k->n_form_a_cycle_rules)); - xfree((void*)k->contains_one_rules, rs*(1+k->n_contains_one_rules)); - xfree((void*)k->contains_none_rules, rs*(1+k->n_contains_none_rules)); -} - -/********************* exported functions ***************************/ - -pp_knowledge *pp_knowledge_open(const char *path) -{ - /* read knowledge from disk into pp_knowledge */ - FILE *f = dictopen(path, "r"); - if (NULL == f) - { - prt_error("Error: Couldn't find post-process knowledge file %s\n", path); - return NULL; - } - pp_knowledge *k = (pp_knowledge *) xalloc (sizeof(pp_knowledge)); - *k = (pp_knowledge){0}; - k->lt = pp_lexer_open(f); - fclose(f); - if (NULL == k->lt) goto failure; - k->string_set = string_set_create(); - k->path = string_set_add(path, k->string_set); - if (!read_starting_link_table(k)) goto failure; - - if (!read_link_sets(k)) goto failure; - if (!read_rules(k)) goto failure; - initialize_set_of_links_starting_bounded_domain(k); - return k; - -failure: - prt_error("Error: Unable to open knowledge file %s.\n", path); - pp_knowledge_close(k); - return NULL; -} - -void pp_knowledge_close(pp_knowledge *k) -{ - if (!k) return; - /* clear the memory taken up by k */ - xfree((void*)k->starting_link_lookup_table, - ((1+k->nStartingLinks)*sizeof(StartingLinkAndDomain))); - free_link_sets(k); - free_rules(k); - pp_linkset_close(k->set_of_links_starting_bounded_domain); - string_set_delete(k->string_set); - if (NULL != k->lt) pp_lexer_close(k->lt); - xfree((void*)k, sizeof(pp_knowledge)); -} - diff -Nru link-grammar-5.3.16/link-grammar/pp_knowledge.h link-grammar-5.5.0/link-grammar/pp_knowledge.h --- link-grammar-5.3.16/link-grammar/pp_knowledge.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/pp_knowledge.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John 
Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include "api-types.h" - -pp_knowledge *pp_knowledge_open(const char *path); -void pp_knowledge_close(pp_knowledge *knowledge); diff -Nru link-grammar-5.3.16/link-grammar/pp_lexer.c link-grammar-5.5.0/link-grammar/pp_lexer.c --- link-grammar-5.3.16/link-grammar/pp_lexer.c 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/pp_lexer.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1394 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2009, 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -/* XXX the original flex source for this file is missing !! */ - -#include -#include -#include -#include - -#include "error.h" -#include "pp_lexer.h" -#include "string-set.h" - - -/* If YY_ALWAYS_INTERACTIVE is set, then input is read as wide-char, - * instead of multi-byte. This perhaps fixes MS windows bugs ... 
- */ - -#ifdef _MSC_VER -#define YY_ALWAYS_INTERACTIVE 1 -#endif - -static int yywrap(void); /* --DS */ - -#define yy_create_buffer pp_lexer__create_buffer -#define yy_delete_buffer pp_lexer__delete_buffer -#define yy_flex_debug pp_lexer__flex_debug -#define yy_init_buffer pp_lexer__init_buffer -#define yy_flush_buffer pp_lexer__flush_buffer -#define yy_load_buffer_state pp_lexer__load_buffer_state -#define yy_switch_to_buffer pp_lexer__switch_to_buffer -#define yyin pp_lexer_in -#define yyleng pp_lexer_leng -#define yylex pp_lexer_lex -#define yyout pp_lexer_out -#define yyrestart pp_lexer_restart -#define yytext pp_lexer_text -#define yylineno pp_lexer_lineno - -/* A lexical scanner generated by flex */ -#define FLEX_SCANNER -#define YY_FLEX_MAJOR_VERSION 2 -#define YY_FLEX_MINOR_VERSION 5 - - -#define YY_PROTO(proto) proto - -/* Returned upon end-of-file. */ -#define YY_NULL 0 - -/* Promotes a possibly negative, possibly signed char to an unsigned - * integer for use as an array index. If the signed char is negative, - * we want to instead treat it as an 8-bit unsigned char, hence the - * double cast. - */ -#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c) - -/* Enter a start condition. This macro really ought to take a parameter, - * but we do it the disgusting crufty way forced on us by the ()-less - * definition of BEGIN. - */ -#define BEGIN yy_start = 1 + 2 * - -/* Translate the current start state into a value that can be later handed - * to BEGIN to return to the state. The YYSTATE alias is for lex - * compatibility. - */ -#define YY_START ((yy_start - 1) / 2) -#define YYSTATE YY_START - -/* Action number for EOF rule of a given start state. */ -#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) - -/* Special action meaning "start processing a new file". */ -#define YY_NEW_FILE yyrestart( yyin ) - -#define YY_END_OF_BUFFER_CHAR 0 - -/* Size of default input buffer. 
*/ -#define YY_BUF_SIZE 16384 - -typedef struct yy_buffer_state *YY_BUFFER_STATE; - -#define EOB_ACT_CONTINUE_SCAN 0 -#define EOB_ACT_END_OF_FILE 1 -#define EOB_ACT_LAST_MATCH 2 - -/* The funky do-while in the following #define is used to turn the definition - * int a single C statement (which needs a semi-colon terminator). This - * avoids problems with code like: - * - * if ( condition_holds ) - * yyless( 5 ); - * else - * do_something_else(); - * - * Prior to using the do-while the compiler would get upset at the - * "else" because it interpreted the "if" statement as being all - * done when it reached the ';' after the yyless() call. - */ - -/* Return all but the first 'n' matched characters back to the input stream. */ - -#define yyless(n) \ - do \ - { \ - /* Undo effects of setting up yytext. */ \ - *yy_cp = yy_hold_char; \ - yy_c_buf_p = yy_cp = yy_bp + n - YY_MORE_ADJ; \ - YY_DO_BEFORE_ACTION; /* set up yytext again */ \ - } \ - while ( 0 ) - -/* The following is because we cannot portably get our hands on size_t - * (without autoconf's help, which isn't available because we want - * flex-generated scanners to compile on their own). - */ -typedef unsigned int yy_size_t; - - -struct yy_buffer_state - { - FILE *yy_input_file; - - char *yy_ch_buf; /* input buffer */ - char *yy_buf_pos; /* current position in input buffer */ - - /* Size of input buffer in bytes, not including room for EOB - * characters. - */ - yy_size_t yy_buf_size; - - /* Number of characters read into yy_ch_buf, not including EOB - * characters. - */ - int yy_n_chars; - - /* Whether we "own" the buffer - i.e., we know we created it, - * and can realloc() it to grow it, and should free() it to - * delete it. - */ - int yy_is_our_buffer; - - /* Whether this is an "interactive" input source; if so, and - * if we're using stdio for input, then we want to use getwc() - * instead of fread(), to make sure we stop fetching input after - * each newline. 
- */ - int yy_is_interactive; - - /* Whether we're considered to be at the beginning of a line. - * If so, '^' rules will be active on the next match, otherwise - * not. - */ - int yy_at_bol; - - /* Whether to try to fill the input buffer when we reach the - * end of it. - */ - int yy_fill_buffer; - - int yy_buffer_status; -#define YY_BUFFER_NEW 0 -#define YY_BUFFER_NORMAL 1 - /* When an EOF's been seen but there's still some text to process - * then we mark the buffer as YY_EOF_PENDING, to indicate that we - * shouldn't try reading from the input source any more. We might - * still have a bunch of tokens to match, though, because of - * possible backing-up. - * - * When we actually see the EOF, we change the status to "new" - * (via yyrestart()), so that the user can continue scanning by - * just pointing yyin at a new input file. - */ -#define YY_BUFFER_EOF_PENDING 2 - }; - -static YY_BUFFER_STATE yy_current_buffer = 0; - -/* We provide macros for accessing buffer states in case in the - * future we want to put the buffer states in a more general - * "scanner state". - */ -#define YY_CURRENT_BUFFER yy_current_buffer - - -/* yy_hold_char holds the character lost when yytext is formed. */ -static char yy_hold_char; - -static int yy_n_chars; /* number of characters read into yy_ch_buf */ - - -static int yyleng; - -/* Points to current character in buffer. */ -static char *yy_c_buf_p = (char *) 0; -static int yy_init = 1; /* whether we need to initialize */ -static int yy_start = 0; /* start state number */ - -/* Flag which is used to allow yywrap()'s to do buffer switches - * instead of setting up a fresh yyin. A bit of a hack ... 
- */ -static int yy_did_buffer_switch_on_eof; - -void yyrestart YY_PROTO(( FILE *input_file )); - -void yy_switch_to_buffer YY_PROTO(( YY_BUFFER_STATE new_buffer )); -void yy_load_buffer_state YY_PROTO(( void )); -YY_BUFFER_STATE yy_create_buffer YY_PROTO(( FILE *file, int size )); -void yy_delete_buffer YY_PROTO(( YY_BUFFER_STATE b )); -void yy_init_buffer YY_PROTO(( YY_BUFFER_STATE b, FILE *file )); -void yy_flush_buffer YY_PROTO(( YY_BUFFER_STATE b )); -#define YY_FLUSH_BUFFER yy_flush_buffer( yy_current_buffer ) - -static void *yy_flex_alloc YY_PROTO(( yy_size_t )); -/* static void *yy_flex_realloc YY_PROTO(( void *, yy_size_t )); --DS */ -static void yy_flex_free YY_PROTO(( void * )); - -#define yy_new_buffer yy_create_buffer - -#define yy_set_bol(at_bol) \ - { \ - if ( ! yy_current_buffer ) \ - yy_current_buffer = yy_create_buffer( yyin, YY_BUF_SIZE ); \ - yy_current_buffer->yy_at_bol = at_bol; \ - } - -#define YY_AT_BOL() (yy_current_buffer->yy_at_bol) - - -#define YY_USES_REJECT -typedef unsigned char YY_CHAR; -static FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0; -typedef int yy_state_type; -static int yylineno = 1; -static char *yytext; -#define yytext_ptr yytext - -static yy_state_type yy_get_previous_state YY_PROTO(( void )); -static yy_state_type yy_try_NUL_trans YY_PROTO(( yy_state_type current_state )); -static int yy_get_next_buffer YY_PROTO(( void )); -static void yy_fatal_error YY_PROTO(( const char msg[] )); - -/* Done after the current pattern has been matched and before the - * corresponding action - sets up yytext. 
- */ -#define YY_DO_BEFORE_ACTION \ - yytext_ptr = yy_bp; \ - yyleng = (int) (yy_cp - yy_bp); \ - yy_hold_char = *yy_cp; \ - *yy_cp = '\0'; \ - yy_c_buf_p = yy_cp; - -#define YY_NUM_RULES 8 -#define YY_END_OF_BUFFER 9 -static const short int yy_acclist[39] = - { 0, - 2, 2, 2, 2, 9, 5, 7, 8, 2, 7, - 8, 2, 8, 7, 8, 7, 8, 5, 7, 8, - 2, 7, 8, 5, 7, 8, 7, 8, 5, 3, - 2, 4, 5, 2, 5, 1, 3, 6 - } ; - -static const short int yy_accept[29] = - { 0, - 1, 2, 3, 4, 5, 6, 9, 12, 14, 16, - 18, 21, 24, 27, 29, 30, 31, 32, 32, 34, - 35, 35, 36, 36, 37, 38, 39, 39 - } ; - -static const int yy_ec[256] = - { 0, - 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 4, 1, 5, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 6, 7, 1, - 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1 - } ; - -static const int yy_meta[9] = - { 0, - 1, 2, 2, 3, 4, 5, 1, 1 - } ; - -static const short int yy_base[32] = - { 0, - 0, 7, 14, 21, 18, 11, 0, 0, 0, 66, - 10, 27, 34, 42, 7, 66, 0, 7, 4, 0, - 0, 0, 0, 66, 0, 66, 66, 50, 54, 57, - 60 - } ; - -static const short int yy_def[32] = - { 0, - 27, 1, 1, 3, 27, 28, 29, 29, 30, 27, - 31, 27, 27, 13, 28, 27, 29, 30, 31, 12, - 14, 13, 14, 27, 14, 27, 0, 27, 27, 27, - 27 - } ; - -static const short int yy_nxt[75] = - { 0, - 6, 7, 8, 7, 9, 10, 6, 11, 12, 16, - 12, 26, 16, 13, 10, 16, 16, 27, 10, 27, - 10, 10, 12, 27, 12, 27, 27, 
14, 20, 17, - 20, 27, 27, 21, 22, 23, 24, 23, 23, 25, - 22, 22, 23, 27, 27, 27, 27, 23, 23, 23, - 15, 27, 27, 27, 15, 17, 17, 18, 27, 18, - 19, 27, 27, 27, 19, 5, 27, 27, 27, 27, - 27, 27, 27, 27 - } ; - -static const short int yy_chk[75] = - { 0, - 1, 1, 1, 1, 1, 1, 1, 1, 2, 19, - 2, 18, 15, 2, 3, 11, 6, 5, 3, 0, - 3, 3, 4, 0, 4, 0, 0, 4, 12, 12, - 12, 0, 0, 12, 13, 13, 13, 13, 13, 13, - 13, 13, 14, 0, 0, 0, 0, 14, 14, 14, - 28, 0, 0, 0, 28, 29, 29, 30, 0, 30, - 31, 0, 0, 0, 31, 27, 27, 27, 27, 27, - 27, 27, 27, 27 - } ; - -static yy_state_type yy_state_buf[YY_BUF_SIZE + 2], *yy_state_ptr; -static char *yy_full_match; -static int yy_lp; -#define REJECT \ -{ \ -*yy_cp = yy_hold_char; /* undo effects of setting up yytext */ \ -yy_cp = yy_full_match; /* restore poss. backed-over text */ \ -++yy_lp; \ -goto find_rule; \ -} -#define yymore() yymore_used_but_not_detected -#define YY_MORE_ADJ 0 -static char *yytext; -#define INITIAL 0 -/************************************************************************** - Lex specification for post-process knowledge file - 6/96 ALB - Updated 8/97 to allow multiple instances - Compile with either - 1) flex pp_lexer.fl (on systems which support %option prefix) OR - 2) flex pp_lexer.fl - mv lex.yy.c pp_lexer.tmp.c - cat pp_lexer.tmp.c | sed "s/yy/pp_lexer_/g" > pp_lexer.c - rm -f pp_lexer.tmp.c - (on systems which do not) - - In the case of (1), uncomment the three %option lines below. 
-**************************************************************************/ - -/* forward references for non-exported functions (and static variable) */ -static bool check_string(const char *str); -static void setup(PPLexTable *lt); -static bool set_label(PPLexTable *lt, const char *label); -static bool add_string_to_label(PPLexTable *lt, const char *str); -static bool add_set_of_strings_to_label(PPLexTable *lt,const char *label_of_set); -static int get_index_of_label(PPLexTable *lt, const char *label); -static PPLexTable *clt=NULL; /* ptr to lex table we're currently filling in */ -/* see above */ -#define INCLUDE 1 - -/* Macros after this point can all be overridden by user definitions in - * section 1. - */ -#include - -/* Amount of stuff to slurp up with each read. */ -#ifndef YY_READ_BUF_SIZE -#define YY_READ_BUF_SIZE 8192 -#endif - -/* Copy whatever the last rule matched to the standard output. */ - -#ifndef ECHO -/* This used to be an fputs(), but since the string might contain NUL's, - * we now use fwrite(). - */ -#define ECHO (void) fwrite( yytext, yyleng, 1, yyout ) -#endif - -/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, - * is returned in "result". 
- */ -#ifndef YY_INPUT -#define YY_INPUT(buf,result,max_size) \ - if ( yy_current_buffer->yy_is_interactive ) \ - { \ - int c = '*'; \ - int n; \ - for ( n = 0; n < max_size && \ - (c = lg_fgetc( yyin )) != EOF && c != '\n'; ) \ - buf[n++] = c; \ - if ( c == '\n' ) \ - buf[n++] = '\n'; \ - if ( c == EOF && ferror( yyin ) ) \ - {prt_error("Fatal error: Input in flex scanner failed\n"); reterror();} \ - result = n; \ - } \ - else if ( ((result = fread( buf, 1, max_size, yyin )) == 0) \ - && ferror( yyin ) ) \ - {prt_error("Fatal error: Input in flex scanner failed\n"); reterror();} -#endif - -/* No semi-colon after return; correct usage is to write "yyterminate();" - - * we don't want an extra ';' after the "return" because that will cause - * some compilers to complain about unreachable statements. - */ -#ifndef yyterminate -#define yyterminate() return YY_NULL -#endif - -/* Definitions for error recovery. */ -#define FATAL_ERROR (-1) -#define reterror() return FATAL_ERROR - -/* Number of entries by which start-condition stack grows. */ -#ifndef YY_START_STACK_INCR -#define YY_START_STACK_INCR 25 -#endif - -/* Report a fatal error. */ -#ifndef YY_FATAL_ERROR -#define YY_FATAL_ERROR(msg) yy_fatal_error( msg ) -#endif - -/* Default declaration of generated scanner - a define so the user can - * easily add parameters. - */ -#ifndef YY_DECL -#define YY_DECL int yylex YY_PROTO(( void )) -#endif - -/* Code executed at the beginning of each rule, after yytext and yyleng - * have been set up. - */ -#ifndef YY_USER_ACTION -#define YY_USER_ACTION -#endif - -/* Code executed at the end of each rule. 
*/ -#ifndef YY_BREAK -#define YY_BREAK break; -#endif - -#define YY_RULE_SETUP \ - if ( yyleng > 0 ) \ - yy_current_buffer->yy_at_bol = \ - (yytext[yyleng - 1] == '\n'); \ - YY_USER_ACTION - -YY_DECL; -YY_DECL -{ - yy_state_type yy_current_state; - char *yy_cp, *yy_bp; - int yy_act; - - /* #line 56 "pp_lexer.fl" --DS */ - - if ( yy_init ) - { - yy_init = 0; - -#ifdef YY_USER_INIT - YY_USER_INIT; -#endif - - if ( ! yy_start ) - yy_start = 1; /* first start state */ - - if ( ! yyin ) - yyin = stdin; - - if ( ! yyout ) - yyout = stdout; - - if ( ! yy_current_buffer ) - yy_current_buffer = - yy_create_buffer( yyin, YY_BUF_SIZE ); - - yy_load_buffer_state(); - } - - while ( 1 ) /* loops until end-of-file is reached */ - { - yy_cp = yy_c_buf_p; - - /* Support of yytext. */ - *yy_cp = yy_hold_char; - - /* yy_bp points to the position in yy_ch_buf of the start of - * the current run. - */ - yy_bp = yy_cp; - - yy_current_state = yy_start; - yy_current_state += YY_AT_BOL(); - yy_state_ptr = yy_state_buf; - *yy_state_ptr++ = yy_current_state; -yy_match: - do - { - register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)]; - while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) - { - yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 28 ) - yy_c = yy_meta[(unsigned int) yy_c]; - } - yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; - *yy_state_ptr++ = yy_current_state; - ++yy_cp; - } - while ( yy_base[yy_current_state] != 66 ); - -yy_find_action: - yy_current_state = *--yy_state_ptr; - yy_lp = yy_accept[yy_current_state]; -/* find_rule: --DS */ /* we branch to this label when backing up */ - for ( ; ; ) /* until we find what rule we matched */ - { - if ( yy_lp && yy_lp < yy_accept[yy_current_state + 1] ) - { - yy_act = yy_acclist[yy_lp]; - { - yy_full_match = yy_cp; - break; - } - } - --yy_cp; - yy_current_state = *--yy_state_ptr; - yy_lp = yy_accept[yy_current_state]; - } - - YY_DO_BEFORE_ACTION; - - if ( yy_act 
!= YY_END_OF_BUFFER ) - { - int yyl; - for ( yyl = 0; yyl < yyleng; ++yyl ) - if ( yytext[yyl] == '\n' ) - ++yylineno; - } - -do_action: /* This label is used only to access EOF actions. */ - - - switch ( yy_act ) - { /* beginning of action switch */ -case 1: -*yy_cp = yy_hold_char; /* undo effects of setting up yytext */ -yy_cp -= 1; -YY_DO_BEFORE_ACTION; /* set up yytext again */ -YY_RULE_SETUP - /* #line 58 "pp_lexer.fl" --DS */ -; - YY_BREAK -case 2: -YY_RULE_SETUP - /* #line 59 "pp_lexer.fl" --DS */ -; - YY_BREAK -case 3: -YY_RULE_SETUP - /* #line 61 "pp_lexer.fl" --DS */ -{ if (!set_label(clt, yytext)) reterror(); } - YY_BREAK -case 4: -YY_RULE_SETUP - /* #line 62 "pp_lexer.fl" --DS */ -{ if (!add_set_of_strings_to_label(clt, &(yytext[1]))) reterror(); } - YY_BREAK -case 5: -YY_RULE_SETUP - /* #line 63 "pp_lexer.fl" --DS */ -{ add_string_to_label(clt, yytext); } - YY_BREAK -case 6: -YY_RULE_SETUP - /* #line 64 "pp_lexer.fl" --DS */ -{ if (!add_string_to_label(clt, yytext)) reterror(); } - YY_BREAK -case 7: -YY_RULE_SETUP - /* #line 66 "pp_lexer.fl" --DS */ -{ prt_error("Error: pp_lexer: unable to parse file (line %i).\n", yylineno); reterror(); } - YY_BREAK -case 8: -YY_RULE_SETUP - /* #line 68 "pp_lexer.fl" --DS */ -ECHO; - YY_BREAK - /* #line 754 "pp_lexer.c" --DS */ - case YY_STATE_EOF(INITIAL): - case YY_STATE_EOF(INCLUDE): - yyterminate(); - - case YY_END_OF_BUFFER: - { - /* Amount of text matched not including the EOB char. */ - int yy_amount_of_matched_text = (int) (yy_cp - yytext_ptr) - 1; - - /* Undo the effects of YY_DO_BEFORE_ACTION. */ - *yy_cp = yy_hold_char; - - if ( yy_current_buffer->yy_buffer_status == YY_BUFFER_NEW ) - { - /* We're scanning a new file or input source. It's - * possible that this happened because the user - * just pointed yyin at a new source and called - * yylex(). If so, then we have to assure - * consistency between yy_current_buffer and our - * globals. 
Here is the right place to do so, because - * this is the first action (other than possibly a - * back-up) that will match for the new input source. - */ - yy_n_chars = yy_current_buffer->yy_n_chars; - yy_current_buffer->yy_input_file = yyin; - yy_current_buffer->yy_buffer_status = YY_BUFFER_NORMAL; - } - - /* Note that here we test for yy_c_buf_p "<=" to the position - * of the first EOB in the buffer, since yy_c_buf_p will - * already have been incremented past the NUL character - * (since all states make transitions on EOB to the - * end-of-buffer state). Contrast this with the test - * in input(). - */ - if ( yy_c_buf_p <= &yy_current_buffer->yy_ch_buf[yy_n_chars] ) - { /* This was really a NUL. */ - yy_state_type yy_next_state; - - yy_c_buf_p = yytext_ptr + yy_amount_of_matched_text; - - yy_current_state = yy_get_previous_state(); - - /* Okay, we're now positioned to make the NUL - * transition. We couldn't have - * yy_get_previous_state() go ahead and do it - * for us because it doesn't know how to deal - * with the possibility of jamming (and we don't - * want to build jamming into it because then it - * will run more slowly). - */ - - yy_next_state = yy_try_NUL_trans( yy_current_state ); - - yy_bp = yytext_ptr + YY_MORE_ADJ; - - if ( yy_next_state ) - { - /* Consume the NUL. */ - yy_cp = ++yy_c_buf_p; - yy_current_state = yy_next_state; - goto yy_match; - } - - else - { - yy_cp = yy_c_buf_p; - goto yy_find_action; - } - } - - else switch ( yy_get_next_buffer() ) - { - case EOB_ACT_END_OF_FILE: - { - yy_did_buffer_switch_on_eof = 0; - - if ( yywrap() ) - { - /* Note: because we've taken care in - * yy_get_next_buffer() to have set up - * yytext, we can now set up - * yy_c_buf_p so that if some total - * hoser (like flex itself) wants to - * call the scanner after we return the - * YY_NULL, it'll still work - another - * YY_NULL will get returned. 
- */ - yy_c_buf_p = yytext_ptr + YY_MORE_ADJ; - - yy_act = YY_STATE_EOF(YY_START); - goto do_action; - } - - else - { - if ( ! yy_did_buffer_switch_on_eof ) - YY_NEW_FILE; - } - break; - } - - case EOB_ACT_CONTINUE_SCAN: - yy_c_buf_p = - yytext_ptr + yy_amount_of_matched_text; - - yy_current_state = yy_get_previous_state(); - - yy_cp = yy_c_buf_p; - yy_bp = yytext_ptr + YY_MORE_ADJ; - goto yy_match; - - case EOB_ACT_LAST_MATCH: - yy_c_buf_p = - &yy_current_buffer->yy_ch_buf[yy_n_chars]; - - yy_current_state = yy_get_previous_state(); - - yy_cp = yy_c_buf_p; - yy_bp = yytext_ptr + YY_MORE_ADJ; - goto yy_find_action; - - case FATAL_ERROR: - return FATAL_ERROR; - } - break; - } - - default: - prt_error("Fatal error: flex scanner internal error--no action found\n"); - reterror(); - } /* end of action switch */ - } /* end of scanning one token */ - } /* end of yylex */ - - -/* yy_get_next_buffer - try to read in a new buffer - * - * Returns a code representing an action: - * EOB_ACT_LAST_MATCH - - * EOB_ACT_CONTINUE_SCAN - continue scanning from current position - * EOB_ACT_END_OF_FILE - end of file - */ - -static int yy_get_next_buffer(void) -{ - char *dest = yy_current_buffer->yy_ch_buf; - char *source = yytext_ptr; - int number_to_move, i; - int ret_val; - - if ( yy_c_buf_p > &yy_current_buffer->yy_ch_buf[yy_n_chars + 1] ) - { - prt_error("Fatal error: flex scanner internal error--end of buffer missed\n"); - reterror(); - } - - if ( yy_current_buffer->yy_fill_buffer == 0 ) - { /* Don't try to fill the buffer, so this is an EOF. */ - if ( yy_c_buf_p - yytext_ptr - YY_MORE_ADJ == 1 ) - { - /* We matched a singled characater, the EOB, so - * treat this as a final EOF. - */ - return EOB_ACT_END_OF_FILE; - } - - else - { - /* We matched some text prior to the EOB, first - * process it. - */ - return EOB_ACT_LAST_MATCH; - } - } - - /* Try to read more data. */ - - /* First move last chars to start of buffer. 
*/ - number_to_move = (int) (yy_c_buf_p - yytext_ptr) - 1; - - for ( i = 0; i < number_to_move; ++i ) - *(dest++) = *(source++); - - if ( yy_current_buffer->yy_buffer_status == YY_BUFFER_EOF_PENDING ) - /* don't do the read, it's not guaranteed to return an EOF, - * just force an EOF - */ - yy_n_chars = 0; - - else - { - int num_to_read = - yy_current_buffer->yy_buf_size - number_to_move - 1; - - while ( num_to_read <= 0 ) - { /* Not enough room in the buffer - grow it. */ -#ifdef YY_USES_REJECT - prt_error("Fatal error: " -"Input buffer overflow, can't enlarge buffer because scanner uses REJECT\n"); - reterror(); -#else - - /* just a shorter name for the current buffer */ - YY_BUFFER_STATE b = yy_current_buffer; - - int yy_c_buf_p_offset = - (int) (yy_c_buf_p - b->yy_ch_buf); - - if ( b->yy_is_our_buffer ) - { - int new_size = b->yy_buf_size * 2; - - if ( new_size <= 0 ) - b->yy_buf_size += b->yy_buf_size / 8; - else - b->yy_buf_size *= 2; - - b->yy_ch_buf = (char *) - /* Include room in for 2 EOB chars. */ - yy_flex_realloc( (void *) b->yy_ch_buf, - b->yy_buf_size + 2 ); - } - else - /* Can't grow it, we don't own it. */ - b->yy_ch_buf = 0; - - if ( ! b->yy_ch_buf ) - { - prt_error("Fatal error: scanner input buffer overflow\n"); - reterror(); - } - - yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset]; - - num_to_read = yy_current_buffer->yy_buf_size - - number_to_move - 1; -#endif - } - - if ( num_to_read > YY_READ_BUF_SIZE ) - num_to_read = YY_READ_BUF_SIZE; - - /* Read in more data. 
*/ - YY_INPUT( (&yy_current_buffer->yy_ch_buf[number_to_move]), - yy_n_chars, num_to_read); - } - - if ( yy_n_chars == 0 ) - { - if ( number_to_move == YY_MORE_ADJ ) - { - ret_val = EOB_ACT_END_OF_FILE; - yyrestart( yyin ); - } - - else - { - ret_val = EOB_ACT_LAST_MATCH; - yy_current_buffer->yy_buffer_status = - YY_BUFFER_EOF_PENDING; - } - } - - else - ret_val = EOB_ACT_CONTINUE_SCAN; - - yy_n_chars += number_to_move; - yy_current_buffer->yy_ch_buf[yy_n_chars] = YY_END_OF_BUFFER_CHAR; - yy_current_buffer->yy_ch_buf[yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR; - - yytext_ptr = &yy_current_buffer->yy_ch_buf[0]; - - return ret_val; - } - - -/* yy_get_previous_state - get the state just before the EOB char was reached */ - -static yy_state_type yy_get_previous_state(void) -{ - register yy_state_type yy_current_state; - register char *yy_cp; - - yy_current_state = yy_start; - yy_current_state += YY_AT_BOL(); - yy_state_ptr = yy_state_buf; - *yy_state_ptr++ = yy_current_state; - - for ( yy_cp = yytext_ptr + YY_MORE_ADJ; yy_cp < yy_c_buf_p; ++yy_cp ) - { - register YY_CHAR yy_c = (*yy_cp ? 
yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); - while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) - { - yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 28 ) - yy_c = yy_meta[(unsigned int) yy_c]; - } - yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; - *yy_state_ptr++ = yy_current_state; - } - - return yy_current_state; - } - - -/* yy_try_NUL_trans - try to make a transition on the NUL character - * - * synopsis - * next_state = yy_try_NUL_trans( current_state ); - */ - -static yy_state_type yy_try_NUL_trans( yy_state_type yy_current_state ) - { - register int yy_is_jam; - - register YY_CHAR yy_c = 1; - while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) - { - yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 28 ) - yy_c = yy_meta[(unsigned int) yy_c]; - } - yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; - *yy_state_ptr++ = yy_current_state; - yy_is_jam = (yy_current_state == 27); - - return yy_is_jam ? 0 : yy_current_state; - } - - -void yyrestart( FILE *input_file ) - { - if ( ! yy_current_buffer ) - yy_current_buffer = yy_create_buffer( yyin, YY_BUF_SIZE ); - - yy_init_buffer( yy_current_buffer, input_file ); - yy_load_buffer_state(); - } - - -void yy_switch_to_buffer( YY_BUFFER_STATE new_buffer ) - { - if ( yy_current_buffer == new_buffer ) - return; - - if ( yy_current_buffer ) - { - /* Flush out information for old buffer. */ - *yy_c_buf_p = yy_hold_char; - yy_current_buffer->yy_buf_pos = yy_c_buf_p; - yy_current_buffer->yy_n_chars = yy_n_chars; - } - - yy_current_buffer = new_buffer; - yy_load_buffer_state(); - - /* We don't actually know whether we did this switch during - * EOF (yywrap()) processing, but the only time this flag - * is looked at is after yywrap() is called, so it's safe - * to go ahead and always set it. 
- */ - yy_did_buffer_switch_on_eof = 1; - } - - -void yy_load_buffer_state( void ) - { - yy_n_chars = yy_current_buffer->yy_n_chars; - yytext_ptr = yy_c_buf_p = yy_current_buffer->yy_buf_pos; - yyin = yy_current_buffer->yy_input_file; - yy_hold_char = *yy_c_buf_p; - } - - -YY_BUFFER_STATE yy_create_buffer( FILE *file, int size ) - { - YY_BUFFER_STATE b; - - b = (YY_BUFFER_STATE) yy_flex_alloc( sizeof( struct yy_buffer_state ) ); - if ( ! b ) - YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); - - b->yy_buf_size = size; - - /* yy_ch_buf has to be 2 characters longer than the size given because - * we need to put in 2 end-of-buffer characters. - */ - b->yy_ch_buf = (char *) yy_flex_alloc( b->yy_buf_size + 2 ); - if ( ! b->yy_ch_buf ) - YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); - - b->yy_is_our_buffer = 1; - - yy_init_buffer( b, file ); - - return b; - } - - -void yy_delete_buffer( YY_BUFFER_STATE b ) - { - if ( b == NULL ) return; - - if ( b == yy_current_buffer ) - yy_current_buffer = (YY_BUFFER_STATE) 0; - - if ( b->yy_is_our_buffer ) - { - yy_flex_free( (void *) b->yy_ch_buf ); - b->yy_ch_buf = NULL; - } - - yy_flex_free( (void *) b ); - b = NULL; - } - - -void yy_init_buffer( YY_BUFFER_STATE b, FILE *file ) - { - yy_flush_buffer( b ); - - b->yy_input_file = file; - b->yy_fill_buffer = 1; - - b->yy_is_interactive = 0; -#if YY_ALWAYS_INTERACTIVE - b->yy_is_interactive = 1; -#endif - } - - -void yy_flush_buffer( YY_BUFFER_STATE b ) - { - b->yy_n_chars = 0; - - /* We always need two end-of-buffer characters. The first causes - * a transition to the end-of-buffer state. The second causes - * a jam in that state. 
- */ - b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; - b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; - - b->yy_buf_pos = &b->yy_ch_buf[0]; - - b->yy_at_bol = 1; - b->yy_buffer_status = YY_BUFFER_NEW; - - if ( b == yy_current_buffer ) - yy_load_buffer_state(); - } - - -#ifndef YY_EXIT_FAILURE -#define YY_EXIT_FAILURE 2 -#endif - -static void yy_fatal_error( const char msg[] ) - { - (void) fprintf( stderr, "%s\n", msg ); - exit( YY_EXIT_FAILURE ); - } - - - -/* Redefine yyless() so it works in section 3 code. */ - -#undef yyless -#define yyless(n) \ - do \ - { \ - /* Undo effects of setting up yytext. */ \ - yytext[yyleng] = yy_hold_char; \ - yy_c_buf_p = yytext + n - YY_MORE_ADJ; \ - yy_hold_char = *yy_c_buf_p; \ - *yy_c_buf_p = '\0'; \ - yyleng = n; \ - } \ - while ( 0 ) - - -/* Internal utility routines. */ - -static void *yy_flex_alloc( yy_size_t size ) - { - return (void *) malloc( size ); - } - - -static void yy_flex_free( void *ptr ) - { - free( ptr ); - } - -/************************ exported functions ******************************/ - -void pp_lexer_close(PPLexTable *lt) -{ - int i; - pp_label_node *node,*next; - for (i=0; inodes_of_label[i]; - while (node) - { - next = node->next; - xfree(node, sizeof(pp_label_node)); - node=next; - } - } - string_set_delete(lt->string_set); - if (lt->tokens) free(lt->tokens); - xfree(lt, sizeof(PPLexTable)); - yy_delete_buffer(yy_current_buffer); - yy_current_buffer = NULL; - yy_c_buf_p = NULL; -} - -PPLexTable *pp_lexer_open(FILE *f) -{ - PPLexTable *lt; - bool yylex_ok = false; - - assert(f, "pp_lexer_open: passed a NULL file pointer"); - - // initialiaze all yy static globals!! We do this here, so - // that we can be called multiple times w/o crash. 
- yy_current_buffer = NULL; - yy_c_buf_p = NULL; - yy_init = 1; - yy_start = 0; - - yyin = f; /* redirect lex to look at the specified file */ - lt = (PPLexTable*) xalloc (sizeof(PPLexTable)); - setup(lt); - clt = lt; /* set lt to be the current table, which yylex will fill in */ - if (YY_NULL == yylex()) yylex_ok = true; - clt = NULL; - lt->idx_of_active_label=-1; - - yy_delete_buffer(yy_current_buffer); - yy_current_buffer = NULL; - if (!yylex_ok) - { - pp_lexer_close(lt); - return NULL; - } - return lt; -} - -int pp_lexer_set_label(PPLexTable *lt, const char *label) -{ - /* set lexer state to first node of this label */ - lt->idx_of_active_label = get_index_of_label(lt, label); - if (lt->idx_of_active_label==-1) return 0; /* label not found */ - lt->current_node_of_active_label=lt->nodes_of_label[lt->idx_of_active_label]; - return 1; -} - -int pp_lexer_count_tokens_of_label(PPLexTable *lt) -{ - /* counts all tokens, even the commas */ - int n; - pp_label_node *p; - if (lt->idx_of_active_label==-1) - { - prt_error("Error: pp_lexer: current label is invalid\n"); - return -1; - } - for (n=0, p=lt->nodes_of_label[lt->idx_of_active_label]; p;p=p->next, n++){} - return n; -} - - -const char *pp_lexer_get_next_token_of_label(PPLexTable *lt) -{ - /* retrieves next token of set label, or NULL if list exhausted */ - static const char *p; - if (lt->current_node_of_active_label==NULL) return NULL; - p = lt->current_node_of_active_label->str; - lt->current_node_of_active_label=lt->current_node_of_active_label->next; - return p; -} - -int pp_lexer_count_commas_of_label(PPLexTable *lt) -{ - int n; - pp_label_node *p; - if (lt->idx_of_active_label==-1) - { - prt_error("Error: pp_lexer: current label is invalid\n"); - return -1; - } - for (n=0,p=lt->nodes_of_label[lt->idx_of_active_label];p!=NULL;p=p->next) - if (!strcmp(p->str, ",")) n++; - return n; -} - -const char **pp_lexer_get_next_group_of_tokens_of_label(PPLexTable *lt, size_t *n_tokens) -{ - /* all tokens until next 
comma, null-terminated */ - int n; - pp_label_node *p; - - p = lt->current_node_of_active_label; - for (n=0; p!=NULL && strcmp(p->str,","); n++, p=p->next) {} - if (n > lt->extents) { - lt->extents = n; - lt->tokens = (const char **) realloc (lt->tokens, n * sizeof(const char*)); - } - - p = lt->current_node_of_active_label; - for (n=0; p!=NULL && strcmp(p->str,","); n++, p=p->next) - lt->tokens[n] = string_set_add(p->str, lt->string_set); - - /* advance "current node of label" state */ - lt->current_node_of_active_label = p; - if (p!=NULL) lt->current_node_of_active_label = p->next; - - *n_tokens = n; - return lt->tokens; -} - - -static int yywrap(void) -{ - /* must return 1 for end of input, 0 otherwise */ - return 1; -} - -/********************** non-exported functions ************************/ - -static void setup(PPLexTable *lt) -{ - int i; - for (i=0; inodes_of_label[i] = NULL; - lt->last_node_of_label[i] = NULL; - lt->labels[i] = NULL; - } - lt->string_set = string_set_create(); - lt->tokens = NULL; - lt->extents = 0; -} - -static bool set_label(PPLexTable *lt, const char *label) -{ - int i; - char *c; - char *label_sans_colon; - - /* check for and then slice off the trailing colon */ - label_sans_colon = strdup(label); - c=&(label_sans_colon[strlen(label_sans_colon)-1]); - if (*c != ':') - { - prt_error("Error: Label %s must end with :\n", label); - return false; - } - *c = 0; - - /* have we seen this label already? If so, abort */ - for (i=0;lt->labels[i]!=NULL && strcmp(lt->labels[i],label_sans_colon);i++) {} - if (lt->labels[i]!=NULL) - { - prt_error("Error: pp_lexer: label %s multiply defined!\n", label_sans_colon); - return false; - } - - /* new label. Store it */ - if (i == PP_LEXER_MAX_LABELS-1) - { - prt_error("Error: pp_lexer: too many labels. 
Raise PP_LEXER_MAX_LABELS\n"); - return false; - } - lt->labels[i] = string_set_add(label_sans_colon, lt->string_set); - lt->idx_of_active_label = i; - - free(label_sans_colon); - - return true; -} - - -static bool add_string_to_label(PPLexTable *lt, const char *str) -{ - /* add the single string str to the set of strings associated with label */ - pp_label_node *new_node; - - if (lt->idx_of_active_label == -1) - { - prt_error("Error: pp_lexer: invalid syntax (line %i)\n", yylineno); - return false; - } - - /* make sure string is legal */ - if (!check_string(str)) return false; - - /* create a new node in (as yet to be determined) linked list of strings */ - new_node = (pp_label_node *) xalloc (sizeof(pp_label_node)); - new_node->str = string_set_add(str, lt->string_set); - new_node->next = NULL; - - /* stick newly-created node at the *end* of the appropriate linked list */ - if (lt->last_node_of_label[lt->idx_of_active_label]==NULL) - { - /* first entry on linked list */ - lt->nodes_of_label[lt->idx_of_active_label] = new_node; - lt->last_node_of_label[lt->idx_of_active_label] = new_node; - } - else - { - /* non-first entry on linked list */ - lt->last_node_of_label[lt->idx_of_active_label]->next = new_node; - lt->last_node_of_label[lt->idx_of_active_label] = new_node; - } - - return true; -} - -static bool add_set_of_strings_to_label(PPLexTable *lt,const char *label_of_set) -{ - /* add the set of strings, defined earlier by label_of_set, to the set of - strings associated with the current label */ - pp_label_node *p; - int idx_of_label_of_set; - if (lt->idx_of_active_label==-1) - { - prt_error("Error: pp_lexer: invalid syntax (line %i)\n", yylineno); - return false; - } - if ((idx_of_label_of_set = get_index_of_label(lt, label_of_set))==-1) - { - prt_error("Error: pp_lexer: label %s must be defined before it's referred to (line %i)\n", - label_of_set, yylineno); - return false; - } - for (p=lt->nodes_of_label[idx_of_label_of_set]; p!=NULL; p=p->next) - 
add_string_to_label(lt, p->str); - - return true; -} - -static int get_index_of_label(PPLexTable *lt, const char *label) -{ - int i; - for (i=0; lt->labels[i]!=NULL; i++) - if (!strcmp(lt->labels[i], label)) return i; - return -1; -} - -static bool check_string(const char *str) -{ - if (strlen(str)>1 && strchr(str, ',')!=NULL) - { - prt_error("Error: pp_lexer: string %s contains a comma, which is a no-no.\n", str); - return false; - } - return true; -} - - diff -Nru link-grammar-5.3.16/link-grammar/pp_lexer.h link-grammar-5.5.0/link-grammar/pp_lexer.h --- link-grammar-5.3.16/link-grammar/pp_lexer.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/pp_lexer.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,34 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include -#include "structures.h" - -struct PPLexTable_s -{ - String_set *string_set; - const char *labels[PP_LEXER_MAX_LABELS]; /* array of labels */ - pp_label_node *nodes_of_label[PP_LEXER_MAX_LABELS]; /*str. 
for each label*/ - pp_label_node *last_node_of_label[PP_LEXER_MAX_LABELS]; /* efficiency */ - pp_label_node *current_node_of_active_label;/* state: curr node of label */ - int idx_of_active_label; /* read state: current label */ - const char **tokens; - int extents; -}; - -PPLexTable *pp_lexer_open(FILE *f); -void pp_lexer_close (PPLexTable *lt); -int pp_lexer_set_label (PPLexTable *lt, const char *label); -int pp_lexer_count_tokens_of_label (PPLexTable *lt); -const char *pp_lexer_get_next_token_of_label(PPLexTable *lt); -int pp_lexer_count_commas_of_label (PPLexTable *lt); -const char **pp_lexer_get_next_group_of_tokens_of_label(PPLexTable *lt, size_t *n_toks); diff -Nru link-grammar-5.3.16/link-grammar/pp_linkset.c link-grammar-5.5.0/link-grammar/pp_linkset.c --- link-grammar-5.3.16/link-grammar/pp_linkset.c 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/pp_linkset.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,161 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -/*********************************************************************** -pp_linkset.c -maintains sets of pointers to link names -Similar to string-set, except that the comparison and hashing functions are -tailored for links. More importantly, all we store here is pointers. It's up -to the caller to ensure that the pointers always point to something useful. 
-**********************************************************************/ - -#include - -#include "post-process.h" -#include "pp_linkset.h" - -#define LINKSET_SPARSENESS 2 -#define LINKSET_SEED_VALUE 37 - -static void clear_hash_table(pp_linkset *ls) -{ - memset(ls->hash_table,0,ls->hash_table_size*sizeof(pp_linkset_node *)); -} - -static void initialize(pp_linkset *ls, int size) -{ - ls->hash_table_size = size*LINKSET_SPARSENESS; - ls->population = 0; - ls->hash_table = - (pp_linkset_node**) xalloc (ls->hash_table_size*sizeof(pp_linkset_node *)); - clear_hash_table(ls); -} - -static unsigned int compute_hash(pp_linkset *ls, const char *str) -{ - /* hash is computed from capitalized prefix only */ - unsigned int i, hashval; - hashval = LINKSET_SEED_VALUE; - i = 0; - if (islower((int)str[0])) i++; /* skip head-dependent indicator */ - for (; isupper((int)str[i]); i++) - hashval = str[i] + 31*hashval; - hashval %= ls->hash_table_size; - return hashval; -} - -static pp_linkset_node *add_internal(pp_linkset *ls, const char *str) -{ - pp_linkset_node *p, *n; - unsigned int hashval; - - /* look for str (exactly) in linkset */ - hashval = compute_hash(ls, str); - for (p=ls->hash_table[hashval]; p!=0; p=p->next) - if (!strcmp(p->str,str)) return NULL; /* already present */ - - /* create a new node for u; stick it at head of linked list */ - n = (pp_linkset_node *) xalloc (sizeof(pp_linkset_node)); - n->next = ls->hash_table[hashval]; - n->str = str; - ls->hash_table[hashval] = n; - return n; -} - -pp_linkset *pp_linkset_open(int size) -{ - pp_linkset *ls; - if (size==0) return NULL; - ls = (pp_linkset *) xalloc (sizeof(pp_linkset)); - initialize(ls, size); - return ls; -} - -void pp_linkset_close(pp_linkset *ls) -{ - if (ls == NULL) return; - pp_linkset_clear(ls); /* free memory taken by linked lists */ - xfree((void*) ls->hash_table, ls->hash_table_size*sizeof(pp_linkset_node*)); - xfree((void*) ls, sizeof(pp_linkset)); -} - -void pp_linkset_clear(pp_linkset *ls) -{ - /* 
clear dangling linked lists, but retain hash table itself */ - unsigned int i; - pp_linkset_node *p; - if (ls == NULL) return; - for (i=0; ihash_table_size; i++) - { - p = ls->hash_table[i]; - while (p) - { - pp_linkset_node *q = p; - p = p->next; - xfree((void*) q, sizeof(pp_linkset_node)); - } - } - clear_hash_table(ls); - ls->population = 0; -} - -/** - * returns 0 if already there, 1 if new. Stores only the pointer - */ -bool pp_linkset_add(pp_linkset *ls, const char *str) -{ - assert(ls != NULL, - "Fatal Error: pp_linkset internal error: Trying to add to a null set"); - - if (add_internal(ls, str) == NULL) return false; - ls->population++; - return true; -} - -/** - * Set query. Returns 1 if str pp-matches something in the set, 0 otherwise - */ -bool pp_linkset_match(pp_linkset *ls, const char *str) -{ - int hashval; - pp_linkset_node *p; - if (ls == NULL) return false; - hashval = compute_hash(ls, str); - p = ls->hash_table[hashval]; - while (p != 0) - { - if (post_process_match(p->str, str)) return true; - p = p->next; - } - return false; -} - -bool pp_linkset_match_bw(pp_linkset *ls, const char *str) -{ - unsigned int hashval; - pp_linkset_node *p; - if (ls == NULL) return false; - hashval = compute_hash(ls, str); - p = ls->hash_table[hashval]; - while (p != 0) - { - if (post_process_match(str, p->str)) return true; - p = p->next; - } - return false; -} - -size_t pp_linkset_population(pp_linkset *ls) -{ - return (ls == NULL) ? 
0 : ls->population; -} - diff -Nru link-grammar-5.3.16/link-grammar/pp_linkset.h link-grammar-5.5.0/link-grammar/pp_linkset.h --- link-grammar-5.3.16/link-grammar/pp_linkset.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/pp_linkset.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,22 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include "structures.h" - -pp_linkset *pp_linkset_open(int size); -void pp_linkset_close (pp_linkset *ls); -void pp_linkset_clear (pp_linkset *ls); -bool pp_linkset_add (pp_linkset *ls, const char *str); -bool pp_linkset_match (pp_linkset *ls, const char *str); -bool pp_linkset_match_bw (pp_linkset *ls, const char *str); -size_t pp_linkset_population(pp_linkset *ls); - diff -Nru link-grammar-5.3.16/link-grammar/preparation.c link-grammar-5.5.0/link-grammar/preparation.c --- link-grammar-5.3.16/link-grammar/preparation.c 2017-02-10 20:16:17.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/preparation.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,171 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. 
*/ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include "api-structures.h" -#include "build-disjuncts.h" -#include "count.h" -#include "disjunct-utils.h" -#include "externs.h" -#include "preparation.h" -#include "print.h" -#include "prune.h" -#include "resources.h" -#include "string-set.h" -#include "structures.h" -#include "word-utils.h" - -static void -set_connector_list_length_limit(Connector *c, - Connector_set *conset, - int short_len, - bool all_short, - const char * ZZZ) -{ - for (; c!=NULL; c=c->next) - { - if (string_set_cmp (ZZZ, c->string)) - { - c->length_limit = 1; - } - else if (all_short || - (conset != NULL && !match_in_connector_set(conset, c))) - { - c->length_limit = short_len; - } - } -} - -static void -set_connector_length_limits(Sentence sent, Parse_Options opts) -{ - size_t i; - unsigned int len = opts->short_length; - bool all_short = opts->all_short; - Connector_set * ucs = sent->dict->unlimited_connector_set; - const char * ZZZ = string_set_add("ZZZ", sent->dict->string_set); - - if (0) - { - /* Not setting the length_limit saves observable time. However, if we - * would like to set the ZZZ connector length_limit to 1 for all - * sentences, we cannot do the following. */ - if (len >= sent->length) return; /* No point to enforce short_length. */ - } - - if (len > UNLIMITED_LEN) len = UNLIMITED_LEN; - - for (i=0; ilength; i++) - { - Disjunct *d; - for (d = sent->word[i].d; d != NULL; d = d->next) - { - set_connector_list_length_limit(d->left, ucs, len, all_short, ZZZ); - set_connector_list_length_limit(d->right, ucs, len, all_short, ZZZ); - } - } -} - - -/** - * Set c->nearest_word to the nearest word that this connector could - * possibly connect to. 
The connector *might*, in the end, - * connect to something more distant, but this is the nearest - * one that could be connected. - */ -static int set_dist_fields(Connector * c, size_t w, int delta) -{ - int i; - if (c == NULL) return (int) w; - i = set_dist_fields(c->next, w, delta) + delta; - c->nearest_word = i; - return i; -} - -/** - * Initialize the word fields of the connectors, and - * eliminate those disjuncts that are so long, that they - * would need to connect past the end of the sentence. - */ -static void setup_connectors(Sentence sent) -{ - size_t w; - Disjunct * d, * xd, * head; - for (w=0; wlength; w++) - { - head = NULL; - for (d=sent->word[w].d; d!=NULL; d=xd) - { - xd = d->next; - if ((set_dist_fields(d->left, w, -1) < 0) || - (set_dist_fields(d->right, w, 1) >= (int) sent->length)) - { - d->next = NULL; - free_disjuncts(d); - } - else - { - d->next = head; - head = d; - } - } - sent->word[w].d = head; - } -} - -/** - * Record the wordgraph word in each of its connectors. - * It is used for checking alternatives consistency. - */ -static void gword_record_in_connector(Sentence sent) -{ - for (size_t w = 0; w < sent->length; w++) { - for (Disjunct *d = sent->word[w].d; d != NULL; d = d->next) { - for (Connector *c = d->right; NULL != c; c = c->next) - c->originating_gword = d->originating_gword; - for (Connector *c = d->left; NULL != c; c = c->next) - c->originating_gword = d->originating_gword; - } - } -} - -/** - * Assumes that the sentence expression lists have been generated. - */ -void prepare_to_parse(Sentence sent, Parse_Options opts) -{ - size_t i; - - build_sentence_disjuncts(sent, opts->disjunct_cost); - if (verbosity_level(5)) { - printf("After expanding expressions into disjuncts:"); - print_disjunct_counts(sent); - } - print_time(opts, "Built disjuncts"); - - for (i=0; ilength; i++) { - sent->word[i].d = eliminate_duplicate_disjuncts(sent->word[i].d); - - /* Some long Russian sentences can really blow up, here. 
*/ - if (resources_exhausted(opts->resources)) - return; - } - print_time(opts, "Eliminated duplicate disjuncts"); - - if (verbosity_level(5)) { - printf("\nAfter expression pruning and duplicate elimination:\n"); - print_disjunct_counts(sent); - } - - gword_record_in_connector(sent); - set_connector_length_limits(sent, opts); - setup_connectors(sent); -} diff -Nru link-grammar-5.3.16/link-grammar/preparation.h link-grammar-5.5.0/link-grammar/preparation.h --- link-grammar-5.3.16/link-grammar/preparation.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/preparation.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,19 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _PREPARATION_H -#define _PREPARATION_H -#include "link-includes.h" - -void prepare_to_parse(Sentence, Parse_Options); -#endif /* _PREPARATION_H */ diff -Nru link-grammar-5.3.16/link-grammar/prepare/build-disjuncts.c link-grammar-5.5.0/link-grammar/prepare/build-disjuncts.c --- link-grammar-5.3.16/link-grammar/prepare/build-disjuncts.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/prepare/build-disjuncts.c 2018-02-15 03:03:12.000000000 +0000 @@ -0,0 +1,391 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +/* stuff for transforming a dictionary entry into a disjunct list */ + +#include +#include "build-disjuncts.h" +#include "connectors.h" +//#include "dict-common/dict-api.h" // for print_expression +#include "dict-common/dict-structures.h" // for Exp_struct +#include "disjunct-utils.h" +#include "utilities.h" + +/* Temporary connectors used while converting expressions into disjunct lists */ +typedef struct Tconnector_struct Tconnector; +struct Tconnector_struct +{ + char multi; /* TRUE if this is a multi-connector */ + char dir; /* '-' for left and '+' for right */ + Tconnector * next; + const condesc_t * condesc; +}; + +typedef struct clause_struct Clause; +struct clause_struct +{ + Clause * next; + double cost; + double maxcost; + Tconnector * c; +}; + +static void free_Tconnectors(Tconnector *e) +{ + Tconnector * n; + for(;e != NULL; e=n) + { + n = e->next; + xfree((char *)e, sizeof(Tconnector)); + } +} + +static void free_clause_list(Clause *c) +{ + Clause *c1; + while (c != NULL) + { + c1 = c->next; + free_Tconnectors(c->c); + xfree((char *)c, sizeof(Clause)); + c = c1; + } +} + +/** + * reverse the order of the list e. destructive + */ +static Tconnector * Treverse(Tconnector *e) +{ + Tconnector * head, *x; + head = NULL; + while (e != NULL) { + x = e->next; + e->next = head; + head = e; + e = x; + } + return head; +} + +/** + * reverse the order of the list e. destructive + */ +static Connector * reverse(Connector *e) +{ + Connector * head, *x; + head = NULL; + while (e != NULL) { + x = e->next; + e->next = head; + head = e; + e = x; + } + return head; +} + +/** + * Builds a new list of connectors that is the catenation of e1 with e2. + * does not effect lists e1 or e2. Order is maintained. 
+ */ +static Tconnector * catenate(Tconnector * e1, Tconnector * e2) +{ + Tconnector * e, * head; + head = NULL; + for (;e1 != NULL; e1 = e1->next) { + e = (Tconnector *) xalloc(sizeof(Tconnector)); + *e = *e1; + e->next = head; + head = e; + } + for (;e2 != NULL; e2 = e2->next) { + e = (Tconnector *) xalloc(sizeof(Tconnector)); + *e = *e2; + e->next = head; + head = e; + } + return Treverse(head); +} + +/** + * build the connector for the terminal node n + */ +static Tconnector * build_terminal(Exp * e) +{ + Tconnector * c; + c = (Tconnector *) xalloc(sizeof(Tconnector)); + c->condesc = e->u.condesc; + c->multi = e->multi; + c->dir = e->dir; + c->next = NULL; + return c; +} + +/** + * Build the clause for the expression e. Does not change e + */ +static Clause * build_clause(Exp *e) +{ + Clause *c = NULL, *c1, *c2, *c3, *c4, *c_head; + E_list * e_list; + + assert(e != NULL, "build_clause called with null parameter"); + if (e->type == AND_type) + { + c1 = (Clause *) xalloc(sizeof (Clause)); + c1->c = NULL; + c1->next = NULL; + c1->cost = 0.0; + c1->maxcost = 0.0; + for (e_list = e->u.l; e_list != NULL; e_list = e_list->next) + { + c2 = build_clause(e_list->e); + c_head = NULL; + for (c3 = c1; c3 != NULL; c3 = c3->next) + { + for (c4 = c2; c4 != NULL; c4 = c4->next) + { + c = (Clause *) xalloc(sizeof (Clause)); + c->cost = c3->cost + c4->cost; + c->maxcost = fmaxf(c3->maxcost,c4->maxcost); + c->c = catenate(c3->c, c4->c); + c->next = c_head; + c_head = c; + } + } + free_clause_list(c1); + free_clause_list(c2); + c1 = c_head; + } + c = c1; + } + else if (e->type == OR_type) + { + /* we'll catenate the lists of clauses */ + c = NULL; + for (e_list = e->u.l; e_list != NULL; e_list = e_list->next) + { + c1 = build_clause(e_list->e); + while(c1 != NULL) { + c3 = c1->next; + c1->next = c; + c = c1; + c1 = c3; + } + } + } + else if (e->type == CONNECTOR_type) + { + c = (Clause *) xalloc(sizeof(Clause)); + c->c = build_terminal(e); + c->cost = 0.0; + c->maxcost = 0.0; + 
c->next = NULL; + } + else + { + assert(false, "an expression node with no type"); + } + + /* c now points to the list of clauses */ + for (c1 = c; c1 != NULL; c1 = c1->next) + { + c1->cost += e->cost; + /* c1->maxcost = MAX(c1->maxcost,e->cost); */ + /* Above is how Dennis had it. Someone changed it to below. + * However, this can sometimes lead to a maxcost that is less + * than the cost ! -- which seems wrong to me ... seems Dennis + * had it right!? + */ + c1->maxcost += e->cost; + } + return c; +} + +// #define DEBUG +#ifdef DEBUG +/* Misc printing functions, useful for debugging */ + +static void print_Tconnector_list(Tconnector * e) +{ + for (;e != NULL; e=e->next) { + if (e->multi) printf("@"); + printf("%s", e->condesc->string); + printf("%c", e->dir); + if (e->next != NULL) printf(" "); + } +} + +GNUC_UNUSED static void print_clause_list(Clause * c) +{ + for (;c != NULL; c=c->next) { + printf(" Clause: "); + printf("(%4.2f, %4.2f) ", c->cost, c->maxcost); + print_Tconnector_list(c->c); + printf("\n"); + } +} + +static void print_connector_list(Connector * e) +{ + for (;e != NULL; e=e->next) + { + printf("%s", connector_string(e)); + if (e->next != NULL) printf(" "); + } +} + +GNUC_UNUSED static void print_disjunct_list(Disjunct * dj) +{ + for (;dj != NULL; dj=dj->next) { + printf("%10s: ", dj->word_string); + printf("(%f) ", dj->cost); + print_connector_list(dj->left); + printf(" <--> "); + print_connector_list(dj->right); + printf("\n"); + } +} +#endif /* DEBUG */ + +/** + * Build a new list of connectors starting from the Tconnectors + * in the list pointed to by e. Keep only those whose strings whose + * direction has the value c. 
+ */ +static Connector * extract_connectors(Tconnector *e, int c, Parse_Options opts) +{ + Connector *e1; + if (e == NULL) return NULL; + if (e->dir == c) + { + e1 = connector_new(e->condesc, opts); + e1->multi = e->multi; + e1->nearest_word = 0; + e1->next = extract_connectors(e->next, c, opts); + return e1; + } + else + { + return extract_connectors(e->next, c, opts); + } +} + +/** + * Build a disjunct list out of the clause list c. + * string is the print name of word that generated this disjunct. + */ +static Disjunct * +build_disjunct(Clause * cl, const char * string, double cost_cutoff, + Parse_Options opts) +{ + Disjunct *dis, *ndis; + dis = NULL; + for (; cl != NULL; cl = cl->next) + { + if (cl->maxcost <= cost_cutoff) + { + ndis = (Disjunct *) xalloc(sizeof(Disjunct)); + ndis->left = reverse(extract_connectors(cl->c, '-', opts)); + ndis->right = reverse(extract_connectors(cl->c, '+', opts)); + ndis->word_string = string; + ndis->cost = cl->cost; + ndis->next = dis; + dis = ndis; + } + } + return dis; +} + +Disjunct * build_disjuncts_for_exp(Exp* exp, const char *word, + double cost_cutoff, Parse_Options opts) +{ + Clause *c ; + Disjunct * dis; + // print_expression(exp); printf("\n"); + c = build_clause(exp); + // print_clause_list(c); + dis = build_disjunct(c, word, cost_cutoff, opts); + // print_disjunct_list(dis); + free_clause_list(c); + return dis; +} + +#ifdef DEBUG +/* There is a much better print_expression elsewhere + * This one is for low-level debug. 
*/ +GNUC_UNUSED void prt_exp(Exp *e, int i) +{ + if (e == NULL) return; + + for(int j =0; jtype, e->dir, e->multi, e->cost); + if (e->type != CONNECTOR_type) + { + E_list *l = e->u.l; + while(l) + { + prt_exp(l->e, i+2); + l = l->next; + } + } + else + { + for(int j =0; ju.condesc->string); + } +} + +GNUC_UNUSED void prt_exp_mem(Exp *e, int i) +{ + char unknown_type[32] = ""; + const char *type; + + if (e == NULL) return; + + if (e->type > 0 && e->type <= 3) + { + type = ((const char *[]) {"OR_type", "AND_type", "CONNECTOR_type"}) [e->type-1]; + } + else + { + snprintf(unknown_type, sizeof(unknown_type)-1, "unknown-%d", e->type); + type = unknown_type; + } + + for(int j =0; jcost); + if (e->type != CONNECTOR_type) + { + E_list *l; + for(int j =0; ju.l); + for (l = e->u.l; NULL != l; l = l->next) + { + printf("%p", l->e); + if (NULL != l->next) printf(" "); + } + printf(")\n"); + + for (l = e->u.l; NULL != l; l = l->next) + { + prt_exp_mem(l->e, i+2); + } + } + else + { + for(int j =0; ju.condesc->string, e->dir, e->multi); + } +} +#endif diff -Nru link-grammar-5.3.16/link-grammar/prepare/build-disjuncts.h link-grammar-5.5.0/link-grammar/prepare/build-disjuncts.h --- link-grammar-5.3.16/link-grammar/prepare/build-disjuncts.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/prepare/build-disjuncts.h 2018-02-15 03:03:12.000000000 +0000 @@ -0,0 +1,27 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2012 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#ifndef _LINKGRAMMAR_BUILD_DISJUNCTS_H +#define _LINKGRAMMAR_BUILD_DISJUNCTS_H + +#include "api-types.h" +#include "link-includes.h" + +Disjunct * build_disjuncts_for_exp(Exp*, const char*, double cost_cutoff, Parse_Options opts); + +#ifdef DEBUG +void prt_exp(Exp *, int); +void prt_exp_mem(Exp *, int); +#endif /* DEBUG */ + +#endif /* _LINKGRAMMAR_BUILD_DISJUNCTS_H */ diff -Nru link-grammar-5.3.16/link-grammar/prepare/expand.c link-grammar-5.5.0/link-grammar/prepare/expand.c --- link-grammar-5.3.16/link-grammar/prepare/expand.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/prepare/expand.c 2017-05-31 07:51:33.000000000 +0000 @@ -0,0 +1,67 @@ +/*************************************************************************/ +/* Copyright (c) 2009 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ +/* + * expand.c + * + * Enlarge the range of possible disjunct to consider while parsing. 
+ */ + +#include "api-structures.h" +#include "connectors.h" +#include "dict-common/dict-common.h" // for X_node_struct +#include "disjunct-utils.h" +#include "expand.h" +#include "tokenize/word-structures.h" // For Word_struct +#include "corpus/cluster.h" + +/* ========================================================= */ + +static Disjunct * build_expansion_disjuncts(Cluster *clu, const char *xstr) +{ + Disjunct *dj; + dj = lg_cluster_get_disjuncts(clu, xstr); + if (dj && (verbosity > 0)) prt_error("Expanded %s \n", xstr); + return dj; +} + +/** + * Increase the number of disjuncts associated to each word in the + * sentence by working with word-clusters. Return true if the number + * of disjuncts were expanded, else return false. + */ +bool lg_expand_disjunct_list(Sentence sent) +{ + size_t w; + + Cluster *clu = lg_cluster_new(); + + bool expanded = false; + for (w = 0; w < sent->length; w++) + { + X_node * x; + Disjunct * d = sent->word[w].d; + for (x = sent->word[w].x; x != NULL; x = x->next) + { + Disjunct *dx = build_expansion_disjuncts(clu, x->string); + if (dx) + { + unsigned int cnt = count_disjuncts(d); + d = catenate_disjuncts(dx, d); + d = eliminate_duplicate_disjuncts(d); + if (cnt < count_disjuncts(d)) expanded = true; + } + } + sent->word[w].d = d; + } + lg_cluster_delete(clu); + + return expanded; +} diff -Nru link-grammar-5.3.16/link-grammar/prepare/expand.h link-grammar-5.5.0/link-grammar/prepare/expand.h --- link-grammar-5.3.16/link-grammar/prepare/expand.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/prepare/expand.h 2017-05-29 03:52:31.000000000 +0000 @@ -0,0 +1,14 @@ +/*************************************************************************/ +/* Copyright (c) 2009 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this softwares. 
*/ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +/* Defined in link-includes.h */ +/* int lg_expand_disjunct_list(Sentence sent); */ + diff -Nru link-grammar-5.3.16/link-grammar/prepare/exprune.c link-grammar-5.5.0/link-grammar/prepare/exprune.c --- link-grammar-5.3.16/link-grammar/prepare/exprune.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/prepare/exprune.c 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,745 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2009, 2013, 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include "api-structures.h" // for Sentence_s +#include "connectors.h" +#include "dict-common/dict-api.h" // expression_stringify +#include "dict-common/dict-utils.h" // size_of_expression +#include "print/print-util.h" // dyn_str functions +#include "string-set.h" +#include "tokenize/word-structures.h" // for Word_struct +#include "exprune.h" + +#define D_EXPRUNE 9 + +#ifdef DEBUG +#define DBG(p, w, X) \ + if (verbosity_level(+D_EXPRUNE))\ + {\ + char *e = expression_stringify(x->exp);\ + err_msg(lg_Trace, "pass%d w%zu: ", p, w);\ + err_msg(lg_Trace, X ": %s\n", e);\ + free(e);\ + } +#else /* !DEBUG */ +#define DBG(p, w, X) +#endif /* DEBUG */ + +#define DBG_EXPSIZES(...) 
\ + if (verbosity_level(+D_EXPRUNE))\ + {\ + char *e = print_expression_sizes(sent);\ + err_msg(lg_Trace, __VA_ARGS__);\ + free(e);\ + } + +typedef struct connector_table_s connector_table; +struct connector_table_s +{ + condesc_t *condesc; + connector_table *next; + int farthest_word; +}; + +#define CT_BLKSIZE 512 +/* The connector table elements are allocated in a kind of an unrolled + * linked list with fixed blocks, when the first block is pre-allocated + * (this simplifies the handling). Additional blocks are + * dynamically allocated, but they are rarely needed. The existing + * allocation is reused on each pass, and freed only at the end of the + * expression pruning. */ +// connector_table_element-> ... CT_BLKSIZE-1 elements +// ... +// ... +// block connecting element ->---+ +// | +// (additional block) CT_BLKSIZE-1 elements <---+ +// ... +// current_element -> ... +// ... +// end_current_block -> block connecting element ->---+ +// | +// (additional block) CT_BLKSIZE-1 elements <---+ +// ... +// ... +// ... +// block connecting element + +typedef struct exprune_context_s exprune_context; +struct exprune_context_s +{ + connector_table **ct; + size_t ct_size; + Parse_Options opts; + connector_table *current_element; + connector_table *end_current_block; + connector_table connector_table_element[CT_BLKSIZE]; +}; + +static connector_table *ct_element_new(exprune_context *ctxt) +{ + if (ctxt->current_element == ctxt->end_current_block) + { + if (ctxt->end_current_block->next == NULL) + { + connector_table *newblock = + malloc(CT_BLKSIZE * sizeof(*ctxt->current_element)); + newblock[CT_BLKSIZE-1].next = NULL; + ctxt->end_current_block->next = newblock; + } /* else - reuse next block. 
*/ + + ctxt->current_element = ctxt->end_current_block->next; + ctxt->end_current_block = &ctxt->current_element[CT_BLKSIZE-1]; + } + + return ctxt->current_element++; +} + +static void free_connector_table(exprune_context *ctxt) +{ + connector_table *x; + connector_table *t = ctxt->connector_table_element[CT_BLKSIZE-1].next; + + while (t != NULL) + { + x = t[CT_BLKSIZE-1].next; + free(t); + t = x; + } + + free(ctxt->ct); + ctxt->ct = NULL; + ctxt->ct_size = 0; +} + +/* ================================================================= */ +/** + * Here is expression pruning. This is done even before the expressions + * are turned into lists of disjuncts. + * + * This uses many of the same data structures and functions that are used + * by prune. + * + * The purge operations remove all irrelevant stuff from the expression, + * and free the purged stuff. A connector is deemed irrelevant if its + * string pointer has been set to NULL. The passes through the sentence + * have the job of doing this. + * + * If an OR or AND type expression node has one child, we can replace it + * by its child. This, of course, is not really necessary, except for + * performance(?). + */ + +static Exp* purge_Exp(Exp *); + +/** + * Get rid of the current_elements with null expressions + */ +static E_list * or_purge_E_list(E_list * l) +{ + E_list * el; + if (l == NULL) return NULL; + if ((l->e = purge_Exp(l->e)) == NULL) + { + el = or_purge_E_list(l->next); + xfree((char *)l, sizeof(E_list)); + return el; + } + l->next = or_purge_E_list(l->next); + return l; +} + +/** + * Returns true iff the length of the disjunct list is 0. + * If this is the case, it frees the structure rooted at l. 
+ */ +static bool and_purge_E_list(E_list * l) +{ + if (l == NULL) return true; + if ((l->e = purge_Exp(l->e)) == NULL) + { + free_E_list(l->next); + xfree((char *)l, sizeof(E_list)); + return false; + } + if (!and_purge_E_list(l->next)) + { + free_Exp(l->e); + xfree((char *)l, sizeof(E_list)); + return false; + } + return true; +} + +/** + * Must be called with a non-null expression. + * Return NULL iff the expression has no disjuncts. + */ +static Exp* purge_Exp(Exp *e) +{ + if (e->type == CONNECTOR_type) + { + if (e->u.condesc == NULL) + { + xfree((char *)e, sizeof(Exp)); + return NULL; + } + else + { + return e; + } + } + if (e->type == AND_type) + { + if (!and_purge_E_list(e->u.l)) + { + xfree((char *)e, sizeof(Exp)); + return NULL; + } + } + else /* if we are here, its OR_type */ + { + e->u.l = or_purge_E_list(e->u.l); + if (e->u.l == NULL) + { + xfree((char *)e, sizeof(Exp)); + return NULL; + } + } + +/* This code makes it kill off nodes that have just one child + (1) It's going to give an insignificant speed-up + (2) Costs have not been handled correctly here. + The code is excised for these reasons. +*/ +/* + if ((e->u.l != NULL) && (e->u.l->next == NULL)) + { + ne = e->u.l->e; + xfree((char *) e->u.l, sizeof(E_list)); + xfree((char *) e, sizeof(Exp)); + return ne; + } +*/ + return e; +} + +/** + * This hash function only looks at the leading upper case letters of + * the connector string, and the label fields. This ensures that if two + * strings match (formally), then they must hash to the same place. + */ +static inline unsigned int hash_S(condesc_t * c) +{ + return c->uc_num; +} + +/** + * Returns TRUE if c can match anything in the set S (err. the connector table ct). 
+ */ +static inline bool matches_S(connector_table **ct, int w, condesc_t * c) +{ + connector_table *e; + + for (e = ct[hash_S(c)]; e != NULL; e = e->next) + { + if (e->farthest_word <= 0) + { + if (w < -e->farthest_word) continue; + } + else + { + if (w > e->farthest_word) continue; + } + if (easy_match_desc(e->condesc, c)) return true; + } + return false; +} + +static void zero_connector_table(exprune_context *ctxt) +{ + memset(ctxt->ct, 0, sizeof(*ctxt->ct) * ctxt->ct_size); + ctxt->current_element = ctxt->connector_table_element; + ctxt->end_current_block = &ctxt->connector_table_element[CT_BLKSIZE-1]; +} + +/** + * Mark as dead all of the dir-pointing connectors + * in e that are not matched by anything in the current set. + * Returns the number of connectors so marked. + */ +static int mark_dead_connectors(connector_table **ct, int w, Exp * e, char dir) +{ + int count; + count = 0; + if (e->type == CONNECTOR_type) + { + if (e->dir == dir) + { + if (!matches_S(ct, w, e->u.condesc)) + { + e->u.condesc = NULL; + count++; + } + } + } + else + { + E_list *l; + for (l = e->u.l; l != NULL; l = l->next) + { + count += mark_dead_connectors(ct, w, l->e, dir); + } + } + return count; +} + +/** + * This function puts connector c into the connector table + * if one like it isn't already there. + */ +static void insert_connector(exprune_context *ctxt, int farthest_word, condesc_t * c) +{ + unsigned int h; + connector_table *e; + + h = hash_S(c); + + for (e = ctxt->ct[h]; e != NULL; e = e->next) + { + if (c == e->condesc) + { + { + if (e->farthest_word < farthest_word) e->farthest_word = farthest_word; + } + return; + } + } + + e = ct_element_new(ctxt); + e->condesc = c; + e->farthest_word = farthest_word; + e->next = ctxt->ct[h]; + ctxt->ct[h] = e; +} +/** + * Put into the set S all of the dir-pointing connectors still in e. + * Return a list of allocated dummy connectors; these will need to be + * freed. 
+ */ +static void insert_connectors(exprune_context *ctxt, int w, Exp * e, int dir) +{ + if (e->type == CONNECTOR_type) + { + if (e->dir == dir) + { + assert(NULL != e->u.condesc, "NULL connector"); + Connector c = { .desc = e->u.condesc }; + + set_connector_length_limit(&c, ctxt->opts); + int farthest_word = (dir == '-') ? -MAX(0, w-c.length_limit) : + w+c.length_limit; + insert_connector(ctxt, farthest_word, e->u.condesc); + } + } + else + { + E_list *l; + for (l=e->u.l; l!=NULL; l=l->next) + { + insert_connectors(ctxt, w, l->e, dir); + } + } +} + +/** + * This removes the expressions that are empty from the list corresponding + * to word w of the sentence. + */ +static void clean_up_expressions(Sentence sent, int w) +{ + X_node head_node, *d, *d1; + d = &head_node; + d->next = sent->word[w].x; + while (d->next != NULL) + { + if (d->next->exp == NULL) + { + d1 = d->next; + d->next = d1->next; + xfree((char *)d1, sizeof(X_node)); + } + else + { + d = d->next; + } + } + sent->word[w].x = head_node.next; +} + +static char *print_expression_sizes(Sentence sent) +{ + X_node * x; + size_t w, size; + dyn_str *e = dyn_str_new(); + + for (w=0; wlength; w++) { + size = 0; + for (x=sent->word[w].x; x!=NULL; x = x->next) { + size += size_of_expression(x->exp); + } + /* XXX alternatives[0] is not really correct, here .. 
*/ + append_string(e, "%s[%zu] ", sent->word[w].alternatives[0], size); + } + append_string(e, "\n\n"); + return dyn_str_take(e); +} + +void expression_prune(Sentence sent, Parse_Options opts) +{ + int N_deleted; + X_node * x; + size_t w; + exprune_context ctxt; + + ctxt.opts = opts; + ctxt.ct_size = sent->dict->contable.num_uc; + ctxt.ct = malloc(ctxt.ct_size * sizeof(*ctxt.ct)); + zero_connector_table(&ctxt); + ctxt.end_current_block->next = NULL; + + N_deleted = 1; /* a lie to make it always do at least 2 passes */ + + DBG_EXPSIZES("Initial expression sizes\n%s", e); + + int pass = -1; + while (pass++) + { + /* Left-to-right pass */ + /* For every word */ + for (w = 0; w < sent->length; w++) + { + /* For every expression in word */ + for (x = sent->word[w].x; x != NULL; x = x->next) + { + DBG(pass, w, "l->r pass before marking"); + N_deleted += mark_dead_connectors(ctxt.ct, w, x->exp, '-'); + DBG(pass, w, "l->r pass after marking"); + } + for (x = sent->word[w].x; x != NULL; x = x->next) + { + DBG(pass, w, "l->r pass before purging"); + x->exp = purge_Exp(x->exp); + DBG(pass, w, "l->r pass after purging"); + } + + /* gets rid of X_nodes with NULL exp */ + clean_up_expressions(sent, w); + for (x = sent->word[w].x; x != NULL; x = x->next) + { + insert_connectors(&ctxt, w, x->exp, '+'); + } + } + + DBG_EXPSIZES("l->r pass removed %d\n%s", N_deleted, e); + + zero_connector_table(&ctxt); + if (N_deleted == 0) break; + + /* Right-to-left pass */ + N_deleted = 0; + for (w = sent->length-1; w != (size_t) -1; w--) + { + for (x = sent->word[w].x; x != NULL; x = x->next) + { + DBG(pass, w, "r->l pass before marking"); + N_deleted += mark_dead_connectors(ctxt.ct, w, x->exp, '+'); + DBG(pass, w, "r->l pass after marking"); + } + for (x = sent->word[w].x; x != NULL; x = x->next) + { + DBG(pass, w, "r->l pass before purging"); + x->exp = purge_Exp(x->exp); + DBG(pass, w, "r->l pass after purging"); + } + clean_up_expressions(sent, w); /* gets rid of X_nodes with NULL exp */ + 
for (x = sent->word[w].x; x != NULL; x = x->next) + { + insert_connectors(&ctxt, w, x->exp, '-'); + } + } + + DBG_EXPSIZES("r->l pass removed %d\n%s", N_deleted, e); + + zero_connector_table(&ctxt); + if (N_deleted == 0) break; + N_deleted = 0; + } + + free_connector_table(&ctxt); +} + +#if 0 // VERY_DEAD_NO_GOOD_IDEA + +/* ============================================================x */ +/* + The second algorithm eliminates disjuncts that are dominated by + another. It works by hashing them all, and checking for domination. + + Consider the idea of deleting a disjunct if it is dominated (in terms of + what it can match) by some other disjunct on the same word. This has + been implemented below. There are three problems with it: + + (1) It is almost never the case that any disjuncts are eliminated. + + (2) connector_matches_alam may not be exactly correct. + + (3) The linkage that is eliminated by this, might just be the one that + passes post-processing, as the following example shows. + This is pretty silly, and should probably be changed. + +> telling John how our program works would be stupid +Accepted (2 linkages, 1 with no P.P. 
violations) + Linkage 1, cost vector = (0, 0, 7) + + +------------------G-----------------+ + +-----R-----+----CL----+ | + +---O---+ | +---D--+---S---+ +--I-+-AI-+ + | | | | | | | | | +telling.g John how our program.n works would be stupid + + ///// CLg <---CLg---> CL telling.g + (g) telling.g G <---G-----> G would + (g) (d) telling.g R <---R-----> R how + (g) (d) telling.g O <---O-----> O John + (g) (d) how CLe <---CLe---> CL program.n + (g) (d) (e) our D <---Ds----> Ds program.n + (g) (d) (e) program.n Ss <---Ss----> Ss works + (g) would I <---Ix----> Ix be + (g) be AI <---AIi---> AIi stupid + +(press return for another) +> + Linkage 2 (bad), cost vector = (0, 0, 7) + + +------------------G-----------------+ + +-----R-----+----CL----+ | + +---O---+ | +---D--+---S---+ +--I-+-AI-+ + | | | | | | | | | +telling.g John how our program.n works would be stupid + + ///// CLg <---CLg---> CL telling.g + (g) telling.g G <---G-----> G would + (g) (d) telling.g R <---R-----> R how + (g) (d) telling.g O <---O-----> O John + (g) (d) how CLe <---CLe---> CL program.n + (g) (d) (e) our D <---Ds----> Ds program.n + (g) (d) (e) program.n Ss <---Ss----> Ss works + (g) would I <---Ix----> Ix be + (g) be AI <---AI----> AI stupid + +P.P. violations: + Special subject rule violated +*/ + +/** + * hash function that takes a string and a seed value i + */ +static int string_hash(disjunct_dup_table *dt, const char * s, int i) +{ + for(;*s != '\0';s++) i = i + (i<<1) + randtable[(*s + i) & (RTSIZE-1)]; + return (i & (dt->dup_table_size-1)); +} + +/** + * This returns true if the connector a matches everything that b + * matches, and possibly more. (alam=at least as much) + * + * TRUE for equal connectors. + * remains TRUE if multi-match added to the first. + * remains TRUE if subsrcripts deleted from the first. 
+ */ +static bool connector_matches_alam(Connector * a, Connector * b) +{ + char * s, * t, *u; + if (((!a->multi) && b->multi) || + (a->label != b->label)) return false; + s = a->string; + t = b->string; + + /* isupper -- connectors cannot be UTF8 at this time */ + while (isupper(*s) || isupper(*t)) + { + if (*s == *t) { + s++; + t++; + } else return false; + } + while ((*s != '\0') && (*t != '\0')) { + if ((*s == *t) || (*s == '*')) { + s++; + t++; + } else return false; + } + while ((*s != '\0') && (*s == '*')) s++; + return (*s == '\0'); +} + + +/** + * This hash function that takes a connector and a seed value i. + * It only looks at the leading upper case letters of + * the string, and the label. This ensures that if two connectors + * match, then they must hash to the same place. + */ +static int conn_hash(Connector * c, int i) +{ + int nb; + const char * s; + s = c->string; + + i = i + (i<<1) + randtable[(c->label + i) & (RTSIZE-1)]; + nb = is_utf8_upper(s); + while (nb) + { + i = i + (i<<1) + randtable[(*s + i) & (RTSIZE-1)]; + s += nb; + nb = is_utf8_upper(s); + } + return i; +} + +static inline int pconnector_hash(disjunct_dup_table *dt, Connector * c, int i) +{ + i = conn_hash(c, i); + return (i & (ct->dup_table_size-1)); +} + +/** + * This is a hash function for disjuncts + */ +static int hash_disjunct(disjunct_dup_table *dt, Disjunct * d) +{ + int i; + Connector *e; + i = 0; + for (e = d->left ; e != NULL; e = e->next) + { + i = pconnector_hash(dt, e, i); + } + for (e = d->right ; e != NULL; e = e->next) + { + i = pconnector_hash(dt, e, i); + } + return string_hash(dt, d->string, i); +} + +/** + * Returns TRUE if disjunct d1 can match anything that d2 can + * if this happens, it constitutes a proof that there is absolutely + * no use for d2. 
+ */ +static bool disjunct_matches_alam(Disjunct * d1, Disjunct * d2) +{ + Connector *e1, *e2; + if (d1->cost > d2->cost) return false; + e1 = d1->left; + e2 = d2->left; + while ((e1!=NULL) && (e2!=NULL)) + { + if (!connector_matches_alam(e1,e2)) break; + e1 = e1->next; + e2 = e2->next; + } + if ((e1!=NULL) || (e2!=NULL)) return false; + e1 = d1->right; + e2 = d2->right; + while ((e1!=NULL) && (e2!=NULL)) + { + if (!connector_matches_alam(e1,e2)) break; + e1 = e1->next; + e2 = e2->next; + } + if ((e1!=NULL) || (e2!=NULL)) return false; + return (strcmp(d1->string, d2->string) == 0); +} + +/** + * Takes the list of disjuncts pointed to by d, eliminates all + * duplicates, and returns a pointer to a new list. + * It frees the disjuncts that are eliminated. + */ +Disjunct * eliminate_duplicate_disjuncts(Disjunct * d) +{ + int i, h, count; + Disjunct *dn, *dx, *dxn, *front; + count = 0; + disjunct_dup_table *dt; + + dt = disjunct_dup_table_new(next_power_of_two_up(2 * count_disjuncts(d))); + + for (;d!=NULL; d = dn) + { + dn = d->next; + h = hash_disjunct(d); + + front = NULL; + for (dx = dt->dup_table[h]; dx != NULL; dx = dxn) + { + dxn = dx->next; + if (disjunct_matches_alam(dx,d)) + { + /* we know that d should be killed */ + d->next = NULL; + free_disjuncts(d); + count++; + front = catenate_disjuncts(front, dx); + break; + } else if (disjunct_matches_alam(d,dx)) { + /* we know that dx should be killed off */ + dx->next = NULL; + free_disjuncts(dx); + count++; + } else { + /* neither should be killed off */ + dx->next = front; + front = dx; + } + } + if (dx == NULL) { + /* we put d in the table */ + d->next = front; + front = d; + } + dt->dup_table[h] = front; + } + + /* d is now NULL */ + for (i = 0; i < dt->dup_table_size; i++) + { + for (dx = dt->dup_table[i]; dx != NULL; dx = dxn) + { + dxn = dx->next; + dx->next = d; + d = dx; + } + } + + if ((verbosity > 2) && (count != 0)) printf("killed %d duplicates\n", count); + + disjunct_dup_table_delete(dt); + return d; 
+} + +/* ============================================================x */ +#endif /* VERY_DEAD_NO_GOOD_IDEA */ diff -Nru link-grammar-5.3.16/link-grammar/prepare/exprune.h link-grammar-5.5.0/link-grammar/prepare/exprune.h --- link-grammar-5.3.16/link-grammar/prepare/exprune.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/prepare/exprune.h 2018-02-15 03:03:12.000000000 +0000 @@ -0,0 +1,19 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _EXPRESSION_PRUNE_H +#define _EXPRESSION_PRUNE_H + +#include "link-includes.h" + +void expression_prune(Sentence, Parse_Options opts); +#endif /* _EXPRESSION_PRUNE_H */ diff -Nru link-grammar-5.3.16/link-grammar/print/print.c link-grammar-5.5.0/link-grammar/print/print.c --- link-grammar-5.3.16/link-grammar/print/print.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/print/print.c 2018-02-22 01:37:52.000000000 +0000 @@ -0,0 +1,1542 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright 2013, 2014 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. 
*/ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include "api-structures.h" +#include "connectors.h" +#include "corpus/corpus.h" +#include "dict-common/dict-utils.h" // For size_of_expression() +#include "disjunct-utils.h" +#include "linkage/linkage.h" +#include "post-process/post-process.h" // for compute_domain_names() +#include "print.h" +#include "tokenize/word-structures.h" // For Word_struct +#include "wcwidth.h" + +#define LEFT_WALL_SUPPRESS ("Wd") /* If this connector is used on the wall, */ + /* then suppress the display of the wall. */ +#define RIGHT_WALL_SUPPRESS ("RW")/* If this connector is used on the wall, */ + /* then suppress the display of the wall. */ + +#define HEAD_CHR ('h') /* Single char marking head-word. */ +#define DEPT_CHR ('d') /* Single char marking dependent word */ + +/** + * Find the position, measured in column-widths, of the center of + * each word. Also find the offset of each word, relative to the + * previous one, needed to fully fit the names of the links between + * them. The offset is again measured in column-widths, not in bytes! + * + * By "column widths", it is meant the number of terminal columns + * needed to display a glyph. Almost all indoeuropean glyphs are one + * column wide. Almost all CJK glyphs (hanzi, kanji) are two columns + * wide. These widths are NOT to be confuxed with UTF-8 byte size. + * + * FIXME Long link names between more distant words may still not + * fit the space between these words. + * + * Return the number of bytes needed for the all the words, including + * the space needed for the link names as described above. Note that + * this byte length might be less than the glyph width! e.g. the + * ASCII chars in the rangel 01 to 1F usually print in two columns, + * but require only one byte to encode. 
+ */ +static size_t +set_centers(const Linkage linkage, int center[], int word_offset[], + bool print_word_0, int N_words_to_print) +{ + int i, tot; + size_t n; + int start_word = print_word_0 ? 0 : 1; + int *link_len = alloca(linkage->num_words * sizeof(*link_len)); + size_t max_bytes_in_line = 0; /* Needed picture array line length */ + + memset(link_len, 0, linkage->num_words * sizeof(*link_len)); + + for (n = 0; n < linkage->num_links; n++) + { + Link *l = &linkage->link_array[n]; + + if ((l->lw + 1 == l->rw) && (NULL != l->link_name)) + { + link_len[l->rw] = strlen(l->link_name) + + (DEPT_CHR == connector_string(l->rc)[0]) + + (HEAD_CHR == connector_string(l->rc)[0]) + + (DEPT_CHR == connector_string(l->lc)[0]) + + (HEAD_CHR == connector_string(l->lc)[0]); + } + } + + tot = 0; + for (i = start_word; i < N_words_to_print; i++) + { + int len, middle; + + /* Centers obtained by counting the characters column widths, + * not the bytes in the string. */ + len = utf8_strwidth(linkage->word[i]); + + middle = tot + (len/2); +#if 1 /* Long labels - disable in order to compare output with old versions. */ + if (i > start_word) + center[i] = MAX(middle, center[i-1] + link_len[i] + 1); + else +#endif + center[i] = middle; + word_offset[i] = center[i] - middle; + tot += len+1 + word_offset[i]; + + // We use 2x strlen, because invalid UTF-8 chars get padding. + // But we don't know how much padding, right now. In general, + // this will sharply over-count, but is needed to deal with + // garbage data that is attempting to crash/buffer-overflow. 
+ max_bytes_in_line += word_offset[i] + 2*strlen(linkage->word[i]) + 1; + } + + return max_bytes_in_line; +} + +/* The following are all for generating postscript */ +typedef struct +{ + int N_rows; /* N_rows -- the number of rows */ + /* tells the height of the links above the sentence */ + int * link_heights; + /* the word beginning each row of the display */ + int * row_starts; +} ps_ctxt_t; + +/** + * Prints s then prints the last |t|-|s| characters of t. + * If s is longer than t, it truncates s. + * Handles utf8 strings correctly. + * Assumes that the characters of t are all blanks, + * so that column-width of t == num bytes in t == num chars in t. + */ +static void left_append_string(dyn_str * string, const char * s, const char * t) +{ + /* These are column-widths. */ + size_t twidth = strlen(t); + size_t swidth = utf8_strwidth(s); + + for (size_t i = 0; i < twidth; ) + { + if (i < swidth && *s != '\0') + { + // The width might be negative, if the value is not a valid + // UTF-8 character. Assume that it will be printed with a + // two-column-wide "box font". + int sw = utf8_charwidth(s); + if (sw < 0) sw = 2; + i += sw; + t += sw; + + // Meanwhile, s advances by bytes, and not by column widths. 
+ s += append_utf8_char(string, s); + } + else + { + append_utf8_char(string, t); + i++; + t++; + } + } +} + +static void print_a_link(dyn_str * s, const Linkage linkage, LinkIdx link) +{ + WordIdx l, r; + const char *label, *llabel, *rlabel; + + l = linkage_get_link_lword(linkage, link); + r = linkage_get_link_rword(linkage, link); + label = linkage_get_link_label(linkage, link); + llabel = linkage_get_link_llabel(linkage, link); + rlabel = linkage_get_link_rlabel(linkage, link); + + if (l == 0) + { + left_append_string(s, LEFT_WALL_DISPLAY, " "); + } + else if (l == (linkage_get_num_words(linkage) - 1)) + { + left_append_string(s, RIGHT_WALL_DISPLAY, " "); + } + else + { + left_append_string(s, linkage_get_word(linkage, l), " "); + } + + /* XXX FIXME -- the space allocated to a link name should depend + * on the longest link-name in the sentence! */ + left_append_string(s, llabel, " "); + if (DEPT_CHR == llabel[0]) + dyn_strcat(s, " <---"); + else if (HEAD_CHR == llabel[0]) + dyn_strcat(s, " >---"); + else + dyn_strcat(s, " ----"); + + left_append_string(s, label, "-----"); + if (DEPT_CHR == rlabel[0]) + dyn_strcat(s, "-> "); + else if (HEAD_CHR == rlabel[0]) + dyn_strcat(s, "-< "); + else + dyn_strcat(s, "-- "); + left_append_string(s, rlabel, " "); + append_string(s, " %s\n", linkage_get_word(linkage, r)); +} + + +/** + * To the left of each link, print the sequence of domains it is in. + * Printing a domain means printing its type. + * Takes info from pp_link_array and pp and chosen_words. 
+ */ +char * linkage_print_links_and_domains(const Linkage linkage) +{ + int link, longest, j; + int N_links = linkage_get_num_links(linkage); + dyn_str * s = dyn_str_new(); + const char ** dname; + + compute_domain_names(linkage); + + longest = 0; + for (link=0; link longest) + longest = linkage_get_link_num_domains(linkage, link); + } + for (link=0; linkinfo; + Sense *sns; + size_t nwords; + WordIdx w; + + lg_corpus_linkage_senses(linkage); + + nwords = linkage->num_words; + for (w=0; wnum_words; + + /* Loop over each word in the sentence */ + for (w = 0; w < nwords; w++) + { + int pad = 21; + double cost; + const char *infword; + Disjunct *disj = linkage->chosen_disjuncts[w]; + if (NULL == disj) continue; + + /* Subscript mark will be cleaned up by append_string(). */ + // infword = disj->word_string; + + /* ... except that subscript marks confuse utf8_strwidth() */ + /* linkage->word[w] should be same as disj->word_string except + * that the subscript mark has been replaced by a dot. */ + infword = linkage->word[w]; + + /* Make sure the glyphs align during printing. */ + pad += strlen(infword) - utf8_strwidth(infword); + + dj = linkage_get_disjunct_str(linkage, w); + if (NULL == dj) dj = ""; + cost = linkage_get_disjunct_cost(linkage, w); + +#ifdef USE_CORPUS + score = linkage_get_disjunct_corpus_score(linkage, w); + append_string(s, "%*s %5.3f %6.3f %s\n", pad, infword, cost, score, dj); +#else + append_string(s, "%*s %5.3f %s\n", pad, infword, cost, dj); +#endif + } + return dyn_str_take(s); +} + +/** + * Postscript printing ... + * FIXME: + * 1. It is invoked after a call to linkage_print_diagram_ctxt() with a + * screen width of 8000. But it actually cannot handle a screen width + * greater than a page-width since it doesn't know to fold without the + * help of the row_starts array which tells it on which word each + * folded line starts (a garbled printout results). + * 2. It cannot handle utf-8 (garbage is printed). + * 3. 
Due to the added ability of folding long words, the row_starts + * array is not sufficient for telling where to start the next line + * (but this doesn't matter for now and can be fixed along with + * problem no. 1 above). + */ +static char * +build_linkage_postscript_string(const Linkage linkage, + bool display_walls, ps_ctxt_t *pctx) +{ + int link, i,j; + int d; + bool print_word_0, print_word_N; + int N_links = linkage->num_links; + Link *ppla = linkage->link_array; + dyn_str * string; + int N_words_to_print; + + string = dyn_str_new(); + + if (!display_walls) { + int N_wall_connectors = 0; + bool suppressor_used = false; + for (j=0; jnum_words-1) continue; + N_wall_connectors ++; + if (easy_match(connector_string(ppla[j].lc), LEFT_WALL_SUPPRESS)) { + suppressor_used = true; + } + } + } + print_word_0 = (((!suppressor_used) && (N_wall_connectors != 0)) + || (N_wall_connectors != 1)); + } + else print_word_0 = true; + + if (!display_walls) { + int N_wall_connectors = 0; + bool suppressor_used = false; + for (j=0; jnum_words-1) { + N_wall_connectors ++; + if (easy_match(connector_string(ppla[j].lc), RIGHT_WALL_SUPPRESS)) { + suppressor_used = true; + } + } + } + print_word_N = (((!suppressor_used) && (N_wall_connectors != 0)) + || (N_wall_connectors != 1)); + } + else print_word_N = true; + + if (print_word_0) d=0; else d=1; + + i = 0; + N_words_to_print = linkage->num_words; + if (!print_word_N) N_words_to_print--; + + dyn_strcat(string, "["); + for (j=d; j0)) dyn_strcat(string, "\n"); + i++; + append_string(string, "(%s)", linkage->word[j]); + } + dyn_strcat(string,"]"); + dyn_strcat(string,"\n"); + + dyn_strcat(string,"["); + j = 0; + for (link=0; linknum_words-1)) continue; + // if (ppla[link]->lw == SIZE_MAX) continue; + assert (ppla[link].lw != SIZE_MAX); + if ((j%7 == 0) && (j>0)) dyn_strcat(string,"\n"); + j++; + append_string(string,"[%zu %zu %d", + ppla[link].lw - d, ppla[link].rw - d, + pctx->link_heights[link]); + append_string(string," (%s)]", 
ppla[link].link_name); + } + dyn_strcat(string,"]"); + dyn_strcat(string,"\n"); + dyn_strcat(string,"["); + for (j=0; j < pctx->N_rows; j++ ) + { + if (j>0) append_string(string, " %d", pctx->row_starts[j]); + else append_string(string,"%d", pctx->row_starts[j]); + } + dyn_strcat(string,"]\n"); + + return dyn_str_take(string); +} + +#define HEIGHT_INC 10 + +static void diagram_alloc_tmpmem(size_t **start, char ***pic, char ***xpic, + size_t *cur_height, size_t max_height, + size_t max_bytes, size_t num_cols) +{ + assert(num_cols < max_bytes); + assert(max_height > *cur_height); + + *start = realloc(*start, max_height * sizeof(size_t)); + *pic = realloc(*pic, max_height * sizeof(char *)); + *xpic = realloc(*xpic, max_height * sizeof(char *)); + + for (size_t i = *cur_height; i < max_height; i++) + { + /* Allocate memory for both pic and xpic. */ + char *picmem = malloc(max_bytes * 2 + 2); + + (*pic)[i] = picmem; + (*xpic)[i] = picmem + max_bytes; + + memset((*pic)[i], ' ', num_cols); + (*pic)[i][num_cols] = '\0'; + } + + *cur_height = max_height; +} + +static void diagram_free_tmpmem(size_t *start, char **pic, char **xpic, + size_t cur_height) +{ + + for (size_t i = 0; i < cur_height; i++) + free(pic[i]); + + free(start); + free(pic); + free(xpic); +} + +/** + * Print the indicated linkage into a utf8-diagram. + * Works fine for general utf8 multi-byte sentences. + * Links and connectors are still mostly assumed to be ASCII, though; + * to fix this, grep for "strlen" in the code below, replace by utf8 len. + * + * Returned string is allocated with exalloc. 
+ * Needs to be freed with linkage_free_diagram() + */ +static char * +linkage_print_diagram_ctxt(const Linkage linkage, + bool display_walls, + size_t x_screen_width, + ps_ctxt_t *pctx) +{ + if (x_screen_width <= 2) + { + prt_error("Error: Cannot print diagram: " + "Screen width %zu is too narrow.\n", x_screen_width); + return NULL; + } + + bool display_short = true; + unsigned int i, j, k, cl, cr, inc, row, top_row, top_row_p1; + const char *s; + char *t; + bool print_word_0 , print_word_N; + int *center = alloca((linkage->num_words+1)*sizeof(int)); + int *word_offset = alloca((linkage->num_words+1) * sizeof(*word_offset)); + unsigned int link_length; + unsigned int N_links = linkage->num_links; + Link *ppla = linkage->link_array; + dyn_str * string; + unsigned int N_words_to_print; + + // Avoid pathological case and the resulting crash. + if (0 == linkage->num_words) return strdup(""); + + string = dyn_str_new(); + + /* Do we want to print the left wall? */ + if (!display_walls) + { + int N_wall_connectors = 0; + bool suppressor_used = false; + for (j=0; jnum_words-1) continue; + N_wall_connectors ++; + if (easy_match(connector_string(ppla[j].lc), LEFT_WALL_SUPPRESS)) + { + suppressor_used = true; + } + } + } + print_word_0 = (((!suppressor_used) && (N_wall_connectors != 0)) + || (N_wall_connectors != 1)); + } + else print_word_0 = true; + + /* Do we want to print the right wall? 
*/ + if (!display_walls) + { + int N_wall_connectors = 0; + bool suppressor_used = false; + for (j=0; jnum_words-1) + { + N_wall_connectors ++; + if (easy_match(connector_string(ppla[j].lc), RIGHT_WALL_SUPPRESS)) + { + suppressor_used = true; + } + } + } + print_word_N = (((!suppressor_used) && (N_wall_connectors != 0)) + || (N_wall_connectors != 1)); + } + else print_word_N = true; + + N_words_to_print = linkage->num_words; + if (!print_word_N) N_words_to_print--; + + size_t *start = NULL; + char **picture = NULL; + char **xpicture = NULL; + size_t max_height = 0; + size_t max_bytes = set_centers(linkage, center, word_offset, + print_word_0, N_words_to_print) +1; + + // num_cols is the total number of columns needed to display + // the ascii-art diagram, not counting the glyphs. It might + // be less, or it might be more than the total number of + // bytes in the UTF-8 string! Which one depends on just how + // much garage there might be in the string. Note also that, + // in general, the glyphy of the last word will stick out past + // the num_cols here. + unsigned int num_cols = center[N_words_to_print-1]+1; + + if (max_bytes < num_cols) max_bytes = num_cols; + diagram_alloc_tmpmem(&start, &picture, &xpicture, + &max_height, HEIGHT_INC, + max_bytes, num_cols); + + top_row = 0; + + // Longer links are printed above the lower links. 
+ for (link_length = 1; link_length < N_words_to_print; link_length++) + { + for (j=0; jnum_words-1)) continue; + + /* Put it into the lowest position */ + /* Keep in mind that cl, cr are "columns" not "bytes" */ + cl = center[ppla[j].lw]; + cr = center[ppla[j].rw]; + for (row=0; row < max_height; row++) + { + for (k=cl+1; klink_heights[j] = row; + } + + if (2*row+2 > max_height-1) { + lgdebug(+9, "Extending rows up to %d.\n", (2*row+2)+HEIGHT_INC); + diagram_alloc_tmpmem(&start, &picture, &xpicture, + &max_height, (2*row+2)+HEIGHT_INC, + max_bytes, num_cols); + } + if (row > top_row) top_row = row; + + picture[row][cl] = '+'; + picture[row][cr] = '+'; + for (k=cl+1; klc->string[0]) { *(t-1) = '<'; } + if (DEPT_CHR == connector_string(ppla[j].lc)[0] && + (t > &picture[row][cl])) { picture[row][cl+1] = '<'; } + if (HEAD_CHR == connector_string(ppla[j].lc)[0]) { *(t-1) = '>'; } + + /* Copy connector name; stop short if no room */ + while ((*s != '\0') && (*t == '-')) *t++ = *s++; + + /* Add direction indicator */ + // if (DEPT_CHR == ppla[j]->rc->string[0]) { *t = '>'; } + if (DEPT_CHR == connector_string(ppla[j].rc)[0]) { picture[row][cr-1] = '>'; } + if (HEAD_CHR == connector_string(ppla[j].rc)[0]) { *t = '<'; } + + /* The direction indicators may have clobbered these. */ + picture[row][cl] = '+'; + picture[row][cr] = '+'; + + /* Now put in the | below this one, where needed */ + for (k=0; kword[k]; + while (*s != '\0') { *t++ = *s++; } + *t++ = ' '; + } + *t = '\0'; + + /* If display_short is NOT true, then the linkage diagram is printed + * in the "tall" style, with an extra row of vertical descenders + * between each level. 
*/ + if (display_short) { + for (k=0; picture[0][k] != '\0'; k++) { + if ((picture[0][k] == '+') || (picture[0][k] == '|')) { + xpicture[1][k] = '|'; + } else { + xpicture[1][k] = ' '; + } + } + xpicture[1][k] = '\0'; + for (row=0; row < top_row+1; row++) { + strcpy(xpicture[row+2], picture[row]); + } + top_row += 2; + } else { + for (row=0; row < top_row+1; row++) { + strcpy(xpicture[2*row+2],picture[row]); + for (k=0; picture[row][k] != '\0'; k++) { + if ((picture[row][k] == '+') || (picture[row][k] == '|')) { + xpicture[2*row+1][k] = '|'; + } else { + xpicture[2*row+1][k] = ' '; + } + } + xpicture[2*row+1][k] = '\0'; + } + top_row = 2*top_row + 2; + } + + /* We've built the picture, now print it out. */ + + /* Start locations, for each row. These may vary, due to different + * column-widths of utf8 glyphs. */ + top_row_p1 = top_row + 1; + for (row = 0; row < top_row_p1; row++) + start[row] = 0; + + if (NULL != pctx) /* PS junk */ + { + pctx->N_rows = 0; + pctx->row_starts[pctx->N_rows] = 0; + pctx->N_rows++; + } + + if (print_word_0) i = 0; else i = 1; + unsigned int c = 0; /* Character offset in the last word on a row. */ +#define RIGHT_MARGIN 1 + while (i < N_words_to_print) + { + unsigned int revrs; + /* Count the column-widths of the words, up to the max screen width. + * Use word_offset only for the initial part of the word. */ + unsigned int uwidth = 0; + unsigned int wwid; + do { + wwid = ((c == 0) * word_offset[i]) + + utf8_strwidth(linkage->word[i]+c) + 1; + if (x_screen_width - RIGHT_MARGIN < uwidth + wwid) break; + uwidth += wwid; + c = 0; + i++; + } while (i < N_words_to_print); + + if (i < N_words_to_print) { + /* The whole word doesn't fit - fit as much as possible from it. 
*/ + if (0 == uwidth) + { + uwidth = x_screen_width - RIGHT_MARGIN - (c == 0)*word_offset[i]-1; + + c += utf8_chars_in_width(linkage->word[i]+c, uwidth); + } + + if (NULL != pctx) /* PS junk */ + { + pctx->row_starts[pctx->N_rows] = i - (!print_word_0); + pctx->N_rows++; + } + } + dyn_strcat(string, "\n"); + + /* print each row of the picture */ + top_row_p1 = top_row + 1; + for (revrs = 0; revrs < top_row_p1; revrs++) + { + /* + * The `glyph_width` is the width, in columns, of the printable + * glyph. Chinese glyphs are almost always width two. + */ + size_t glyph_width = 0; + + row = top_row - revrs; + + // k is the number of bytes into the row that we previously + // printed. It should normally point at a valid UTF-8 char. + k = start[row]; + for (j = k; (glyph_width < uwidth) && (xpicture[row][j] != '\0'); ) + { + // If we don't have a glyph for this code-point, + // then assume the terminal will use a two-column- + // -wide "box font" with the hex code inside. + int gw = utf8_charwidth(&xpicture[row][j]); + if (gw < 0) gw = 2; + glyph_width += gw; + + /* Copy exactly one multi-byte character to buf */ + j += append_utf8_char(string, &xpicture[row][j]); + } + + // Record exactly how much we printed, so we can + // resume here, if the diagram wraps. + start[row] = j; + + // Hunt down and kill trailing blanks. + dyn_trimback(string); + dyn_strcat(string, "\n"); + } + dyn_strcat(string, "\n"); + } + + diagram_free_tmpmem(start, picture, xpicture, max_height); + return dyn_str_take(string); +} + +/** + * Print the indicated linkage as utf8-art into the given string. + * The width of the diagram is given by the terminal width, taken + * from the parse options. 
+ * + * The returned string is malloced, and needs to be freed with + * linkage_free_diagram() + */ +char * linkage_print_diagram(const Linkage linkage, bool display_walls, size_t screen_width) +{ + if (!linkage) return NULL; + + return linkage_print_diagram_ctxt(linkage, display_walls, screen_width, NULL); +} + +void linkage_free_diagram(char * s) +{ + if (NULL == s) return; + exfree(s, strlen(s)+1); +} + +void linkage_free_disjuncts(char * s) +{ + exfree(s, strlen(s)+1); +} + +void linkage_free_links_and_domains(char * s) +{ + exfree(s, strlen(s)+1); +} + +void linkage_free_senses(char * s) +{ + exfree(s, strlen(s)+1); +} + +/* Forward declarations, the gunk is at the bottom. */ +static const char * trailer(bool print_ps_header); +static const char * header(bool print_ps_header); + +char * linkage_print_postscript(const Linkage linkage, bool display_walls, bool print_ps_header) +{ + char * ps, * qs, * ascii; + int size; + + /* call the ascii printer to initialize the row size stuff. */ + ps_ctxt_t ctx; + ctx.link_heights = (int *) alloca(linkage->num_links * sizeof(int)); + ctx.row_starts = (int *) alloca((linkage->num_words + 1) * sizeof(int)); + ascii = linkage_print_diagram_ctxt(linkage, display_walls, 8000, &ctx); + linkage_free_diagram(ascii); + + ps = build_linkage_postscript_string(linkage, display_walls, &ctx); + size = strlen(header(print_ps_header)) + strlen(ps) + strlen(trailer(print_ps_header)) + 1; + + qs = (char *) exalloc(sizeof(char)*size); + snprintf(qs, size, "%s%s%s", header(print_ps_header), ps, trailer(print_ps_header)); + exfree(ps, strlen(ps)+1); + + return qs; +} + +void linkage_free_postscript(char * s) +{ + exfree(s, strlen(s)+1); +} + +char * linkage_print_pp_msgs(Linkage linkage) +{ + if (linkage && linkage->lifo.pp_violation_msg) + return strdup(linkage->lifo.pp_violation_msg); + return strdup(""); +} + +void linkage_free_pp_msgs(char * s) +{ + exfree(s, strlen(s)+1); +} + +void print_disjunct_counts(Sentence sent) +{ + size_t i; + 
int dcnt; + int t = 0; + int lcnt = 0, rcnt = 0; + + for (i=0; ilength; i++) + { + Disjunct *d = sent->word[i].d; + dcnt = count_disjuncts(d); + rcnt += right_connector_count(d); + lcnt += left_connector_count(d); + t += dcnt; + + /* XXX alternatives[0] is not really correct, here .. */ + prt_error("%s(%d) ", sent->word[i].alternatives[0], dcnt); + } + + prt_error("\n\\"); + prt_error("Total: %d disjuncts, %d (%d+/%d-) connectors\n\n", + t, rcnt+lcnt, rcnt, lcnt); +} + +static const char * trailer(bool print_ps_header) +{ + static const char * trailer_string= + "diagram\n" + "\n" + "%%EndDocument\n" + ; + + if (print_ps_header) return trailer_string; + else return ""; +} + +static const char * header(bool print_ps_header) +{ + static const char * header_string= + "%!PS-Adobe-2.0 EPSF-1.2\n" + "%%Pages: 1\n" + "%%BoundingBox: 0 -20 500 200\n" + "%%EndComments\n" + "%%BeginDocument: \n" + "\n" + "% compute size of diagram by adding\n" + "% #rows x 8.5\n" + "% (#rows -1) x 10\n" + "% \\sum maxheight x 10\n" + "/nulllink () def % The symbol of a null link\n" + "/wordfontsize 11 def % the size of the word font\n" + "/labelfontsize 9 def % the size of the connector label font\n" + "/ex 10 def % the horizontal radius of all the links\n" + "/ey 10 def % the height of the level 0 links\n" + "/ed 10 def % amount to add to this height per level\n" + "/radius 10 def % radius for rounded arcs\n" + "/row-spacing 10 def % the space between successive rows of the diagram\n" + "\n" + "/gap wordfontsize .5 mul def % the gap between words\n" + "/top-of-words wordfontsize .85 mul def\n" + " % the delta y above where the text is written where\n" + " % the major axis of the ellipse is located\n" + "/label-gap labelfontsize .1 mul def\n" + "\n" + "/xwordfontsize 10 def % the size of the word font\n" + "/xlabelfontsize 10 def % the size of the connector label font\n" + "/xex 10 def % the horizontal radius of all the links\n" + "/xey 10 def % the height of the level 0 links\n" + "/xed 10 
def % amount to add to this height per level\n" + "/xradius 10 def % radius for rounded arcs\n" + "/xrow-spacing 10 def % the space between successive rows of the diagram\n" + "/xgap wordfontsize .5 mul def % the gap between words\n" + "\n" + "/centerpage 6.5 72 mul 2 div def\n" + " % this number of points from the left margin is the center of page\n" + "\n" + "/rightpage 6.5 72 mul def\n" + " % number of points from the left margin is the right margin\n" + "\n" + "/show-string-centered-dict 5 dict def\n" + "\n" + "/show-string-centered {\n" + " show-string-centered-dict begin\n" + " /string exch def\n" + " /ycenter exch def\n" + " /xcenter exch def\n" + " xcenter string stringwidth pop 2 div sub\n" + " ycenter labelfontsize .3 mul sub\n" + " moveto\n" + " string show\n" + " end\n" + "} def\n" + "\n" + "/clear-word-box {\n" + " show-string-centered-dict begin\n" + " /string exch def\n" + " /ycenter exch def\n" + " /xcenter exch def\n" + " newpath\n" + " /urx string stringwidth pop 2 div def\n" + " /ury labelfontsize .3 mul def\n" + " xcenter urx sub ycenter ury sub moveto\n" + " xcenter urx add ycenter ury sub lineto\n" + " xcenter urx add ycenter ury add lineto\n" + " xcenter urx sub ycenter ury add lineto\n" + " closepath\n" + " 1 setgray fill\n" + " 0 setgray\n" + " end\n" + "} def\n" + "\n" + "/diagram-sentence-dict 20 dict def\n" + "\n" + "/diagram-sentence-circle\n" + "{diagram-sentence-dict begin \n" + " /links exch def\n" + " /words exch def\n" + " /n words length def\n" + " /Times-Roman findfont wordfontsize scalefont setfont\n" + " /x 0 def\n" + " /y 0 def\n" + "\n" + " /left-ends [x dup words {stringwidth pop add gap add dup}\n" + " forall pop pop] def\n" + " /right-ends [x words {stringwidth pop add dup gap add} forall pop] def\n" + " /centers [0 1 n 1 sub {/i exch def\n" + " left-ends i get\n" + " right-ends i get\n" + " add 2 div\n" + " } for ] def\n" + "\n" + " x y moveto\n" + " words {show gap 0 rmoveto} forall\n" + "\n" + " .5 setlinewidth \n" + 
"\n" + " links {dup 0 get /leftword exch def\n" + " dup 1 get /rightword exch def\n" + " dup 2 get /level exch def\n" + " 3 get /string exch def\n" + " newpath\n" + " string nulllink eq {[2] 1 setdash}{[] 0 setdash} ifelse\n" + "% string nulllink eq {.8 setgray}{0 setgray} ifelse\n" + " centers leftword get\n" + " y top-of-words add\n" + " moveto\n" + " \n" + " centers rightword get\n" + " centers leftword get\n" + " sub 2 div dup\n" + " radius \n" + " lt {/radiusx exch def}{pop /radiusx radius def} ifelse\n" + " \n" + " \n" + " \n" + " centers leftword get\n" + " y top-of-words add ey ed level mul add add\n" + " centers rightword get\n" + " y top-of-words add ey ed level mul add add\n" + " radiusx\n" + " arcto\n" + " 4 {pop} repeat\n" + " centers rightword get\n" + " y top-of-words add ey ed level mul add add\n" + " centers rightword get\n" + " y top-of-words add\n" + " radiusx\n" + " arcto\n" + " 4 {pop} repeat\n" + " centers rightword get\n" + " y top-of-words add\n" + " lineto\n" + "\n" + " stroke\n" + "\n" + " /radius-y ey ed level mul add def\n" + "\n" + " /center-arc-x\n" + " centers leftword get centers rightword get add 2 div\n" + " def\n" + " \n" + " /center-arc-y\n" + " y top-of-words radius-y add add\n" + " def\n" + "\n" + " /Courier-Bold findfont labelfontsize scalefont setfont \n" + " center-arc-x center-arc-y string clear-word-box\n" + " center-arc-x center-arc-y string show-string-centered\n" + " } forall\n" + " end\n" + " } def\n" + "\n" + "/diagramdict 20 dict def\n" + "\n" + "/diagram\n" + "{diagramdict begin\n" + " /break-words exch def\n" + " /links exch def\n" + " /words exch def\n" + " /n words length def\n" + " /n-rows break-words length def\n" + " /Times-Roman findfont wordfontsize scalefont setfont\n" + "\n" + " /left-ends [0 dup words {stringwidth pop add gap add dup}\n" + " forall pop pop] def\n" + " /right-ends [0 words {stringwidth pop add dup gap add} forall pop] def\n" + "\n" + " /lwindows [ break-words {left-ends exch get gap 2 div 
sub } forall ] def\n" + " /rwindows [1 1 n-rows 1 sub {/i exch def\n" + " lwindows i get } for\n" + " right-ends n 1 sub get gap 2 div add\n" + " ] def\n" + "\n" + "\n" + " /max 0 def\n" + " 0 1 links length 1 sub {\n" + " /i exch def\n" + " /t links i get 2 get def\n" + " t max gt {/max t def} if\n" + " } for\n" + "\n" + " /max-height ed max mul ey add top-of-words add row-spacing add def\n" + " /total-height n-rows max-height mul row-spacing sub def\n" + "\n" + " /max-width 0 def % compute the widest window\n" + " 0 1 n-rows 1 sub {\n" + " /i exch def\n" + " /t rwindows i get lwindows i get sub def\n" + " t max-width gt {/max-width t def} if\n" + " } for\n" + "\n" + " centerpage max-width 2 div sub 0 translate % centers it\n" + " % rightpage max-width sub 0 translate % right justified\n" + " % Delete both of these to make it left justified\n" + "\n" + " n-rows 1 sub -1 0\n" + " {/i exch def\n" + " gsave\n" + " newpath\n" + " %/centering centerpage rwindows i get lwindows i get sub 2 div sub def\n" + " % this line causes each row to be centered\n" + " /centering 0 def\n" + " % set centering to 0 to prevent centering of each row \n" + "\n" + " centering -100 moveto % -100 because some letters go below zero\n" + " centering max-height n-rows mul lineto\n" + " rwindows i get lwindows i get sub centering add\n" + " max-height n-rows mul lineto\n" + " rwindows i get lwindows i get sub centering add\n" + " -100 lineto\n" + " closepath\n" + " clip\n" + " lwindows i get neg n-rows i sub 1 sub max-height mul translate\n" + " centerpage centering 0 translate\n" + " words links diagram-sentence-circle\n" + " grestore\n" + " } for\n" + " end\n" + "} def \n" + "\n" + "/diagramx\n" + "{diagramdict begin\n" + " /break-words exch def\n" + " /links exch def\n" + " /words exch def\n" + " /n words length def\n" + " /n-rows break-words length def\n" + " /Times-Roman findfont xwordfontsize scalefont setfont\n" + "\n" + " /left-ends [0 dup words {stringwidth pop add gap add dup}\n" + " 
forall pop pop] def\n" + " /right-ends [0 words {stringwidth pop add dup gap add} forall pop] def\n" + "\n" + " /lwindows [ break-words {left-ends exch get gap 2 div sub } forall ] def\n" + " /rwindows [1 1 n-rows 1 sub {/i exch def\n" + " lwindows i get } for\n" + " right-ends n 1 sub get xgap 2 div add\n" + " ] def\n" + "\n" + "\n" + " /max 0 def\n" + " 0 1 links length 1 sub {\n" + " /i exch def\n" + " /t links i get 2 get def\n" + " t max gt {/max t def} if\n" + " } for\n" + "\n" + " /max-height xed max mul xey add top-of-words add xrow-spacing add def\n" + " /total-height n-rows max-height mul xrow-spacing sub def\n" + "\n" + " /max-width 0 def % compute the widest window\n" + " 0 1 n-rows 1 sub {\n" + " /i exch def\n" + " /t rwindows i get lwindows i get sub def\n" + " t max-width gt {/max-width t def} if\n" + " } for\n" + "\n" + " centerpage max-width 2 div sub 0 translate % centers it\n" + " % rightpage max-width sub 0 translate % right justified\n" + " % Delete both of these to make it left justified\n" + "\n" + " n-rows 1 sub -1 0\n" + " {/i exch def\n" + " gsave\n" + " newpath\n" + " %/centering centerpage rwindows i get lwindows i get sub 2 div sub def\n" + " % this line causes each row to be centered\n" + " /centering 0 def\n" + " % set centering to 0 to prevent centering of each row \n" + "\n" + " centering -100 moveto % -100 because some letters go below zero\n" + " centering max-height n-rows mul lineto\n" + " rwindows i get lwindows i get sub centering add\n" + " max-height n-rows mul lineto\n" + " rwindows i get lwindows i get sub centering add\n" + " -100 lineto\n" + " closepath\n" + " clip\n" + " lwindows i get neg n-rows i sub 1 sub max-height mul translate\n" + " centerpage centering 0 translate\n" + " words links diagram-sentence-circle\n" + " grestore\n" + " } for\n" + " end\n" + "} def \n" + "\n" + "/ldiagram\n" + "{diagramdict begin\n" + " /break-words exch def\n" + " /links exch def\n" + " /words exch def\n" + " /n words length def\n" + " 
/n-rows break-words length def\n" + " /Times-Roman findfont wordfontsize scalefont setfont\n" + "\n" + " /left-ends [0 dup words {stringwidth pop add gap add dup}\n" + " forall pop pop] def\n" + " /right-ends [0 words {stringwidth pop add dup gap add} forall pop] def\n" + "\n" + " /lwindows [ break-words {left-ends exch get gap 2 div sub } forall ] def\n" + " /rwindows [1 1 n-rows 1 sub {/i exch def\n" + " lwindows i get } for\n" + " right-ends n 1 sub get gap 2 div add\n" + " ] def\n" + "\n" + "\n" + " /max 0 def\n" + " 0 1 links length 1 sub {\n" + " /i exch def\n" + " /t links i get 2 get def\n" + " t max gt {/max t def} if\n" + " } for\n" + "\n" + " /max-height ed max mul ey add top-of-words add row-spacing add def\n" + " /total-height n-rows max-height mul row-spacing sub def\n" + "\n" + " /max-width 0 def % compute the widest window\n" + " 0 1 n-rows 1 sub {\n" + " /i exch def\n" + " /t rwindows i get lwindows i get sub def\n" + " t max-width gt {/max-width t def} if\n" + " } for\n" + "\n" + " % centerpage max-width 2 div sub 0 translate % centers it\n" + " % rightpage max-width sub 0 translate % right justified\n" + " % Delete both of these to make it left justified\n" + "\n" + " n-rows 1 sub -1 0\n" + " {/i exch def\n" + " gsave\n" + " newpath\n" + " %/centering centerpage rwindows i get lwindows i get sub 2 div sub def\n" + " % this line causes each row to be centered\n" + " /centering 0 def\n" + " % set centering to 0 to prevent centering of each row \n" + "\n" + " centering -100 moveto % -100 because some letters go below zero\n" + " centering max-height n-rows mul lineto\n" + " rwindows i get lwindows i get sub centering add\n" + " max-height n-rows mul lineto\n" + " rwindows i get lwindows i get sub centering add\n" + " -100 lineto\n" + " closepath\n" + " clip\n" + " lwindows i get neg n-rows i sub 1 sub max-height mul translate\n" + " centerpage centering 0 translate\n" + " words links diagram-sentence-circle\n" + " grestore\n" + " } for\n" + " end\n" 
+ "} def \n" + ; + if (print_ps_header) return header_string; + else return ""; +} + +/** + * Print elements of the 2D-word-array produced for the parsers. + * + * - print_sentence_word_alternatives(s, sent, false, NULL, tokenpos) + * If a pointer to struct "tokenpos" is given, return through it the index of + * the first occurrence in the sentence of the given token. This is used to + * prevent duplicate information display for repeated morphemes (if there are + * multiples splits, each of several morphemes, otherwise some of them may + * repeat). + * + * - print_sentence_word_alternatives(s, sent, true, NULL, NULL) + * If debugprint is "true", this is a debug printout of the sentence. (The + * debug printouts are with level 0 because this function is invoked for debug + * on certain positive level.) + * + * + * - print_sentence_word_alternatives(s, sent, false, display_func, NULL) + * Iterate over the sentence words and their alternatives. Handle each + * alternative using the display_func function if it is supplied, or else (if it + * is NULL) just print them. It is used to display disjunct information when + * command !!word is used. + * FIXME In the current version (using Wordgraph) the "alternatives" in the + * word-array don't necessarily consist of real word alternatives. 
+ * + */ + +struct tokenpos /* First position of the given token - to prevent duplicates */ +{ + const char * token; + size_t wi; + size_t ai; +}; + +void print_sentence_word_alternatives(dyn_str *s, Sentence sent, bool debugprint, + char * (*display)(Dictionary, const char *), struct tokenpos * tokenpos) +{ + size_t wi; /* Internal sentence word index */ + size_t ai; /* Index of a word alternative */ + size_t sentlen = sent->length; /* Shortened if there is a right-wall */ + size_t first_sentence_word = 0; /* Used for skipping a left-wall */ + bool word_split = false; /* !!word got split */ + Dictionary dict = sent->dict; + + if (0 == sentlen) + { + /* It should not happen, but if it actually happens due to some + * strange conditions, it's better not to abort the program. */ + prt_error("Error: Sentence length is 0 (reason unknown)\n"); + return; + } + + if (debugprint || (NULL != tokenpos)) + ; /* Do nothing */ + else + { + /* For analyzing words we need to ignore the left/right walls */ + if (dict->left_wall_defined && + (0 == strcmp(sent->word[0].unsplit_word, LEFT_WALL_WORD))) + first_sentence_word = 1; + if (dict->right_wall_defined && + ((NULL != sent->word[sentlen-1].unsplit_word)) && + (0 == strcmp(sent->word[sentlen-1].unsplit_word, RIGHT_WALL_WORD))) + sentlen--; + + /* Find if a word got split. This is indicated by: + * 1. More than one word in the sentence + * (no check if it actually results from !!w1 w2 ...). + * 2. A word containing more than one alternative. 
*/ + if (sentlen - first_sentence_word > 1) + { + word_split = true; + } + else + { + for (wi=first_sentence_word; wiword[wi]; + + /* There should always be at least one alternative */ + assert((NULL != w.alternatives) && (NULL != w.alternatives[0]) && + ('\0' != w.alternatives[0][0]), "Missing alt for word %zu", wi); + + if (NULL != w.alternatives[1]) + { + word_split = true; + break; + } + } + } + /* "String", because it can be a word, morpheme, or (TODO) idiom */ + if (word_split && (NULL == display)) dyn_strcat(s, "String splits to:\n"); + /* We used to print the alternatives of the word here, one per line. + * In the current (Wordgraph) version, the alternatives may look + * like nonsense combination of tokens - not as the strict split + * possibilities of words as in previous versions. + * E.g.: For Hebrew word "הכלב", we now get these "alternatives": + * ה= כלב לב ב= + * ה= כ= ל= + * ה= כ= + * For "'50s," + * ' s s , + * '50 50 , + * '50s + * Clearly, this is not informative any more. Instead, one line with a + * list of tokens (without repetitions) is printed + * ה= כלב לב ב= כ= ל= + * + * FIXME Print the alternatives from the wordgraph. + */ + } + + /* Iterate over sentence input words */ + for (wi=first_sentence_word; wiword[wi]; + size_t w_start = wi; /* input word index */ + size_t max_nalt = 0; + +#if 0 /* In the Wordgraph version firstupper and post_quote don't exist. */ + if (debugprint) lgdebug(0, " word%d %c%c: %s\n ", + wi, w.firstupper ? 'C' : ' ', sent->post_quote[wi] ? 'Q' : ' ', +#endif + if (debugprint) append_string(s, " word%zu: %s\n", wi, w.unsplit_word); + + /* There should always be at least one alternative */ + assert((NULL != w.alternatives) && (NULL != w.alternatives[0]) && + ('\0' != w.alternatives[0][0]), "Missing alt for word %zu", wi); + + //err_msg(lg_Debug, "word%zu '%s' nalts %zu\n", + // wi, sent->word[wi].unsplit_word, altlen(sent->word[wi].alternatives)); + + for (wi = w_start; (wi == w_start) || + ((wi < sentlen) && (! 
sent->word[wi].unsplit_word)); wi++) + { + size_t nalt = altlen(sent->word[wi].alternatives); + + max_nalt = MAX(max_nalt, nalt); + } + + /* Iterate over alternatives */ + for (ai=0; ai < max_nalt; ai++) + { + if (debugprint) + { + if (0 < ai) dyn_strcat(s, "\n"); + append_string(s, " alt%zu:", ai); + } + + int missing = 0; + for (wi = w_start; (wi == w_start) || + ((wi < sentlen) && (! sent->word[wi].unsplit_word)); wi++) + { + size_t nalts = altlen(sent->word[wi].alternatives); + const char *wt; + + /* Some slots may be shorter then others. To avoid a + * misleading debug display, print "[missing]" for them, + * but only if there are tokens after them. */ + if (ai >= nalts) + { + missing++; + continue; + } + wt = sent->word[wi].alternatives[ai]; + + /* Don't display information again for the same word */ + if ((NULL != tokenpos) && (0 == strcmp(tokenpos->token, wt))) + { + tokenpos->wi = wi; + tokenpos->ai = ai; + return; + } + if (!debugprint) + { + struct tokenpos firstpos = { wt }; + + print_sentence_word_alternatives(s, sent, false, NULL, &firstpos); + if (((firstpos.wi != wi) || (firstpos.ai != ai)) && + firstpos.wi >= first_sentence_word) // allow !!LEFT_WORD + { + /* We encountered this token earlier */ + if (NULL != display) + lgdebug(6, "Skipping repeated %s\n", wt); + continue; + } + } + + if (debugprint) + { + while (missing-- > 0) append_string(s, " %s", "[missing]"); + + const char *opt_start = "", *opt_end = ""; + if (sent->word[wi].optional) + { + opt_start = "{"; + opt_end = "}"; + } + if ('\0' == wt[0]) wt = "'\\0'"; /* Reveal a bogus value. */ + append_string(s, " %s%s%s", opt_start, wt, opt_end); + } + + /* For now each word component is called "Token". + * TODO: Its type can be decoded and a more precise + * term (stem, prefix, etc.) can be used. + * Display the features of the token. 
*/ + if ((NULL == tokenpos) && (NULL != display)) + { + char *info = display(sent->dict, wt); + + if (NULL == info) return; + append_string(s, "Token \"%s\" ", wt); + /* Cannot use append_string() for "info" because it may be + * a multi-MB string due to !!* + * (printing all dictionary words). */ + dyn_strcat(s, info); + free(info); + } + else if (word_split) append_string(s, " %s", wt); + } + + } + wi--; + + /* Line separation after the tokens printed by "String splits to:". */ + if (word_split && (NULL == display)) dyn_strcat(s, "\n\n"); + + if (debugprint) dyn_strcat(s, "\n"); + } + if (debugprint) dyn_strcat(s, "\n"); +} + +// Use for debug and error printing. +void print_sentence_context(Sentence sent, dyn_str *outbuf) +{ + size_t i, j; + const char **a, **b; + + dyn_strcat(outbuf, + "\tFailing sentence contains the following words/morphemes:\n\t"); + for (i=0; ilength; i++) + { + for (a = sent->word[i].alternatives; NULL != *a; a++) + { + bool next_word = false; + + for (j=0; jlength; j++) + { + for (b = sent->word[j].alternatives; NULL != *b; b++) + { + /* print only the first occurrence. */ + if (0 == strcmp(*a, *b)) + { + next_word = true; + if (a != b) break; + dyn_strcat(outbuf, *a); + dyn_strcat(outbuf, " "); + break; + } + } + if (next_word) break; + } + } + } + dyn_strcat(outbuf, "\n"); +} diff -Nru link-grammar-5.3.16/link-grammar/print/print.h link-grammar-5.5.0/link-grammar/print/print.h --- link-grammar-5.3.16/link-grammar/print/print.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/print/print.h 2017-07-13 21:12:28.000000000 +0000 @@ -0,0 +1,30 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. 
*/ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _PRINT_H +#define _PRINT_H + +#include "print/print-util.h" // For dyn_str +#include "link-includes.h" + +#define LEFT_WALL_DISPLAY ("LEFT-WALL") /* the string to use to show the wall */ +#define RIGHT_WALL_DISPLAY ("RIGHT-WALL") /* the string to use to show the wall */ + +void print_disjunct_counts(Sentence sent); +struct tokenpos; +void print_sentence_word_alternatives(dyn_str *, Sentence, bool, + char * (*)(Dictionary, const char *), struct tokenpos *); + +// Used for debug/error printing +void print_sentence_context(Sentence, dyn_str*); + +#endif /* _PRINT_H */ diff -Nru link-grammar-5.3.16/link-grammar/print/print-util.c link-grammar-5.5.0/link-grammar/print/print-util.c --- link-grammar-5.3.16/link-grammar/print/print-util.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/print/print-util.c 2018-02-22 01:37:52.000000000 +0000 @@ -0,0 +1,247 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include +#include +#include +#include + +#include "print-util.h" +#include "utilities.h" +#include "wcwidth.h" + +/** + * Return the width, in text-column-widths, of the utf8-encoded + * string. 
This is needed when printing formatted strings. + * European languages will typically have widths equal to the + * `mblen` value below (returned by mbsrtowcs); they occupy one + * column-width per code-point. The CJK ideographs occupy two + * column-widths per code-point. No clue about what happens for + * Arabic, or others. See wcwidth.c for details. + */ +size_t utf8_strwidth(const char *s) +{ + size_t mblen; + +#ifdef _WIN32 + mblen = MultiByteToWideChar(CP_UTF8, 0, s, -1, NULL, 0) - 1; +#else + mblen = mbsrtowcs(NULL, &s, 0, NULL); +#endif + if ((int)mblen < 0) + { + prt_error("Warning: Error in utf8_strwidth(%s)\n", s); + return 1 /* XXX */; + } + + wchar_t *ws = alloca((mblen + 1) * sizeof(wchar_t)); + +#ifdef _WIN32 + MultiByteToWideChar(CP_UTF8, 0, s, -1, ws, mblen); +#else + mbstate_t mbss; + memset(&mbss, 0, sizeof(mbss)); + mbsrtowcs(ws, &s, mblen, &mbss); +#endif /* _WIN32 */ + + int glyph_width = 0; + for (size_t i = 0; i < mblen; i++) + { + int w = mk_wcwidth(ws[i]); + + // If w<0 then we do not know what the correct glyph + // width should be for this codepoint. Many terminals + // will print this with a weird boxed font that is two + // columns wide, showing the hex value in it. + if (w < 0) w = 2; + glyph_width += w; + } + return glyph_width; +} + +/** + * Return the width, in text-column-widths, of the utf8-encoded + * character. It will return a NEGATIVE VALUE if the character + * width is not known (no glyph for a valid UTF-8 codepoint) or + * if the input is not a valid UTF-8 character! + * + * The mbstate_t argument is not used, since we convert only from utf-8. + * FIXME: This function (along with other places that use mbrtowc()) needs + * to be fixed for Windows (utf-16 wchar_t). + * Use char32_t with mbrtoc32() instead of mbrtowc(). 
+ */ +int utf8_charwidth(const char *s) +{ + wchar_t wc; + + int n = mbrtowc(&wc, s, MB_LEN_MAX, NULL); + if (n == 0) return 0; + if (n < 0) + { + // prt_error("Error: charwidth(%s): mbrtowc() returned %d\n", s, n); + return -2 /* Yes, we want this! It signals the error! */; + } + + return mk_wcwidth(wc); +} + +/** + * Return the number of characters in the longest initial substring + * which has a text-column-width of not greater than max_width. + */ +size_t utf8_chars_in_width(const char *s, size_t max_width) +{ + size_t total_bytes = 0; + size_t glyph_width = 0; + int n = 0; + wchar_t wc; + + do + { + total_bytes += n; + n = mbrtowc(&wc, s+total_bytes, MB_LEN_MAX, NULL); + if (n == 0) break; + if (n < 0) + { + // Allow for double-column-wide box-font printing + // i.e. box with the hex code inside. + glyph_width += 2; + n = 1; + } + else + { + // If we are here, it was a valid UTF-8 code point, + // but we do not know the width of the corresponding + // glyph. Just like above, assume a double-wide box + // font will be printed. + int gw = mk_wcwidth(wc); + if (0 <= gw) + glyph_width += gw; + else + glyph_width += 2; + } + //printf("N %zu G %zu;", total_bytes, glyph_width); + } + while (glyph_width <= max_width); + // printf("\n"); + + return total_bytes; +} + +/* ============================================================= */ + +/** + * Append to a dynamic string with vprintf-like formatting. + * @return The number of appended bytes, or a negative value on error. + * + * Note: As in the rest of the LG library, we assume here C99 library + * compliance (without it, this code would be buggy). 
+ */ +int vappend_string(dyn_str * string, const char *fmt, va_list args) +{ +#define TMPLEN 1024 /* Big enough for a possible error message, see below */ + char temp_buffer[TMPLEN]; + char *temp_string = temp_buffer; + int templen; + va_list copy_args; + + va_copy(copy_args, args); + templen = vsnprintf(temp_string, TMPLEN, fmt, copy_args); + va_end(copy_args); + + if (templen < 0) goto error; + if (0) + { + if (fmt[0] == '(') { errno=2; goto error;} /* Test the error reporting. */ + } + + if (templen >= TMPLEN) + { + /* TMPLEN is too small - use a bigger buffer. This may happen + * when printing dictionary words using !! with a wildcard. */ + temp_string = alloca(templen+1); + templen = vsnprintf(temp_string, templen+1, fmt, args); + if (templen < 0) goto error; + } + va_end(args); + + patch_subscript_marks(temp_string); + dyn_strcat(string, temp_string); + return templen; + +error: + { + /* Some error has occurred */ + const char msg[] = "[vappend_string(): "; + strcpy(temp_buffer, msg); + strerror_r(errno, temp_buffer+sizeof(msg)-1, TMPLEN-sizeof(msg)); + strcat(temp_buffer, "]"); + dyn_strcat(string, temp_string); + va_end(args); + return templen; + } +} + +/** + * Append to a dynamic string with printf-like formatting. + * @return The number of appended bytes, or a negative value on error. + */ +int append_string(dyn_str * string, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + + return vappend_string(string, fmt, args); +} + +/** + * Append exactly one UTF-8 character to the string. + * Return the number of bytes to advance, until the + * next UTF-8 character. This might NOT be the same as + * number of bytes actually appended. Two things might + * happen: + * a) Invalid UTF-8 values are copied, but only one byte, + * followed by an additional blank. + * b) Valid UTF-8 code-points that do not have a known + * glyph are copied, followed by an additional blank. 
+ * These additional blanks allow proper printing of these + * two cases, by allowing the terminal to display with + * "box fonts" - boxes containing hex code, usually two + * column-widths wide. + */ +size_t append_utf8_char(dyn_str * string, const char * mbs) +{ + /* Copy exactly one multi-byte character to buf */ + char buf[12]; + int nb = utf8_charlen(mbs); + int n = nb; + if (n < 0) n = 1; // charlen is negative if it's not a valid UTF-8 + + assert(n<10, "Multi-byte character is too long!"); + strncpy(buf, mbs, n); + + // Whitespace pad if it's a bad value + if (nb < 0) { buf[n] = ' '; n++; } + + // Whitespace pad if not a known UTF-8 glyph. + if (0 < nb && utf8_charwidth(mbs) < 0) { buf[n] = ' '; n++; } + + // Null terminate. + buf[n] = 0; + dyn_strcat(string, buf); + + // How many bytes did we hoover in? + n = nb; + if (n < 0) n = 1; // advance exactly one byte, if invalid UTF-8 + return n; +} diff -Nru link-grammar-5.3.16/link-grammar/print/print-util.h link-grammar-5.5.0/link-grammar/print/print-util.h --- link-grammar-5.3.16/link-grammar/print/print-util.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/print/print-util.h 2018-01-23 08:54:14.000000000 +0000 @@ -0,0 +1,53 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ +#ifndef LG_PRINT_UTIL_H_ +#define LG_PRINT_UTIL_H_ + +#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4) +#define GNUC_PRINTF( format_idx, arg_idx ) \ + __attribute__((__format__ (__printf__, format_idx, arg_idx))) +#else +#define GNUC_PRINTF( format_idx, arg_idx ) +#endif + +#include +#include + +#include "dict-common/dict-defines.h" /* SUBSCRIPT_MARK, SUBSCRIPT_DOT */ +#include "utilities.h" + +int append_string(dyn_str *, const char *fmt, ...) GNUC_PRINTF(2,3); +int vappend_string(dyn_str *, const char *fmt, va_list args) + GNUC_PRINTF(2,0); +size_t append_utf8_char(dyn_str *, const char * mbs); +size_t utf8_chars_in_width(const char *, size_t); +int utf8_charwidth(const char *); + +static inline void patch_subscript_mark(char *s) +{ + s = strchr(s, SUBSCRIPT_MARK); + if (NULL != s) + *s = SUBSCRIPT_DOT; +} + +static inline void patch_subscript_marks(char *s) +{ + while (NULL != (s = strchr(s, SUBSCRIPT_MARK))) + *s = SUBSCRIPT_DOT; +} + +static inline int display_width(int width, const char *s) +{ + return width + strlen(s) - utf8_strwidth(s); +} + +#endif diff -Nru link-grammar-5.3.16/link-grammar/print/wcwidth.c link-grammar-5.5.0/link-grammar/print/wcwidth.c --- link-grammar-5.3.16/link-grammar/print/wcwidth.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/print/wcwidth.c 2017-05-28 11:14:43.000000000 +0000 @@ -0,0 +1,574 @@ +/* + * This is an implementation of wcwidth() and wcswidth() (defined in + * IEEE Std 1002.1-2001) for Unicode. + * + * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html + * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html + * + * In fixed-width output devices, Latin characters all occupy a single + * "cell" position of equal width, whereas ideographic CJK characters + * occupy two such cells. 
Interoperability between terminal-line + * applications and (teletype-style) character terminals using the + * UTF-8 encoding requires agreement on which character should advance + * the cursor by how many cell positions. No established formal + * standards exist at present on which Unicode character shall occupy + * how many cell positions on character terminals. These routines are + * a first attempt of defining such behavior based on simple rules + * applied to data provided by the Unicode Consortium. + * + * For some graphical characters, the Unicode standard explicitly + * defines a character-cell width via the definition of the East Asian + * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes. + * In all these cases, there is no ambiguity about which width a + * terminal shall use. For characters in the East Asian Ambiguous (A) + * class, the width choice depends purely on a preference of backward + * compatibility with either historic CJK or Western practice. + * Choosing single-width for these characters is easy to justify as + * the appropriate long-term solution, as the CJK practice of + * displaying these characters as double-width comes from historic + * implementation simplicity (8-bit encoded characters were displayed + * single-width and 16-bit ones double-width, even for Greek, + * Cyrillic, etc.) and not any typographic considerations. + * + * Much less clear is the choice of width for the Not East Asian + * (Neutral) class. Existing practice does not dictate a width for any + * of these characters. It would nevertheless make sense + * typographically to allocate two character cells to characters such + * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be + * represented adequately with a single-width glyph. The following + * routines at present merely assign a single-cell width to all + * neutral characters, in the interest of simplicity. 
This is not + * entirely satisfactory and should be reconsidered before + * establishing a formal standard in this area. At the moment, the + * decision which Not East Asian (Neutral) characters should be + * represented by double-width glyphs cannot yet be answered by + * applying a simple rule from the Unicode database content. Setting + * up a proper standard for the behavior of UTF-8 character terminals + * will require a careful analysis not only of each Unicode character, + * but also of each presentation form, something the author of these + * routines has avoided to do so far. + * + * http://www.unicode.org/unicode/reports/tr11/ + * + * Markus Kuhn -- 2007-05-26 (Unicode 5.0) + * Copyright (c) 2014 Jeff Quast + * Copyright (C) Fredrik Fornwall 2016. + * Copyright (c) 2017 Linas Vepstas - updated for Unicode 9. + * + * Original version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c + * + * Updated with latest Unicode-9 tables from + * https://github.com/termux/wcwidth/blob/master/wcwidth.c + */ + +#include +#include +#include "wcwidth.h" + +struct interval { + int first; + int last; +}; + +/* auxiliary function for binary search in interval table */ +static bool bisearch(wchar_t ucs, const struct interval *table, int max) +{ + int min = 0; + int mid; + + if (ucs < table[0].first || ucs > table[max].last) + return false; + while (max >= min) { + mid = (min + max) / 2; + if (ucs > table[mid].last) + min = mid + 1; + else if (ucs < table[mid].first) + max = mid - 1; + else + return true; + } + + return false; +} + + +/* The following two functions define the column width of an ISO 10646 + * character as follows: + * + * - The null character (U+0000) has a column width of 0. + * + * - Other C0/C1 control characters and DEL will lead to a return + * value of -1. + * + * - Non-spacing and enclosing combining characters (general + * category code Mn or Me in the Unicode database) have a + * column width of 0. + * + * - SOFT HYPHEN (U+00AD) has a column width of 1. 
+ * + * - Other format characters (general category code Cf in the Unicode + * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0. + * + * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) + * have a column width of 0. + * + * - Spacing characters in the East Asian Wide (W) or East Asian + * Full-width (F) category as defined in Unicode Technical + * Report #11 have a column width of 2. + * + * - All remaining characters (including all printable + * ISO 8859-1 and WGL4 characters, Unicode control characters, + * etc.) have a column width of 1. + * + * This implementation assumes that wchar_t characters are encoded + * in ISO 10646. + */ + +int mk_wcwidth(wchar_t ucs) +{ + /* sorted list of non-overlapping intervals of non-spacing characters */ + /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ + + // From https://github.com/jquast/wcwidth/blob/master/wcwidth/table_zero.py + // at commit 0d7de112202cc8b2ebe9232ff4a5c954f19d561a (2016-07-02): + // Original source: DerivedGeneralCategory-9.0.0.txt + static const struct interval combining[] = { + {0x0300, 0x036f}, // Combining Grave Accent ..Combining Latin Small Le + {0x0483, 0x0489}, // Combining Cyrillic Titlo..Combining Cyrillic Milli + {0x0591, 0x05bd}, // Hebrew Accent Etnahta ..Hebrew Point Meteg + {0x05bf, 0x05bf}, // Hebrew Point Rafe ..Hebrew Point Rafe + {0x05c1, 0x05c2}, // Hebrew Point Shin Dot ..Hebrew Point Sin Dot + {0x05c4, 0x05c5}, // Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot + {0x05c7, 0x05c7}, // Hebrew Point Qamats Qata..Hebrew Point Qamats Qata + {0x0610, 0x061a}, // Arabic Sign Sallallahou ..Arabic Small Kasra + {0x064b, 0x065f}, // Arabic Fathatan ..Arabic Wavy Hamza Below + {0x0670, 0x0670}, // Arabic Letter Superscrip..Arabic Letter Superscrip + {0x06d6, 0x06dc}, // Arabic Small High Ligatu..Arabic Small High Seen + {0x06df, 0x06e4}, // Arabic Small High Rounde..Arabic Small High Madda + {0x06e7, 0x06e8}, // Arabic Small High Yeh 
..Arabic Small High Noon + {0x06ea, 0x06ed}, // Arabic Empty Centre Low ..Arabic Small Low Meem + {0x0711, 0x0711}, // Syriac Letter Superscrip..Syriac Letter Superscrip + {0x0730, 0x074a}, // Syriac Pthaha Above ..Syriac Barrekh + {0x07a6, 0x07b0}, // Thaana Abafili ..Thaana Sukun + {0x07eb, 0x07f3}, // Nko Combining Sh||t High..Nko Combining Double Dot + {0x0816, 0x0819}, // Samaritan Mark In ..Samaritan Mark Dagesh + {0x081b, 0x0823}, // Samaritan Mark Epentheti..Samaritan Vowel Sign A + {0x0825, 0x0827}, // Samaritan Vowel Sign Sho..Samaritan Vowel Sign U + {0x0829, 0x082d}, // Samaritan Vowel Sign Lon..Samaritan Mark Nequdaa + {0x0859, 0x085b}, // Mandaic Affrication Mark..Mandaic Gemination Mark + {0x08d4, 0x08e1}, // (nil) .. + {0x08e3, 0x0902}, // Arabic Turned Damma Belo..Devanagari Sign Anusvara + {0x093a, 0x093a}, // Devanagari Vowel Sign Oe..Devanagari Vowel Sign Oe + {0x093c, 0x093c}, // Devanagari Sign Nukta ..Devanagari Sign Nukta + {0x0941, 0x0948}, // Devanagari Vowel Sign U ..Devanagari Vowel Sign Ai + {0x094d, 0x094d}, // Devanagari Sign Virama ..Devanagari Sign Virama + {0x0951, 0x0957}, // Devanagari Stress Sign U..Devanagari Vowel Sign Uu + {0x0962, 0x0963}, // Devanagari Vowel Sign Vo..Devanagari Vowel Sign Vo + {0x0981, 0x0981}, // Bengali Sign Candrabindu..Bengali Sign Candrabindu + {0x09bc, 0x09bc}, // Bengali Sign Nukta ..Bengali Sign Nukta + {0x09c1, 0x09c4}, // Bengali Vowel Sign U ..Bengali Vowel Sign Vocal + {0x09cd, 0x09cd}, // Bengali Sign Virama ..Bengali Sign Virama + {0x09e2, 0x09e3}, // Bengali Vowel Sign Vocal..Bengali Vowel Sign Vocal + {0x0a01, 0x0a02}, // Gurmukhi Sign Adak Bindi..Gurmukhi Sign Bindi + {0x0a3c, 0x0a3c}, // Gurmukhi Sign Nukta ..Gurmukhi Sign Nukta + {0x0a41, 0x0a42}, // Gurmukhi Vowel Sign U ..Gurmukhi Vowel Sign Uu + {0x0a47, 0x0a48}, // Gurmukhi Vowel Sign Ee ..Gurmukhi Vowel Sign Ai + {0x0a4b, 0x0a4d}, // Gurmukhi Vowel Sign Oo ..Gurmukhi Sign Virama + {0x0a51, 0x0a51}, // Gurmukhi Sign Udaat ..Gurmukhi 
Sign Udaat + {0x0a70, 0x0a71}, // Gurmukhi Tippi ..Gurmukhi Addak + {0x0a75, 0x0a75}, // Gurmukhi Sign Yakash ..Gurmukhi Sign Yakash + {0x0a81, 0x0a82}, // Gujarati Sign Candrabind..Gujarati Sign Anusvara + {0x0abc, 0x0abc}, // Gujarati Sign Nukta ..Gujarati Sign Nukta + {0x0ac1, 0x0ac5}, // Gujarati Vowel Sign U ..Gujarati Vowel Sign Cand + {0x0ac7, 0x0ac8}, // Gujarati Vowel Sign E ..Gujarati Vowel Sign Ai + {0x0acd, 0x0acd}, // Gujarati Sign Virama ..Gujarati Sign Virama + {0x0ae2, 0x0ae3}, // Gujarati Vowel Sign Voca..Gujarati Vowel Sign Voca + {0x0b01, 0x0b01}, // ||iya Sign Candrabindu ..||iya Sign Candrabindu + {0x0b3c, 0x0b3c}, // ||iya Sign Nukta ..||iya Sign Nukta + {0x0b3f, 0x0b3f}, // ||iya Vowel Sign I ..||iya Vowel Sign I + {0x0b41, 0x0b44}, // ||iya Vowel Sign U ..||iya Vowel Sign Vocalic + {0x0b4d, 0x0b4d}, // ||iya Sign Virama ..||iya Sign Virama + {0x0b56, 0x0b56}, // ||iya Ai Length Mark ..||iya Ai Length Mark + {0x0b62, 0x0b63}, // ||iya Vowel Sign Vocalic..||iya Vowel Sign Vocalic + {0x0b82, 0x0b82}, // Tamil Sign Anusvara ..Tamil Sign Anusvara + {0x0bc0, 0x0bc0}, // Tamil Vowel Sign Ii ..Tamil Vowel Sign Ii + {0x0bcd, 0x0bcd}, // Tamil Sign Virama ..Tamil Sign Virama + {0x0c00, 0x0c00}, // Telugu Sign Combining Ca..Telugu Sign Combining Ca + {0x0c3e, 0x0c40}, // Telugu Vowel Sign Aa ..Telugu Vowel Sign Ii + {0x0c46, 0x0c48}, // Telugu Vowel Sign E ..Telugu Vowel Sign Ai + {0x0c4a, 0x0c4d}, // Telugu Vowel Sign O ..Telugu Sign Virama + {0x0c55, 0x0c56}, // Telugu Length Mark ..Telugu Ai Length Mark + {0x0c62, 0x0c63}, // Telugu Vowel Sign Vocali..Telugu Vowel Sign Vocali + {0x0c81, 0x0c81}, // Kannada Sign Candrabindu..Kannada Sign Candrabindu + {0x0cbc, 0x0cbc}, // Kannada Sign Nukta ..Kannada Sign Nukta + {0x0cbf, 0x0cbf}, // Kannada Vowel Sign I ..Kannada Vowel Sign I + {0x0cc6, 0x0cc6}, // Kannada Vowel Sign E ..Kannada Vowel Sign E + {0x0ccc, 0x0ccd}, // Kannada Vowel Sign Au ..Kannada Sign Virama + {0x0ce2, 0x0ce3}, // Kannada Vowel Sign 
Vocal..Kannada Vowel Sign Vocal + {0x0d01, 0x0d01}, // Malayalam Sign Candrabin..Malayalam Sign Candrabin + {0x0d41, 0x0d44}, // Malayalam Vowel Sign U ..Malayalam Vowel Sign Voc + {0x0d4d, 0x0d4d}, // Malayalam Sign Virama ..Malayalam Sign Virama + {0x0d62, 0x0d63}, // Malayalam Vowel Sign Voc..Malayalam Vowel Sign Voc + {0x0dca, 0x0dca}, // Sinhala Sign Al-lakuna ..Sinhala Sign Al-lakuna + {0x0dd2, 0x0dd4}, // Sinhala Vowel Sign Ketti..Sinhala Vowel Sign Ketti + {0x0dd6, 0x0dd6}, // Sinhala Vowel Sign Diga ..Sinhala Vowel Sign Diga + {0x0e31, 0x0e31}, // Thai Character Mai Han-a..Thai Character Mai Han-a + {0x0e34, 0x0e3a}, // Thai Character Sara I ..Thai Character Phinthu + {0x0e47, 0x0e4e}, // Thai Character Maitaikhu..Thai Character Yamakkan + {0x0eb1, 0x0eb1}, // Lao Vowel Sign Mai Kan ..Lao Vowel Sign Mai Kan + {0x0eb4, 0x0eb9}, // Lao Vowel Sign I ..Lao Vowel Sign Uu + {0x0ebb, 0x0ebc}, // Lao Vowel Sign Mai Kon ..Lao Semivowel Sign Lo + {0x0ec8, 0x0ecd}, // Lao Tone Mai Ek ..Lao Niggahita + {0x0f18, 0x0f19}, // Tibetan Astrological Sig..Tibetan Astrological Sig + {0x0f35, 0x0f35}, // Tibetan Mark Ngas Bzung ..Tibetan Mark Ngas Bzung + {0x0f37, 0x0f37}, // Tibetan Mark Ngas Bzung ..Tibetan Mark Ngas Bzung + {0x0f39, 0x0f39}, // Tibetan Mark Tsa -phru ..Tibetan Mark Tsa -phru + {0x0f71, 0x0f7e}, // Tibetan Vowel Sign Aa ..Tibetan Sign Rjes Su Nga + {0x0f80, 0x0f84}, // Tibetan Vowel Sign Rever..Tibetan Mark Halanta + {0x0f86, 0x0f87}, // Tibetan Sign Lci Rtags ..Tibetan Sign Yang Rtags + {0x0f8d, 0x0f97}, // Tibetan Subjoined Sign L..Tibetan Subjoined Letter + {0x0f99, 0x0fbc}, // Tibetan Subjoined Letter..Tibetan Subjoined Letter + {0x0fc6, 0x0fc6}, // Tibetan Symbol Padma Gda..Tibetan Symbol Padma Gda + {0x102d, 0x1030}, // Myanmar Vowel Sign I ..Myanmar Vowel Sign Uu + {0x1032, 0x1037}, // Myanmar Vowel Sign Ai ..Myanmar Sign Dot Below + {0x1039, 0x103a}, // Myanmar Sign Virama ..Myanmar Sign Asat + {0x103d, 0x103e}, // Myanmar Consonant Sign M..Myanmar 
Consonant Sign M + {0x1058, 0x1059}, // Myanmar Vowel Sign Vocal..Myanmar Vowel Sign Vocal + {0x105e, 0x1060}, // Myanmar Consonant Sign M..Myanmar Consonant Sign M + {0x1071, 0x1074}, // Myanmar Vowel Sign Geba ..Myanmar Vowel Sign Kayah + {0x1082, 0x1082}, // Myanmar Consonant Sign S..Myanmar Consonant Sign S + {0x1085, 0x1086}, // Myanmar Vowel Sign Shan ..Myanmar Vowel Sign Shan + {0x108d, 0x108d}, // Myanmar Sign Shan Counci..Myanmar Sign Shan Counci + {0x109d, 0x109d}, // Myanmar Vowel Sign Aiton..Myanmar Vowel Sign Aiton + {0x135d, 0x135f}, // Ethiopic Combining Gemin..Ethiopic Combining Gemin + {0x1712, 0x1714}, // Tagalog Vowel Sign I ..Tagalog Sign Virama + {0x1732, 0x1734}, // Hanunoo Vowel Sign I ..Hanunoo Sign Pamudpod + {0x1752, 0x1753}, // Buhid Vowel Sign I ..Buhid Vowel Sign U + {0x1772, 0x1773}, // Tagbanwa Vowel Sign I ..Tagbanwa Vowel Sign U + {0x17b4, 0x17b5}, // Khmer Vowel Inherent Aq ..Khmer Vowel Inherent Aa + {0x17b7, 0x17bd}, // Khmer Vowel Sign I ..Khmer Vowel Sign Ua + {0x17c6, 0x17c6}, // Khmer Sign Nikahit ..Khmer Sign Nikahit + {0x17c9, 0x17d3}, // Khmer Sign Muusikatoan ..Khmer Sign Bathamasat + {0x17dd, 0x17dd}, // Khmer Sign Atthacan ..Khmer Sign Atthacan + {0x180b, 0x180d}, // Mongolian Free Variation..Mongolian Free Variation + {0x1885, 0x1886}, // Mongolian Letter Ali Gal..Mongolian Letter Ali Gal + {0x18a9, 0x18a9}, // Mongolian Letter Ali Gal..Mongolian Letter Ali Gal + {0x1920, 0x1922}, // Limbu Vowel Sign A ..Limbu Vowel Sign U + {0x1927, 0x1928}, // Limbu Vowel Sign E ..Limbu Vowel Sign O + {0x1932, 0x1932}, // Limbu Small Letter Anusv..Limbu Small Letter Anusv + {0x1939, 0x193b}, // Limbu Sign Mukphreng ..Limbu Sign Sa-i + {0x1a17, 0x1a18}, // Buginese Vowel Sign I ..Buginese Vowel Sign U + {0x1a1b, 0x1a1b}, // Buginese Vowel Sign Ae ..Buginese Vowel Sign Ae + {0x1a56, 0x1a56}, // Tai Tham Consonant Sign ..Tai Tham Consonant Sign + {0x1a58, 0x1a5e}, // Tai Tham Sign Mai Kang L..Tai Tham Consonant Sign + {0x1a60, 0x1a60}, 
// Tai Tham Sign Sakot ..Tai Tham Sign Sakot + {0x1a62, 0x1a62}, // Tai Tham Vowel Sign Mai ..Tai Tham Vowel Sign Mai + {0x1a65, 0x1a6c}, // Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B + {0x1a73, 0x1a7c}, // Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue + {0x1a7f, 0x1a7f}, // Tai Tham Combining Crypt..Tai Tham Combining Crypt + {0x1ab0, 0x1abe}, // Combining Doubled Circum..Combining Parentheses Ov + {0x1b00, 0x1b03}, // Balinese Sign Ulu Ricem ..Balinese Sign Surang + {0x1b34, 0x1b34}, // Balinese Sign Rerekan ..Balinese Sign Rerekan + {0x1b36, 0x1b3a}, // Balinese Vowel Sign Ulu ..Balinese Vowel Sign Ra R + {0x1b3c, 0x1b3c}, // Balinese Vowel Sign La L..Balinese Vowel Sign La L + {0x1b42, 0x1b42}, // Balinese Vowel Sign Pepe..Balinese Vowel Sign Pepe + {0x1b6b, 0x1b73}, // Balinese Musical Symbol ..Balinese Musical Symbol + {0x1b80, 0x1b81}, // Sundanese Sign Panyecek ..Sundanese Sign Panglayar + {0x1ba2, 0x1ba5}, // Sundanese Consonant Sign..Sundanese Vowel Sign Pan + {0x1ba8, 0x1ba9}, // Sundanese Vowel Sign Pam..Sundanese Vowel Sign Pan + {0x1bab, 0x1bad}, // Sundanese Sign Virama ..Sundanese Consonant Sign + {0x1be6, 0x1be6}, // Batak Sign Tompi ..Batak Sign Tompi + {0x1be8, 0x1be9}, // Batak Vowel Sign Pakpak ..Batak Vowel Sign Ee + {0x1bed, 0x1bed}, // Batak Vowel Sign Karo O ..Batak Vowel Sign Karo O + {0x1bef, 0x1bf1}, // Batak Vowel Sign U F|| S..Batak Consonant Sign H + {0x1c2c, 0x1c33}, // Lepcha Vowel Sign E ..Lepcha Consonant Sign T + {0x1c36, 0x1c37}, // Lepcha Sign Ran ..Lepcha Sign Nukta + {0x1cd0, 0x1cd2}, // Vedic Tone Karshana ..Vedic Tone Prenkha + {0x1cd4, 0x1ce0}, // Vedic Sign Yajurvedic Mi..Vedic Tone Rigvedic Kash + {0x1ce2, 0x1ce8}, // Vedic Sign Visarga Svari..Vedic Sign Visarga Anuda + {0x1ced, 0x1ced}, // Vedic Sign Tiryak ..Vedic Sign Tiryak + {0x1cf4, 0x1cf4}, // Vedic Tone Candra Above ..Vedic Tone Candra Above + {0x1cf8, 0x1cf9}, // Vedic Tone Ring Above ..Vedic Tone Double Ring A + {0x1dc0, 0x1df5}, // Combining Dotted 
Grave A..Combining Up Tack Above + {0x1dfb, 0x1dff}, // (nil) ..Combining Right Arrowhea + {0x20d0, 0x20f0}, // Combining Left Harpoon A..Combining Asterisk Above + {0x2cef, 0x2cf1}, // Coptic Combining Ni Abov..Coptic Combining Spiritu + {0x2d7f, 0x2d7f}, // Tifinagh Consonant Joine..Tifinagh Consonant Joine + {0x2de0, 0x2dff}, // Combining Cyrillic Lette..Combining Cyrillic Lette + {0x302a, 0x302d}, // Ideographic Level Tone M..Ideographic Entering Ton + {0x3099, 0x309a}, // Combining Katakana-hirag..Combining Katakana-hirag + {0xa66f, 0xa672}, // Combining Cyrillic Vzmet..Combining Cyrillic Thous + {0xa674, 0xa67d}, // Combining Cyrillic Lette..Combining Cyrillic Payer + {0xa69e, 0xa69f}, // Combining Cyrillic Lette..Combining Cyrillic Lette + {0xa6f0, 0xa6f1}, // Bamum Combining Mark Koq..Bamum Combining Mark Tuk + {0xa802, 0xa802}, // Syloti Nagri Sign Dvisva..Syloti Nagri Sign Dvisva + {0xa806, 0xa806}, // Syloti Nagri Sign Hasant..Syloti Nagri Sign Hasant + {0xa80b, 0xa80b}, // Syloti Nagri Sign Anusva..Syloti Nagri Sign Anusva + {0xa825, 0xa826}, // Syloti Nagri Vowel Sign ..Syloti Nagri Vowel Sign + {0xa8c4, 0xa8c5}, // Saurashtra Sign Virama .. 
+ {0xa8e0, 0xa8f1}, // Combining Devanagari Dig..Combining Devanagari Sig + {0xa926, 0xa92d}, // Kayah Li Vowel Ue ..Kayah Li Tone Calya Plop + {0xa947, 0xa951}, // Rejang Vowel Sign I ..Rejang Consonant Sign R + {0xa980, 0xa982}, // Javanese Sign Panyangga ..Javanese Sign Layar + {0xa9b3, 0xa9b3}, // Javanese Sign Cecak Telu..Javanese Sign Cecak Telu + {0xa9b6, 0xa9b9}, // Javanese Vowel Sign Wulu..Javanese Vowel Sign Suku + {0xa9bc, 0xa9bc}, // Javanese Vowel Sign Pepe..Javanese Vowel Sign Pepe + {0xa9e5, 0xa9e5}, // Myanmar Sign Shan Saw ..Myanmar Sign Shan Saw + {0xaa29, 0xaa2e}, // Cham Vowel Sign Aa ..Cham Vowel Sign Oe + {0xaa31, 0xaa32}, // Cham Vowel Sign Au ..Cham Vowel Sign Ue + {0xaa35, 0xaa36}, // Cham Consonant Sign La ..Cham Consonant Sign Wa + {0xaa43, 0xaa43}, // Cham Consonant Sign Fina..Cham Consonant Sign Fina + {0xaa4c, 0xaa4c}, // Cham Consonant Sign Fina..Cham Consonant Sign Fina + {0xaa7c, 0xaa7c}, // Myanmar Sign Tai Laing T..Myanmar Sign Tai Laing T + {0xaab0, 0xaab0}, // Tai Viet Mai Kang ..Tai Viet Mai Kang + {0xaab2, 0xaab4}, // Tai Viet Vowel I ..Tai Viet Vowel U + {0xaab7, 0xaab8}, // Tai Viet Mai Khit ..Tai Viet Vowel Ia + {0xaabe, 0xaabf}, // Tai Viet Vowel Am ..Tai Viet Tone Mai Ek + {0xaac1, 0xaac1}, // Tai Viet Tone Mai Tho ..Tai Viet Tone Mai Tho + {0xaaec, 0xaaed}, // Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign + {0xaaf6, 0xaaf6}, // Meetei Mayek Virama ..Meetei Mayek Virama + {0xabe5, 0xabe5}, // Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign + {0xabe8, 0xabe8}, // Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign + {0xabed, 0xabed}, // Meetei Mayek Apun Iyek ..Meetei Mayek Apun Iyek + {0xfb1e, 0xfb1e}, // Hebrew Point Judeo-spani..Hebrew Point Judeo-spani + {0xfe00, 0xfe0f}, // Variation Select||-1 ..Variation Select||-16 + {0xfe20, 0xfe2f}, // Combining Ligature Left ..Combining Cyrillic Titlo + {0x101fd, 0x101fd}, // Phaistos Disc Sign Combi..Phaistos Disc Sign Combi + {0x102e0, 0x102e0}, // Coptic Epact Thousands 
M..Coptic Epact Thousands M + {0x10376, 0x1037a}, // Combining Old Permic Let..Combining Old Permic Let + {0x10a01, 0x10a03}, // Kharoshthi Vowel Sign I ..Kharoshthi Vowel Sign Vo + {0x10a05, 0x10a06}, // Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O + {0x10a0c, 0x10a0f}, // Kharoshthi Vowel Length ..Kharoshthi Sign Visarga + {0x10a38, 0x10a3a}, // Kharoshthi Sign Bar Abov..Kharoshthi Sign Dot Belo + {0x10a3f, 0x10a3f}, // Kharoshthi Virama ..Kharoshthi Virama + {0x10ae5, 0x10ae6}, // Manichaean Abbreviation ..Manichaean Abbreviation + {0x11001, 0x11001}, // Brahmi Sign Anusvara ..Brahmi Sign Anusvara + {0x11038, 0x11046}, // Brahmi Vowel Sign Aa ..Brahmi Virama + {0x1107f, 0x11081}, // Brahmi Number Joiner ..Kaithi Sign Anusvara + {0x110b3, 0x110b6}, // Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai + {0x110b9, 0x110ba}, // Kaithi Sign Virama ..Kaithi Sign Nukta + {0x11100, 0x11102}, // Chakma Sign Candrabindu ..Chakma Sign Visarga + {0x11127, 0x1112b}, // Chakma Vowel Sign A ..Chakma Vowel Sign Uu + {0x1112d, 0x11134}, // Chakma Vowel Sign Ai ..Chakma Maayyaa + {0x11173, 0x11173}, // Mahajani Sign Nukta ..Mahajani Sign Nukta + {0x11180, 0x11181}, // Sharada Sign Candrabindu..Sharada Sign Anusvara + {0x111b6, 0x111be}, // Sharada Vowel Sign U ..Sharada Vowel Sign O + {0x111ca, 0x111cc}, // Sharada Sign Nukta ..Sharada Extra Sh||t Vowe + {0x1122f, 0x11231}, // Khojki Vowel Sign U ..Khojki Vowel Sign Ai + {0x11234, 0x11234}, // Khojki Sign Anusvara ..Khojki Sign Anusvara + {0x11236, 0x11237}, // Khojki Sign Nukta ..Khojki Sign Shadda + {0x1123e, 0x1123e}, // (nil) .. 
+ {0x112df, 0x112df}, // Khudawadi Sign Anusvara ..Khudawadi Sign Anusvara + {0x112e3, 0x112ea}, // Khudawadi Vowel Sign U ..Khudawadi Sign Virama + {0x11300, 0x11301}, // Grantha Sign Combining A..Grantha Sign Candrabindu + {0x1133c, 0x1133c}, // Grantha Sign Nukta ..Grantha Sign Nukta + {0x11340, 0x11340}, // Grantha Vowel Sign Ii ..Grantha Vowel Sign Ii + {0x11366, 0x1136c}, // Combining Grantha Digit ..Combining Grantha Digit + {0x11370, 0x11374}, // Combining Grantha Letter..Combining Grantha Letter + {0x11438, 0x1143f}, // (nil) .. + {0x11442, 0x11444}, // (nil) .. + {0x11446, 0x11446}, // (nil) .. + {0x114b3, 0x114b8}, // Tirhuta Vowel Sign U ..Tirhuta Vowel Sign Vocal + {0x114ba, 0x114ba}, // Tirhuta Vowel Sign Sh||t..Tirhuta Vowel Sign Sh||t + {0x114bf, 0x114c0}, // Tirhuta Sign Candrabindu..Tirhuta Sign Anusvara + {0x114c2, 0x114c3}, // Tirhuta Sign Virama ..Tirhuta Sign Nukta + {0x115b2, 0x115b5}, // Siddham Vowel Sign U ..Siddham Vowel Sign Vocal + {0x115bc, 0x115bd}, // Siddham Sign Candrabindu..Siddham Sign Anusvara + {0x115bf, 0x115c0}, // Siddham Sign Virama ..Siddham Sign Nukta + {0x115dc, 0x115dd}, // Siddham Vowel Sign Alter..Siddham Vowel Sign Alter + {0x11633, 0x1163a}, // Modi Vowel Sign U ..Modi Vowel Sign Ai + {0x1163d, 0x1163d}, // Modi Sign Anusvara ..Modi Sign Anusvara + {0x1163f, 0x11640}, // Modi Sign Virama ..Modi Sign Ardhacandra + {0x116ab, 0x116ab}, // Takri Sign Anusvara ..Takri Sign Anusvara + {0x116ad, 0x116ad}, // Takri Vowel Sign Aa ..Takri Vowel Sign Aa + {0x116b0, 0x116b5}, // Takri Vowel Sign U ..Takri Vowel Sign Au + {0x116b7, 0x116b7}, // Takri Sign Nukta ..Takri Sign Nukta + {0x1171d, 0x1171f}, // Ahom Consonant Sign Medi..Ahom Consonant Sign Medi + {0x11722, 0x11725}, // Ahom Vowel Sign I ..Ahom Vowel Sign Uu + {0x11727, 0x1172b}, // Ahom Vowel Sign Aw ..Ahom Sign Killer + {0x11c30, 0x11c36}, // (nil) .. + {0x11c38, 0x11c3d}, // (nil) .. + {0x11c3f, 0x11c3f}, // (nil) .. + {0x11c92, 0x11ca7}, // (nil) .. 
+ {0x11caa, 0x11cb0}, // (nil) .. + {0x11cb2, 0x11cb3}, // (nil) .. + {0x11cb5, 0x11cb6}, // (nil) .. + {0x16af0, 0x16af4}, // Bassa Vah Combining High..Bassa Vah Combining High + {0x16b30, 0x16b36}, // Pahawh Hmong Mark Cim Tu..Pahawh Hmong Mark Cim Ta + {0x16f8f, 0x16f92}, // Miao Tone Right ..Miao Tone Below + {0x1bc9d, 0x1bc9e}, // Duployan Thick Letter Se..Duployan Double Mark + {0x1d167, 0x1d169}, // Musical Symbol Combining..Musical Symbol Combining + {0x1d17b, 0x1d182}, // Musical Symbol Combining..Musical Symbol Combining + {0x1d185, 0x1d18b}, // Musical Symbol Combining..Musical Symbol Combining + {0x1d1aa, 0x1d1ad}, // Musical Symbol Combining..Musical Symbol Combining + {0x1d242, 0x1d244}, // Combining Greek Musical ..Combining Greek Musical + {0x1da00, 0x1da36}, // Signwriting Head Rim ..Signwriting Air Sucking + {0x1da3b, 0x1da6c}, // Signwriting Mouth Closed..Signwriting Excitement + {0x1da75, 0x1da75}, // Signwriting Upper Body T..Signwriting Upper Body T + {0x1da84, 0x1da84}, // Signwriting Location Hea..Signwriting Location Hea + {0x1da9b, 0x1da9f}, // Signwriting Fill Modifie..Signwriting Fill Modifie + {0x1daa1, 0x1daaf}, // Signwriting Rotation Mod..Signwriting Rotation Mod + {0x1e000, 0x1e006}, // (nil) .. + {0x1e008, 0x1e018}, // (nil) .. + {0x1e01b, 0x1e021}, // (nil) .. + {0x1e023, 0x1e024}, // (nil) .. + {0x1e026, 0x1e02a}, // (nil) .. + {0x1e8d0, 0x1e8d6}, // Mende Kikakui Combining ..Mende Kikakui Combining + {0x1e944, 0x1e94a}, // (nil) .. 
+ {0xe0100, 0xe01ef}, // Variation Select||-17 ..Variation Select||-256 + }; + + // https://github.com/jquast/wcwidth/blob/master/wcwidth/table_wide.py + // at commit 0d7de112202cc8b2ebe9232ff4a5c954f19d561a (2016-07-02): + // Original source: EastAsianWidth-9.0.0.txt + static struct interval WIDE_GLYPH[] = { + {0x1100, 0x115f}, // Hangul Choseong Kiyeok ..Hangul Choseong Filler + {0x231a, 0x231b}, // Watch ..Hourglass + {0x2329, 0x232a}, // Left-pointing Angle Brac..Right-pointing Angle Bra + {0x23e9, 0x23ec}, // Black Right-pointing Dou..Black Down-pointing Doub + {0x23f0, 0x23f0}, // Alarm Clock ..Alarm Clock + {0x23f3, 0x23f3}, // Hourglass With Flowing S..Hourglass With Flowing S + {0x25fd, 0x25fe}, // White Medium Small Squar..Black Medium Small Squar + {0x2614, 0x2615}, // Umbrella With Rain Drops..Hot Beverage + {0x2648, 0x2653}, // Aries ..Pisces + {0x267f, 0x267f}, // Wheelchair Symbol ..Wheelchair Symbol + {0x2693, 0x2693}, // Anch|| ..Anch|| + {0x26a1, 0x26a1}, // High Voltage Sign ..High Voltage Sign + {0x26aa, 0x26ab}, // Medium White Circle ..Medium Black Circle + {0x26bd, 0x26be}, // Soccer Ball ..Baseball + {0x26c4, 0x26c5}, // Snowman Without Snow ..Sun Behind Cloud + {0x26ce, 0x26ce}, // Ophiuchus ..Ophiuchus + {0x26d4, 0x26d4}, // No Entry ..No Entry + {0x26ea, 0x26ea}, // Church ..Church + {0x26f2, 0x26f3}, // Fountain ..Flag In Hole + {0x26f5, 0x26f5}, // Sailboat ..Sailboat + {0x26fa, 0x26fa}, // Tent ..Tent + {0x26fd, 0x26fd}, // Fuel Pump ..Fuel Pump + {0x2705, 0x2705}, // White Heavy Check Mark ..White Heavy Check Mark + {0x270a, 0x270b}, // Raised Fist ..Raised Hand + {0x2728, 0x2728}, // Sparkles ..Sparkles + {0x274c, 0x274c}, // Cross Mark ..Cross Mark + {0x274e, 0x274e}, // Negative Squared Cross M..Negative Squared Cross M + {0x2753, 0x2755}, // Black Question Mark ||na..White Exclamation Mark O + {0x2757, 0x2757}, // Heavy Exclamation Mark S..Heavy Exclamation Mark S + {0x2795, 0x2797}, // Heavy Plus Sign ..Heavy Division Sign + 
{0x27b0, 0x27b0}, // Curly Loop ..Curly Loop + {0x27bf, 0x27bf}, // Double Curly Loop ..Double Curly Loop + {0x2b1b, 0x2b1c}, // Black Large Square ..White Large Square + {0x2b50, 0x2b50}, // White Medium Star ..White Medium Star + {0x2b55, 0x2b55}, // Heavy Large Circle ..Heavy Large Circle + {0x2e80, 0x2e99}, // Cjk Radical Repeat ..Cjk Radical Rap + {0x2e9b, 0x2ef3}, // Cjk Radical Choke ..Cjk Radical C-simplified + {0x2f00, 0x2fd5}, // Kangxi Radical One ..Kangxi Radical Flute + {0x2ff0, 0x2ffb}, // Ideographic Description ..Ideographic Description + {0x3000, 0x303e}, // Ideographic Space ..Ideographic Variation In + {0x3041, 0x3096}, // Hiragana Letter Small A ..Hiragana Letter Small Ke + {0x3099, 0x30ff}, // Combining Katakana-hirag..Katakana Digraph Koto + {0x3105, 0x312d}, // Bopomofo Letter B ..Bopomofo Letter Ih + {0x3131, 0x318e}, // Hangul Letter Kiyeok ..Hangul Letter Araeae + {0x3190, 0x31ba}, // Ideographic Annotation L..Bopomofo Letter Zy + {0x31c0, 0x31e3}, // Cjk Stroke T ..Cjk Stroke Q + {0x31f0, 0x321e}, // Katakana Letter Small Ku..Parenthesized K||ean Cha + {0x3220, 0x3247}, // Parenthesized Ideograph ..Circled Ideograph Koto + {0x3250, 0x32fe}, // Partnership Sign ..Circled Katakana Wo + {0x3300, 0x4dbf}, // Square Apaato .. + {0x4e00, 0xa48c}, // Cjk Unified Ideograph-4e..Yi Syllable Yyr + {0xa490, 0xa4c6}, // Yi Radical Qot ..Yi Radical Ke + {0xa960, 0xa97c}, // Hangul Choseong Tikeut-m..Hangul Choseong Ssangyeo + {0xac00, 0xd7a3}, // Hangul Syllable Ga ..Hangul Syllable Hih + {0xf900, 0xfaff}, // Cjk Compatibility Ideogr.. 
+ {0xfe10, 0xfe19}, // Presentation F||m F|| Ve..Presentation F||m F|| Ve + {0xfe30, 0xfe52}, // Presentation F||m F|| Ve..Small Full Stop + {0xfe54, 0xfe66}, // Small Semicolon ..Small Equals Sign + {0xfe68, 0xfe6b}, // Small Reverse Solidus ..Small Commercial At + {0xff01, 0xff60}, // Fullwidth Exclamation Ma..Fullwidth Right White Pa + {0xffe0, 0xffe6}, // Fullwidth Cent Sign ..Fullwidth Won Sign + {0x16fe0, 0x16fe0}, // (nil) .. + {0x17000, 0x187ec}, // (nil) .. + {0x18800, 0x18af2}, // (nil) .. + {0x1b000, 0x1b001}, // Katakana Letter Archaic ..Hiragana Letter Archaic + {0x1f004, 0x1f004}, // Mahjong Tile Red Dragon ..Mahjong Tile Red Dragon + {0x1f0cf, 0x1f0cf}, // Playing Card Black Joker..Playing Card Black Joker + {0x1f18e, 0x1f18e}, // Negative Squared Ab ..Negative Squared Ab + {0x1f191, 0x1f19a}, // Squared Cl ..Squared Vs + {0x1f200, 0x1f202}, // Square Hiragana Hoka ..Squared Katakana Sa + {0x1f210, 0x1f23b}, // Squared Cjk Unified Ideo.. + {0x1f240, 0x1f248}, // T||toise Shell Bracketed..T||toise Shell Bracketed + {0x1f250, 0x1f251}, // Circled Ideograph Advant..Circled Ideograph Accept + {0x1f300, 0x1f320}, // Cyclone ..Shooting Star + {0x1f32d, 0x1f335}, // Hot Dog ..Cactus + {0x1f337, 0x1f37c}, // Tulip ..Baby Bottle + {0x1f37e, 0x1f393}, // Bottle With Popping C||k..Graduation Cap + {0x1f3a0, 0x1f3ca}, // Carousel H||se ..Swimmer + {0x1f3cf, 0x1f3d3}, // Cricket Bat And Ball ..Table Tennis Paddle And + {0x1f3e0, 0x1f3f0}, // House Building ..European Castle + {0x1f3f4, 0x1f3f4}, // Waving Black Flag ..Waving Black Flag + {0x1f3f8, 0x1f43e}, // Badminton Racquet And Sh..Paw Prints + {0x1f440, 0x1f440}, // Eyes ..Eyes + {0x1f442, 0x1f4fc}, // Ear ..Videocassette + {0x1f4ff, 0x1f53d}, // Prayer Beads ..Down-pointing Small Red + {0x1f54b, 0x1f54e}, // Kaaba ..Men||ah With Nine Branch + {0x1f550, 0x1f567}, // Clock Face One Oclock ..Clock Face Twelve-thirty + {0x1f57a, 0x1f57a}, // (nil) .. 
+ {0x1f595, 0x1f596}, // Reversed Hand With Middl..Raised Hand With Part Be + {0x1f5a4, 0x1f5a4}, // (nil) .. + {0x1f5fb, 0x1f64f}, // Mount Fuji ..Person With Folded Hands + {0x1f680, 0x1f6c5}, // Rocket ..Left Luggage + {0x1f6cc, 0x1f6cc}, // Sleeping Accommodation ..Sleeping Accommodation + {0x1f6d0, 0x1f6d2}, // Place Of W||ship .. + {0x1f6eb, 0x1f6ec}, // Airplane Departure ..Airplane Arriving + {0x1f6f4, 0x1f6f6}, // (nil) .. + {0x1f910, 0x1f91e}, // Zipper-mouth Face .. + {0x1f920, 0x1f927}, // (nil) .. + {0x1f930, 0x1f930}, // (nil) .. + {0x1f933, 0x1f93e}, // (nil) .. + {0x1f940, 0x1f94b}, // (nil) .. + {0x1f950, 0x1f95e}, // (nil) .. + {0x1f980, 0x1f991}, // Crab .. + {0x1f9c0, 0x1f9c0}, // Cheese Wedge ..Cheese Wedge + {0x20000, 0x2fffd}, // Cjk Unified Ideograph-20.. + {0x30000, 0x3fffd}, // (nil) .. +}; + + /* test for 8-bit control characters */ + if (ucs == 0) + return 0; + if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0)) + return -1; + + /* Fast return for latin. */ + if (ucs < 0x300) return 1; + + /* Binary search in table of non-spacing characters. */ + if (bisearch(ucs, combining, + sizeof(combining) / sizeof(struct interval) - 1)) + return 0; + + /* Other zero-width characters not in table above. 
*/ + if ((0x200B <= ucs && ucs <= 0x200F) || + ucs == 0x2028 || + ucs == 0x2029 || + (0x202A <= ucs && ucs <= 0x202E) || + (0x2060 <= ucs && ucs <= 0x2063)) + return 0; + + /* If we arrive here, ucs is not a combining or C0/C1 control character */ + if (bisearch(ucs, WIDE_GLYPH, + sizeof(WIDE_GLYPH) / sizeof(struct interval) - 1)) + return 2; + return 1; +} + + +int mk_wcswidth(const wchar_t *pwcs, size_t n) +{ + int w, width = 0; + + for (;*pwcs && n-- > 0; pwcs++) + if ((w = mk_wcwidth(*pwcs)) < 0) + return -1; + else + width += w; + + return width; +} diff -Nru link-grammar-5.3.16/link-grammar/print/wcwidth.h link-grammar-5.5.0/link-grammar/print/wcwidth.h --- link-grammar-5.3.16/link-grammar/print/wcwidth.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/print/wcwidth.h 2017-05-28 11:14:43.000000000 +0000 @@ -0,0 +1,8 @@ +#ifndef _WCWIDTH_H +#define _WCWIDTH_H + +#include + +int mk_wcwidth(wchar_t); +int mk_wcswidth(const wchar_t *, size_t); +#endif /* _WCWIDTH_H */ diff -Nru link-grammar-5.3.16/link-grammar/print.c link-grammar-5.5.0/link-grammar/print.c --- link-grammar-5.3.16/link-grammar/print.c 2017-04-15 21:43:39.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/print.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1481 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright 2013, 2014 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - - -#include -#include - -#include "externs.h" -#include "api-structures.h" -#include "corpus/corpus.h" -#include "print-util.h" -#include "string-set.h" -#include "structures.h" -#include "print.h" /* needs structure.h */ -#include "word-utils.h" - -#define LEFT_WALL_SUPPRESS ("Wd") /* If this connector is used on the wall, */ - /* then suppress the display of the wall. */ -#define RIGHT_WALL_SUPPRESS ("RW")/* If this connector is used on the wall, */ - /* then suppress the display of the wall. */ - -#define HEAD_CHR ('h') /* Single char marking head-word. */ -#define DEPT_CHR ('d') /* Single char marking dependent word */ - -/** - * Find the position of the center of each word. - * Also find the offset of each word, relative to the previous one, - * needed to fully fit the names of the links between them. - * FIXME Long link names between more distant words may still not - * fit the space between these words. - * - * Return the number of characters needed for the all the words, - * including the space needed for the link names as described above. - */ -static size_t -set_centers(const Linkage linkage, int center[], int word_offset[], - bool print_word_0, int N_words_to_print) -{ - int i, tot; - size_t n; - int start_word = print_word_0 ? 
0 : 1; - int *link_len = alloca(linkage->num_words * sizeof(*link_len)); - size_t max_line_len = 0; /* Needed picture array line length */ - - memset(link_len, 0, linkage->num_words * sizeof(*link_len)); - - for (n = 0; n < linkage->num_links; n++) - { - Link *l = &linkage->link_array[n]; - - if ((l->lw + 1 == l->rw) && (NULL != l->link_name)) - { - link_len[l->rw] = strlen(l->link_name) + - (DEPT_CHR == l->rc->string[0]) + - (HEAD_CHR == l->rc->string[0]) + - (DEPT_CHR == l->lc->string[0]) + - (HEAD_CHR == l->lc->string[0]); - } - } - - tot = 0; - for (i = start_word; i < N_words_to_print; i++) - { - int len, center_t; - - /* Centers obtained by counting the characters column widths, - * not the bytes in the string. */ - len = utf8_strwidth(linkage->word[i]); - center_t = tot + (len/2); -#if 1 /* Long labels - disable in order to compare output with old versions. */ - if (i > start_word) - center[i] = MAX(center_t, center[i-1] + link_len[i] + 1); - else -#endif - center[i] = center_t; - word_offset[i] = center[i] - center_t; - tot += len+1 + word_offset[i]; - max_line_len += word_offset[i] + strlen(linkage->word[i]) + 1; - } - - return max_line_len; -} - -/* The following are all for generating postscript */ -typedef struct -{ - int N_rows; /* N_rows -- the number of rows */ - /* tells the height of the links above the sentence */ - int * link_heights; - /* the word beginning each row of the display */ - int * row_starts; -} ps_ctxt_t; - -/** - * Prints s then prints the last |t|-|s| characters of t. - * if s is longer than t, it truncates s. - * Handles utf8 strings correctly. 
- */ -static void left_append_string(String * string, const char * s, const char * t) -{ - size_t i; - size_t slen = utf8_strwidth(s); - size_t tlen = utf8_strwidth(t); - - for (i = 0; i < tlen; i++) - { - if (i < slen) - append_utf8_char(string, s); - else - append_utf8_char(string, t); - - s += utf8_next(s); - t += utf8_next(t); - } -} - -static void print_a_link(String * s, const Linkage linkage, LinkIdx link) -{ - WordIdx l, r; - const char *label, *llabel, *rlabel; - - l = linkage_get_link_lword(linkage, link); - r = linkage_get_link_rword(linkage, link); - label = linkage_get_link_label(linkage, link); - llabel = linkage_get_link_llabel(linkage, link); - rlabel = linkage_get_link_rlabel(linkage, link); - - if (l == 0) - { - left_append_string(s, LEFT_WALL_DISPLAY, " "); - } - else if (l == (linkage_get_num_words(linkage) - 1)) - { - left_append_string(s, RIGHT_WALL_DISPLAY, " "); - } - else - { - left_append_string(s, linkage_get_word(linkage, l), " "); - } - - /* XXX FIXME -- the space allocated to a link name should depend - * on the longest link-name in the sentence! */ - left_append_string(s, llabel, " "); - if (DEPT_CHR == llabel[0]) - append_string(s, " <---"); - else if (HEAD_CHR == llabel[0]) - append_string(s, " >---"); - else - append_string(s, " ----"); - - left_append_string(s, label, "-----"); - if (DEPT_CHR == rlabel[0]) - append_string(s, "-> "); - else if (HEAD_CHR == rlabel[0]) - append_string(s, "-< "); - else - append_string(s, "-- "); - left_append_string(s, rlabel, " "); - append_string(s, " %s\n", linkage_get_word(linkage, r)); -} - - -/** - * To the left of each link, print the sequence of domains it is in. - * Printing a domain means printing its type. - * Takes info from pp_link_array and pp and chosen_words. 
- */ -char * linkage_print_links_and_domains(const Linkage linkage) -{ - int link, longest, j; - int N_links = linkage_get_num_links(linkage); - String * s = string_new(); - char * links_string; - const char ** dname; - - longest = 0; - for (link=0; link longest) - longest = linkage_get_link_num_domains(linkage, link); - } - for (link=0; linkinfo; - Sense *sns; - size_t nwords; - WordIdx w; - - lg_corpus_linkage_senses(linkage); - - nwords = linkage->num_words; - for (w=0; wnum_words; - - /* Loop over each word in the sentence */ - for (w = 0; w < nwords; w++) - { - int pad = 21; - double cost; - char infword[MAX_WORD]; - Disjunct *disj = linkage->chosen_disjuncts[w]; - if (NULL == disj) continue; - - /* Cleanup the subscript mark before printing. */ - strncpy(infword, disj->string, MAX_WORD); - mark = strchr(infword, SUBSCRIPT_MARK); - if (mark) *mark = SUBSCRIPT_DOT; - - /* Make sure the glyphs align during printing. */ - pad += strlen(infword) - utf8_strwidth(infword); - - dj = linkage_get_disjunct_str(linkage, w); - if (NULL == dj) dj = ""; - cost = linkage_get_disjunct_cost(linkage, w); - -#ifdef USE_CORPUS - score = linkage_get_disjunct_corpus_score(linkage, w); - append_string(s, "%*s %5.3f %6.3f %s\n", pad, infword, cost, score, dj); -#else - append_string(s, "%*s %5.3f %s\n", pad, infword, cost, dj); -#endif - } - djs = string_copy(s); - string_delete(s); - return djs; -} - -/** - * postscript printing ... 
- */ -static char * -build_linkage_postscript_string(const Linkage linkage, - bool display_walls, ps_ctxt_t *pctx) -{ - int link, i,j; - int d; - bool print_word_0, print_word_N; - int N_links = linkage->num_links; - Link *ppla = linkage->link_array; - String * string; - char * ps_string; - int N_words_to_print; - - string = string_new(); - - if (!display_walls) { - int N_wall_connectors = 0; - bool suppressor_used = false; - for (j=0; jnum_words-1) continue; - N_wall_connectors ++; - if (strcmp(ppla[j].lc->string, LEFT_WALL_SUPPRESS)==0) { - suppressor_used = true; - } - } - } - print_word_0 = (((!suppressor_used) && (N_wall_connectors != 0)) - || (N_wall_connectors != 1)); - } - else print_word_0 = true; - - if (!display_walls) { - int N_wall_connectors = 0; - bool suppressor_used = false; - for (j=0; jnum_words-1) { - N_wall_connectors ++; - if (strcmp(ppla[j].lc->string, RIGHT_WALL_SUPPRESS)==0){ - suppressor_used = true; - } - } - } - print_word_N = (((!suppressor_used) && (N_wall_connectors != 0)) - || (N_wall_connectors != 1)); - } - else print_word_N = true; - - if (print_word_0) d=0; else d=1; - - i = 0; - N_words_to_print = linkage->num_words; - if (!print_word_N) N_words_to_print--; - - append_string(string, "["); - for (j=d; j0)) append_string(string, "\n"); - i++; - append_string(string, "(%s)", linkage->word[j]); - } - append_string(string,"]"); - append_string(string,"\n"); - - append_string(string,"["); - j = 0; - for (link=0; linknum_words-1)) continue; - // if (ppla[link]->lw == SIZE_MAX) continue; - assert (ppla[link].lw != SIZE_MAX); - if ((j%7 == 0) && (j>0)) append_string(string,"\n"); - j++; - append_string(string,"[%zu %zu %d", - ppla[link].lw - d, ppla[link].rw - d, - pctx->link_heights[link]); - append_string(string," (%s)]", ppla[link].link_name); - } - append_string(string,"]"); - append_string(string,"\n"); - append_string(string,"["); - for (j=0; j < pctx->N_rows; j++ ) - { - if (j>0) append_string(string, " %d", pctx->row_starts[j]); 
- else append_string(string,"%d", pctx->row_starts[j]); - } - append_string(string,"]\n"); - - ps_string = string_copy(string); - string_delete(string); - return ps_string; -} - - -#define MAX_HEIGHT 30 - -/** - * Print the indicated linkage into a utf8-diagram. - * Works fine for general utf8 multi-byte sentences. - * Links and connectors are still mostly assumed to be ASCII, though; - * to fix this, grep for "strlen" in the code below, replace by utf8 len. - * - * Returned string is allocated with exalloc. - * Needs to be freed with linkage_free_diagram() - */ -static char * -linkage_print_diagram_ctxt(const Linkage linkage, - bool display_walls, - size_t x_screen_width, - ps_ctxt_t *pctx) -{ - bool display_short = true; - unsigned int i, j, k, cl, cr, inc, row, top_row, top_row_p1; - const char *s; - char *t; - bool print_word_0 , print_word_N; - int *center = alloca((linkage->num_words+1)*sizeof(int)); - int *word_offset = alloca((linkage->num_words+1) * sizeof(*word_offset)); - unsigned int line_len, link_length; - unsigned int N_links = linkage->num_links; - Link *ppla = linkage->link_array; - String * string; - char * gr_string; - unsigned int N_words_to_print; - - char picture[MAX_HEIGHT][MAX_LINE]; - char xpicture[MAX_HEIGHT][MAX_LINE]; - size_t start[MAX_HEIGHT]; - - // Avoid pathological case and the resulting crash. - if (0 == linkage->num_words) return strdup(""); - - string = string_new(); - - /* Do we want to print the left wall? */ - if (!display_walls) - { - int N_wall_connectors = 0; - bool suppressor_used = false; - for (j=0; jnum_words-1) continue; - N_wall_connectors ++; - if (0 == strcmp(ppla[j].lc->string, LEFT_WALL_SUPPRESS)) - { - suppressor_used = true; - } - } - } - print_word_0 = (((!suppressor_used) && (N_wall_connectors != 0)) - || (N_wall_connectors != 1)); - } - else print_word_0 = true; - - /* Do we want to print the right wall? 
*/ - if (!display_walls) - { - int N_wall_connectors = 0; - bool suppressor_used = false; - for (j=0; jnum_words-1) - { - N_wall_connectors ++; - if (0 == strcmp(ppla[j].lc->string, RIGHT_WALL_SUPPRESS)) - { - suppressor_used = true; - } - } - } - print_word_N = (((!suppressor_used) && (N_wall_connectors != 0)) - || (N_wall_connectors != 1)); - } - else print_word_N = true; - - N_words_to_print = linkage->num_words; - if (!print_word_N) N_words_to_print--; - - if (set_centers(linkage, center, word_offset, print_word_0, N_words_to_print) - + 1 > MAX_LINE) - { - append_string(string, "The diagram is too long.\n"); - gr_string = string_copy(string); - string_delete(string); - return gr_string; - } - - line_len = center[N_words_to_print-1]+1; - - for (k=0; knum_words-1)) continue; - - /* Put it into the lowest position */ - cl = center[ppla[j].lw]; - cr = center[ppla[j].rw]; - for (row=0; row < MAX_HEIGHT; row++) - { - for (k=cl+1; klink_heights[j] = row; - - if (2*row+2 > MAX_HEIGHT-1) { - append_string(string, "The diagram is too high.\n"); - gr_string = string_copy(string); - string_delete(string); - return gr_string; - } - if (row > top_row) top_row = row; - - picture[row][cl] = '+'; - picture[row][cr] = '+'; - for (k=cl+1; klc->string[0]) { *(t-1) = '<'; } - if (DEPT_CHR == ppla[j].lc->string[0] && - (t > &picture[row][cl])) { picture[row][cl+1] = '<'; } - if (HEAD_CHR == ppla[j].lc->string[0]) { *(t-1) = '>'; } - - /* Copy connector name; stop short if no room */ - while ((*s != '\0') && (*t == '-')) *t++ = *s++; - - /* Add direction indicator */ - // if (DEPT_CHR == ppla[j]->rc->string[0]) { *t = '>'; } - if (DEPT_CHR == ppla[j].rc->string[0]) { picture[row][cr-1] = '>'; } - if (HEAD_CHR == ppla[j].rc->string[0]) { *t = '<'; } - - /* The direction indicators may have clobbered these. 
*/ - picture[row][cl] = '+'; - picture[row][cr] = '+'; - - /* Now put in the | below this one, where needed */ - for (k=0; kword[k]; - i = 0; - while (*s != '\0') { - *t++ = *s++; - i++; - } - *t++ = ' '; - } - *t = '\0'; - - /* If display_short is NOT true, then the linkage diagram is printed - * in the "tall" style, with an extra row of vertical descenders - * between each level. */ - if (display_short) { - for (k=0; picture[0][k] != '\0'; k++) { - if ((picture[0][k] == '+') || (picture[0][k] == '|')) { - xpicture[1][k] = '|'; - } else { - xpicture[1][k] = ' '; - } - } - xpicture[1][k] = '\0'; - for (row=0; row < top_row+1; row++) { - strcpy(xpicture[row+2], picture[row]); - } - top_row += 2; - } else { - for (row=0; row < top_row+1; row++) { - strcpy(xpicture[2*row+2],picture[row]); - for (k=0; picture[row][k] != '\0'; k++) { - if ((picture[row][k] == '+') || (picture[row][k] == '|')) { - xpicture[2*row+1][k] = '|'; - } else { - xpicture[2*row+1][k] = ' '; - } - } - xpicture[2*row+1][k] = '\0'; - } - top_row = 2*top_row + 2; - } - - /* We've built the picture, now print it out. */ - - if (print_word_0) i = 0; else i = 1; - - /* Start locations, for each row. These may vary, due to different - * utf8 character widths. */ - top_row_p1 = top_row + 1; - for (row = 0; row < top_row_p1; row++) - start[row] = 0; - pctx->N_rows = 0; - pctx->row_starts[pctx->N_rows] = 0; - pctx->N_rows++; - while (i < N_words_to_print) - { - unsigned int revrs; - /* Count the column-widths of the words, - * up to the max screen width. 
*/ - unsigned int uwidth = 0; - do { - uwidth += word_offset[i] + utf8_strwidth(linkage->word[i]) + 1; - i++; - } while ((i < N_words_to_print) && - (uwidth + word_offset[i] + utf8_strwidth(linkage->word[i]) + 1 < - x_screen_width)); - - pctx->row_starts[pctx->N_rows] = i - (!print_word_0); /* PS junk */ - if (i < N_words_to_print) pctx->N_rows++; /* same */ - - append_string(string, "\n"); - top_row_p1 = top_row + 1; - for (revrs = 0; revrs < top_row_p1; revrs++) - { - /* print each row of the picture */ - /* 'blank' is used solely to detect blank lines */ - unsigned int mbcnt = 0; - bool blank = true; - - row = top_row - revrs; - k = start[row]; - for (j = k; (mbcnt < uwidth) && (xpicture[row][j] != '\0'); ) - { - size_t n = utf8_next(&xpicture[row][j]); - blank = blank && (xpicture[row][j] == ' '); - j += n; - mbcnt ++; - } - start[row] = j; - - if (!blank) - { - mbcnt = 0; - for (j = k; (mbcnt < uwidth) && (xpicture[row][j] != '\0'); ) - { - /* Copy exactly one multi-byte character to buf */ - j += append_utf8_char(string, &xpicture[row][j]); - mbcnt ++; - } - append_string(string, "\n"); - } - } - append_string(string, "\n"); - } - gr_string = string_copy(string); - string_delete(string); - return gr_string; -} - -/** - * Print the indicated linkage as utf8-art intp the given string. - * The width of the diagram is given by the terminal width, taken - * from the parse options. 
- * - * The returned string is malloced, and needs to be freed with - * linkage_free_diagram() - */ -char * linkage_print_diagram(const Linkage linkage, bool display_walls, size_t screen_width) -{ - ps_ctxt_t ctx; - if (!linkage) return NULL; - - ctx.link_heights = (int *) alloca(linkage->num_links * sizeof(int)); - ctx.row_starts = (int *) alloca((linkage->num_words + 1) * sizeof(int)); - return linkage_print_diagram_ctxt(linkage, display_walls, screen_width, &ctx); -} - -void linkage_free_diagram(char * s) -{ - exfree(s, strlen(s)+1); -} - -void linkage_free_disjuncts(char * s) -{ - exfree(s, strlen(s)+1); -} - -void linkage_free_links_and_domains(char * s) -{ - exfree(s, strlen(s)+1); -} - -void linkage_free_senses(char * s) -{ - exfree(s, strlen(s)+1); -} - -/* Forward declarations, the gunk is at the bottom. */ -static const char * trailer(bool print_ps_header); -static const char * header(bool print_ps_header); - -char * linkage_print_postscript(const Linkage linkage, bool display_walls, bool print_ps_header) -{ - char * ps, * qs, * ascii; - int size; - - /* call the ascii printer to initialize the row size stuff. 
*/ - ps_ctxt_t ctx; - ctx.link_heights = (int *) alloca(linkage->num_links * sizeof(int)); - ctx.row_starts = (int *) alloca((linkage->num_words + 1) * sizeof(int)); - ascii = linkage_print_diagram_ctxt(linkage, display_walls, 8000, &ctx); - linkage_free_diagram(ascii); - - ps = build_linkage_postscript_string(linkage, display_walls, &ctx); - size = strlen(header(print_ps_header)) + strlen(ps) + strlen(trailer(print_ps_header)) + 1; - - qs = (char *) exalloc(sizeof(char)*size); - snprintf(qs, size, "%s%s%s", header(print_ps_header), ps, trailer(print_ps_header)); - exfree(ps, strlen(ps)+1); - - return qs; -} - -void linkage_free_postscript(char * s) -{ - exfree(s, strlen(s)+1); -} - -char * linkage_print_pp_msgs(Linkage linkage) -{ - if (linkage && linkage->lifo.pp_violation_msg) - return strdup(linkage->lifo.pp_violation_msg); - return strdup(""); -} - -void linkage_free_pp_msgs(char * s) -{ - exfree(s, strlen(s)+1); -} - -void print_disjunct_counts(Sentence sent) -{ - size_t i; - int c; - Disjunct *d; - for (i=0; ilength; i++) { - c = 0; - for (d=sent->word[i].d; d != NULL; d = d->next) { - c++; - } - /* XXX alternatives[0] is not really correct, here .. */ - printf("%s(%d) ",sent->word[i].alternatives[0], c); - } - printf("\n\n"); -} - -void print_expression_sizes(Sentence sent) -{ - X_node * x; - size_t w, size; - for (w=0; wlength; w++) { - size = 0; - for (x=sent->word[w].x; x!=NULL; x = x->next) { - size += size_of_expression(x->exp); - } - /* XXX alternatives[0] is not really correct, here .. 
*/ - printf("%s[%zu] ",sent->word[w].alternatives[0], size); - } - printf("\n\n"); -} - -static const char * trailer(bool print_ps_header) -{ - static const char * trailer_string= - "diagram\n" - "\n" - "%%EndDocument\n" - ; - - if (print_ps_header) return trailer_string; - else return ""; -} - -static const char * header(bool print_ps_header) -{ - static const char * header_string= - "%!PS-Adobe-2.0 EPSF-1.2\n" - "%%Pages: 1\n" - "%%BoundingBox: 0 -20 500 200\n" - "%%EndComments\n" - "%%BeginDocument: \n" - "\n" - "% compute size of diagram by adding\n" - "% #rows x 8.5\n" - "% (#rows -1) x 10\n" - "% \\sum maxheight x 10\n" - "/nulllink () def % The symbol of a null link\n" - "/wordfontsize 11 def % the size of the word font\n" - "/labelfontsize 9 def % the size of the connector label font\n" - "/ex 10 def % the horizontal radius of all the links\n" - "/ey 10 def % the height of the level 0 links\n" - "/ed 10 def % amount to add to this height per level\n" - "/radius 10 def % radius for rounded arcs\n" - "/row-spacing 10 def % the space between successive rows of the diagram\n" - "\n" - "/gap wordfontsize .5 mul def % the gap between words\n" - "/top-of-words wordfontsize .85 mul def\n" - " % the delta y above where the text is written where\n" - " % the major axis of the ellipse is located\n" - "/label-gap labelfontsize .1 mul def\n" - "\n" - "/xwordfontsize 10 def % the size of the word font\n" - "/xlabelfontsize 10 def % the size of the connector label font\n" - "/xex 10 def % the horizontal radius of all the links\n" - "/xey 10 def % the height of the level 0 links\n" - "/xed 10 def % amount to add to this height per level\n" - "/xradius 10 def % radius for rounded arcs\n" - "/xrow-spacing 10 def % the space between successive rows of the diagram\n" - "/xgap wordfontsize .5 mul def % the gap between words\n" - "\n" - "/centerpage 6.5 72 mul 2 div def\n" - " % this number of points from the left margin is the center of page\n" - "\n" - "/rightpage 6.5 72 mul 
def\n" - " % number of points from the left margin is the right margin\n" - "\n" - "/show-string-centered-dict 5 dict def\n" - "\n" - "/show-string-centered {\n" - " show-string-centered-dict begin\n" - " /string exch def\n" - " /ycenter exch def\n" - " /xcenter exch def\n" - " xcenter string stringwidth pop 2 div sub\n" - " ycenter labelfontsize .3 mul sub\n" - " moveto\n" - " string show\n" - " end\n" - "} def\n" - "\n" - "/clear-word-box {\n" - " show-string-centered-dict begin\n" - " /string exch def\n" - " /ycenter exch def\n" - " /xcenter exch def\n" - " newpath\n" - " /urx string stringwidth pop 2 div def\n" - " /ury labelfontsize .3 mul def\n" - " xcenter urx sub ycenter ury sub moveto\n" - " xcenter urx add ycenter ury sub lineto\n" - " xcenter urx add ycenter ury add lineto\n" - " xcenter urx sub ycenter ury add lineto\n" - " closepath\n" - " 1 setgray fill\n" - " 0 setgray\n" - " end\n" - "} def\n" - "\n" - "/diagram-sentence-dict 20 dict def\n" - "\n" - "/diagram-sentence-circle\n" - "{diagram-sentence-dict begin \n" - " /links exch def\n" - " /words exch def\n" - " /n words length def\n" - " /Times-Roman findfont wordfontsize scalefont setfont\n" - " /x 0 def\n" - " /y 0 def\n" - "\n" - " /left-ends [x dup words {stringwidth pop add gap add dup}\n" - " forall pop pop] def\n" - " /right-ends [x words {stringwidth pop add dup gap add} forall pop] def\n" - " /centers [0 1 n 1 sub {/i exch def\n" - " left-ends i get\n" - " right-ends i get\n" - " add 2 div\n" - " } for ] def\n" - "\n" - " x y moveto\n" - " words {show gap 0 rmoveto} forall\n" - "\n" - " .5 setlinewidth \n" - "\n" - " links {dup 0 get /leftword exch def\n" - " dup 1 get /rightword exch def\n" - " dup 2 get /level exch def\n" - " 3 get /string exch def\n" - " newpath\n" - " string nulllink eq {[2] 1 setdash}{[] 0 setdash} ifelse\n" - "% string nulllink eq {.8 setgray}{0 setgray} ifelse\n" - " centers leftword get\n" - " y top-of-words add\n" - " moveto\n" - " \n" - " centers rightword get\n" 
- " centers leftword get\n" - " sub 2 div dup\n" - " radius \n" - " lt {/radiusx exch def}{pop /radiusx radius def} ifelse\n" - " \n" - " \n" - " \n" - " centers leftword get\n" - " y top-of-words add ey ed level mul add add\n" - " centers rightword get\n" - " y top-of-words add ey ed level mul add add\n" - " radiusx\n" - " arcto\n" - " 4 {pop} repeat\n" - " centers rightword get\n" - " y top-of-words add ey ed level mul add add\n" - " centers rightword get\n" - " y top-of-words add\n" - " radiusx\n" - " arcto\n" - " 4 {pop} repeat\n" - " centers rightword get\n" - " y top-of-words add\n" - " lineto\n" - "\n" - " stroke\n" - "\n" - " /radius-y ey ed level mul add def\n" - "\n" - " /center-arc-x\n" - " centers leftword get centers rightword get add 2 div\n" - " def\n" - " \n" - " /center-arc-y\n" - " y top-of-words radius-y add add\n" - " def\n" - "\n" - " /Courier-Bold findfont labelfontsize scalefont setfont \n" - " center-arc-x center-arc-y string clear-word-box\n" - " center-arc-x center-arc-y string show-string-centered\n" - " } forall\n" - " end\n" - " } def\n" - "\n" - "/diagramdict 20 dict def\n" - "\n" - "/diagram\n" - "{diagramdict begin\n" - " /break-words exch def\n" - " /links exch def\n" - " /words exch def\n" - " /n words length def\n" - " /n-rows break-words length def\n" - " /Times-Roman findfont wordfontsize scalefont setfont\n" - "\n" - " /left-ends [0 dup words {stringwidth pop add gap add dup}\n" - " forall pop pop] def\n" - " /right-ends [0 words {stringwidth pop add dup gap add} forall pop] def\n" - "\n" - " /lwindows [ break-words {left-ends exch get gap 2 div sub } forall ] def\n" - " /rwindows [1 1 n-rows 1 sub {/i exch def\n" - " lwindows i get } for\n" - " right-ends n 1 sub get gap 2 div add\n" - " ] def\n" - "\n" - "\n" - " /max 0 def\n" - " 0 1 links length 1 sub {\n" - " /i exch def\n" - " /t links i get 2 get def\n" - " t max gt {/max t def} if\n" - " } for\n" - "\n" - " /max-height ed max mul ey add top-of-words add row-spacing add 
def\n" - " /total-height n-rows max-height mul row-spacing sub def\n" - "\n" - " /max-width 0 def % compute the widest window\n" - " 0 1 n-rows 1 sub {\n" - " /i exch def\n" - " /t rwindows i get lwindows i get sub def\n" - " t max-width gt {/max-width t def} if\n" - " } for\n" - "\n" - " centerpage max-width 2 div sub 0 translate % centers it\n" - " % rightpage max-width sub 0 translate % right justified\n" - " % Delete both of these to make it left justified\n" - "\n" - " n-rows 1 sub -1 0\n" - " {/i exch def\n" - " gsave\n" - " newpath\n" - " %/centering centerpage rwindows i get lwindows i get sub 2 div sub def\n" - " % this line causes each row to be centered\n" - " /centering 0 def\n" - " % set centering to 0 to prevent centering of each row \n" - "\n" - " centering -100 moveto % -100 because some letters go below zero\n" - " centering max-height n-rows mul lineto\n" - " rwindows i get lwindows i get sub centering add\n" - " max-height n-rows mul lineto\n" - " rwindows i get lwindows i get sub centering add\n" - " -100 lineto\n" - " closepath\n" - " clip\n" - " lwindows i get neg n-rows i sub 1 sub max-height mul translate\n" - " centerpage centering 0 translate\n" - " words links diagram-sentence-circle\n" - " grestore\n" - " } for\n" - " end\n" - "} def \n" - "\n" - "/diagramx\n" - "{diagramdict begin\n" - " /break-words exch def\n" - " /links exch def\n" - " /words exch def\n" - " /n words length def\n" - " /n-rows break-words length def\n" - " /Times-Roman findfont xwordfontsize scalefont setfont\n" - "\n" - " /left-ends [0 dup words {stringwidth pop add gap add dup}\n" - " forall pop pop] def\n" - " /right-ends [0 words {stringwidth pop add dup gap add} forall pop] def\n" - "\n" - " /lwindows [ break-words {left-ends exch get gap 2 div sub } forall ] def\n" - " /rwindows [1 1 n-rows 1 sub {/i exch def\n" - " lwindows i get } for\n" - " right-ends n 1 sub get xgap 2 div add\n" - " ] def\n" - "\n" - "\n" - " /max 0 def\n" - " 0 1 links length 1 sub {\n" - 
" /i exch def\n" - " /t links i get 2 get def\n" - " t max gt {/max t def} if\n" - " } for\n" - "\n" - " /max-height xed max mul xey add top-of-words add xrow-spacing add def\n" - " /total-height n-rows max-height mul xrow-spacing sub def\n" - "\n" - " /max-width 0 def % compute the widest window\n" - " 0 1 n-rows 1 sub {\n" - " /i exch def\n" - " /t rwindows i get lwindows i get sub def\n" - " t max-width gt {/max-width t def} if\n" - " } for\n" - "\n" - " centerpage max-width 2 div sub 0 translate % centers it\n" - " % rightpage max-width sub 0 translate % right justified\n" - " % Delete both of these to make it left justified\n" - "\n" - " n-rows 1 sub -1 0\n" - " {/i exch def\n" - " gsave\n" - " newpath\n" - " %/centering centerpage rwindows i get lwindows i get sub 2 div sub def\n" - " % this line causes each row to be centered\n" - " /centering 0 def\n" - " % set centering to 0 to prevent centering of each row \n" - "\n" - " centering -100 moveto % -100 because some letters go below zero\n" - " centering max-height n-rows mul lineto\n" - " rwindows i get lwindows i get sub centering add\n" - " max-height n-rows mul lineto\n" - " rwindows i get lwindows i get sub centering add\n" - " -100 lineto\n" - " closepath\n" - " clip\n" - " lwindows i get neg n-rows i sub 1 sub max-height mul translate\n" - " centerpage centering 0 translate\n" - " words links diagram-sentence-circle\n" - " grestore\n" - " } for\n" - " end\n" - "} def \n" - "\n" - "/ldiagram\n" - "{diagramdict begin\n" - " /break-words exch def\n" - " /links exch def\n" - " /words exch def\n" - " /n words length def\n" - " /n-rows break-words length def\n" - " /Times-Roman findfont wordfontsize scalefont setfont\n" - "\n" - " /left-ends [0 dup words {stringwidth pop add gap add dup}\n" - " forall pop pop] def\n" - " /right-ends [0 words {stringwidth pop add dup gap add} forall pop] def\n" - "\n" - " /lwindows [ break-words {left-ends exch get gap 2 div sub } forall ] def\n" - " /rwindows [1 1 n-rows 1 
sub {/i exch def\n" - " lwindows i get } for\n" - " right-ends n 1 sub get gap 2 div add\n" - " ] def\n" - "\n" - "\n" - " /max 0 def\n" - " 0 1 links length 1 sub {\n" - " /i exch def\n" - " /t links i get 2 get def\n" - " t max gt {/max t def} if\n" - " } for\n" - "\n" - " /max-height ed max mul ey add top-of-words add row-spacing add def\n" - " /total-height n-rows max-height mul row-spacing sub def\n" - "\n" - " /max-width 0 def % compute the widest window\n" - " 0 1 n-rows 1 sub {\n" - " /i exch def\n" - " /t rwindows i get lwindows i get sub def\n" - " t max-width gt {/max-width t def} if\n" - " } for\n" - "\n" - " % centerpage max-width 2 div sub 0 translate % centers it\n" - " % rightpage max-width sub 0 translate % right justified\n" - " % Delete both of these to make it left justified\n" - "\n" - " n-rows 1 sub -1 0\n" - " {/i exch def\n" - " gsave\n" - " newpath\n" - " %/centering centerpage rwindows i get lwindows i get sub 2 div sub def\n" - " % this line causes each row to be centered\n" - " /centering 0 def\n" - " % set centering to 0 to prevent centering of each row \n" - "\n" - " centering -100 moveto % -100 because some letters go below zero\n" - " centering max-height n-rows mul lineto\n" - " rwindows i get lwindows i get sub centering add\n" - " max-height n-rows mul lineto\n" - " rwindows i get lwindows i get sub centering add\n" - " -100 lineto\n" - " closepath\n" - " clip\n" - " lwindows i get neg n-rows i sub 1 sub max-height mul translate\n" - " centerpage centering 0 translate\n" - " words links diagram-sentence-circle\n" - " grestore\n" - " } for\n" - " end\n" - "} def \n" - ; - if (print_ps_header) return header_string; - else return ""; -} - -/** - * Print elements of the 2D-word-array produced for the parsers. - * - * - print_sentence_word_alternatives(sent, false, NULL, tokenpos) - * If a pointer to struct "tokenpos" is given, return through it the index of - * the first occurrence in the sentence of the given token. 
This is used to - * prevent duplicate information display for repeated morphemes (if there are - * multiples splits, each of several morphemes, otherwise some of them may - * repeat). - * - * - print_sentence_word_alternatives(sent, true, NULL, NULL) - * If debugprint is "true", this is a debug printout of the sentence. (The - * debug printouts are with level 0 because this function is invoked for debug - * on certain positive level.) - * - * - * - print_sentence_word_alternatives(sent, false, display_func, NULL) - * Iterate over the sentence words and their alternatives. Handle each - * alternative using the display_func function if it is supplied, or else (if it - * is NULL) just print them. It is used to display disjunct information when - * command !!word is used. - * FIXME In the current version (using Wordgraph) the "alternatives" in the - * word-array don't necessarily consist of real word alternatives. - * - */ - -struct tokenpos /* First position of the given token - to prevent duplicates */ -{ - const char * token; - size_t wi; - size_t ai; -}; - -void print_sentence_word_alternatives(Sentence sent, bool debugprint, - void (*display)(Dictionary, const char *), struct tokenpos * tokenpos) -{ - size_t wi; /* Internal sentence word index */ - size_t ai; /* Index of a word alternative */ - size_t sentlen = sent->length; /* Shortened if there is a right-wall */ - size_t first_sentence_word = 0; /* Used for skipping a left-wall */ - bool word_split = false; /* !!word got split */ - Dictionary dict = sent->dict; - - if (0 == sentlen) - { - /* It should not happen, but if it actually happens due to some - * strange conditions, it's better not to abort the program. 
*/ - prt_error("Error: Sentence length is 0 (reason unknown)\n"); - return; - } - - if (debugprint) lgdebug(+0, "\n\\"); - else if (NULL != tokenpos) - ; /* Do nothing */ - else - { - /* For analyzing words we need to ignore the left/right walls */ - if (dict->left_wall_defined && - (0 == strcmp(sent->word[0].unsplit_word, LEFT_WALL_WORD))) - first_sentence_word = 1; - if (dict->right_wall_defined && - ((NULL != sent->word[sentlen-1].unsplit_word)) && - (0 == strcmp(sent->word[sentlen-1].unsplit_word, RIGHT_WALL_WORD))) - sentlen--; - - /* Find if a word got split. This is indicated by: - * 1. More than one word in the sentence - * (no check if it actually results from !!w1 w2 ...). - * 2. A word containing more than one alternative. */ - if (sentlen - first_sentence_word > 1) - { - word_split = true; - } - else - { - for (wi=first_sentence_word; wiword[wi]; - - /* There should always be at least one alternative */ - assert((NULL != w.alternatives) && (NULL != w.alternatives[0]) && - ('\0' != w.alternatives[0][0]), "Missing alt for word %zu", wi); - - if (NULL != w.alternatives[1]) - { - word_split = true; - break; - } - } - } - /* "String", because it can be a word, morpheme, or (TODO) idiom */ - if (word_split && (NULL == display)) printf("String splits to:\n"); - /* We used to print the alternatives of the word here, one per line. - * In the current (Wordgraph) version, the alternatives may look - * like nonsense combination of tokens - not as the strict split - * possibilities of words as in previous versions. - * E.g.: For Hebrew word "הכלב", we now get these "alternatives": - * ה= כלב לב ב= - * ה= כ= ל= - * ה= כ= - * For "'50s," - * ' s s , - * '50 50 , - * '50s - * Clearly, this is not informative any more. Instead, one line with a - * list of tokens (without repetitions) is printed - * ה= כלב לב ב= כ= ל= - * - * FIXME Print the alternatives from the wordgraph. 
- */ - } - - /* Iterate over sentence input words */ - for (wi=first_sentence_word; wiword[wi]; - size_t w_start = wi; /* input word index */ - size_t max_nalt = 0; - -#if 0 /* In the Wordgraph version firstupper and post_quote don't exist. */ - if (debugprint) lgdebug(0, " word%d %c%c: %s\n ", - wi, w.firstupper ? 'C' : ' ', sent->post_quote[wi] ? 'Q' : ' ', -#endif - if (debugprint) lgdebug(0, " word%zu: %s\n\\", wi, w.unsplit_word); - - /* There should always be at least one alternative */ - assert((NULL != w.alternatives) && (NULL != w.alternatives[0]) && - ('\0' != w.alternatives[0][0]), "Missing alt for word %zu", wi); - - //err_msg(lg_Debug, "word%zu '%s' nalts %zu\n", - // wi, sent->word[wi].unsplit_word, altlen(sent->word[wi].alternatives)); - - for (wi = w_start; (wi == w_start) || - ((wi < sentlen) && (! sent->word[wi].unsplit_word)); wi++) - { - size_t nalt = altlen(sent->word[wi].alternatives); - - max_nalt = MAX(max_nalt, nalt); - } - - /* Iterate over alternatives */ - for (ai=0; ai < max_nalt; ai++) - { - if (debugprint) - { - if (0 < ai) lgdebug(0, "\n "); - lgdebug(0, " alt%zu:", ai); - } - - for (wi = w_start; (wi == w_start) || - ((wi < sentlen) && (! 
sent->word[wi].unsplit_word)); wi++) - { - size_t nalts = altlen(sent->word[wi].alternatives); - const char *wt; - const char *st = NULL; - char *wprint = NULL; - - if (ai >= nalts) continue; - wt = sent->word[wi].alternatives[ai]; - - /* Don't display information again for the same word */ - if ((NULL != tokenpos) && (0 == strcmp(tokenpos->token, wt))) - { - tokenpos->wi = wi; - tokenpos->ai = ai; - return; - } - if (!debugprint) - { - struct tokenpos firstpos = { wt }; - - print_sentence_word_alternatives(sent, false, NULL, &firstpos); - if (((firstpos.wi != wi) || (firstpos.ai != ai)) && - firstpos.wi >= first_sentence_word) // allow !!LEFT_WORD - { - /* We encountered this token earlier */ - if (NULL != display) - lgdebug(6, "Skipping repeated %s\n", wt); - continue; - } - } - - /* Restore SUBSCRIPT_DOT for printing */ - st = strrchr(wt, SUBSCRIPT_MARK); - if (st) - { - wprint = malloc(strlen(wt)+1); - strcpy(wprint, wt); - wprint[st-wt] = SUBSCRIPT_DOT; - wt = wprint; - } - - if (debugprint) - { - const char *opt_start = "", *opt_end = ""; - if (sent->word[wi].optional) - { - opt_start = "{"; - opt_end = "}"; - } - lgdebug(0, " %s%s%s", opt_start, wt, opt_end); - } - - /* Don't try to give info on the empty word. */ - if ('\0' != wt[0]) - { - /* For now each word component is called "Token". - * TODO: Its type can be decoded and a more precise - * term (stem, prefix, etc.) can be used. - * Display the features of the token */ - if ((NULL == tokenpos) && (NULL != display)) - { - printf("Token \"%s\" ", wt); - display(sent->dict, wt); - printf("\n"); - } - else if (word_split) printf(" %s", wt); - } - free(wprint); /* wprint is NULL if not allocated */ - } - - /* Commented out - no alternatives for now - print as one line. 
*/ - //if (word_split && (NULL == display)) printf("\n"); - } - wi--; - if (debugprint) lgdebug(0, "\n\\"); - } - if (debugprint) lgdebug(0, "\n"); - else if (word_split) printf("\n\n"); -} - -/** - * Print a word, converting SUBSCRIPT_MARK to SUBSCRIPT_DOT. - */ -void print_with_subscript_dot(const char *s) -{ - const char *mark = strchr(s, SUBSCRIPT_MARK); - size_t len = NULL != mark ? (size_t)(mark - s) : strlen(s); - - prt_error("%.*s%s%s ", (int)len, - s, NULL != mark ? "." : "", NULL != mark ? mark+1 : ""); -} - -/** - * Print linkage wordgraph path. - */ -void print_lwg_path(Gword **w) -{ - lgdebug(+0, " "); - for (; *w; w++) lgdebug(0, "%s ", (*w)->subword); - lgdebug(0, "\n"); -} - -#define D_WPP 8 -void print_wordgraph_pathpos(const Wordgraph_pathpos *wp) -{ - size_t i = 0; - - if (NULL == wp) - { - lgdebug(+D_WPP, "Empty\n"); - return; - } - lgdebug(+D_WPP, "\n"); - for (; NULL != wp->word; wp++) - { - lgdebug(D_WPP, "%zu: %zu:word '%s', same=%d used=%d level=%zu\n", - i++, wp->word->node_num, wp->word->subword, wp->same_word, - wp->used, wp->word->hier_depth); - } -} -#undef D_WPP - -/** - * Print the chosen_disjuncts words. - * This is used for debug, e.g. for tracking them in the Wordgraph display. - */ -void print_chosen_disjuncts_words(const Linkage lkg) -{ - size_t i; - String *djwbuf = string_new(); - - err_msg(lg_Debug, "Linkage %p (%zu words): ", lkg, lkg->num_words); - for (i = 0; i < lkg->num_words; i++) - { - Disjunct *cdj = lkg->chosen_disjuncts[i]; - const char *djw; /* disjunct word - the chosen word */ - - if (NULL == cdj) - djw = lkg->sent->word[i].optional ? 
"{}" : "[]"; - else if ('\0' == cdj->string[0]) - djw = "\\0"; /* null string - something is wrong */ - else - djw = cdj->string; - - char *djw_tmp = strdupa(djw); - char *sm = strrchr(djw_tmp, SUBSCRIPT_MARK); - if (NULL != sm) *sm = SUBSCRIPT_DOT; - - append_string(djwbuf, "%s ", djw_tmp); - } - err_msg(lg_Debug, "%s\n", string_value(djwbuf)); - string_delete(djwbuf); -} diff -Nru link-grammar-5.3.16/link-grammar/print.h link-grammar-5.5.0/link-grammar/print.h --- link-grammar-5.3.16/link-grammar/print.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/print.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,30 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _PRINT_H -#define _PRINT_H - -#include "link-includes.h" - -#define LEFT_WALL_DISPLAY ("LEFT-WALL") /* the string to use to show the wall */ -#define RIGHT_WALL_DISPLAY ("RIGHT-WALL") /* the string to use to show the wall */ - -void print_disjunct_counts(Sentence sent); -void print_expression_sizes(Sentence sent); -struct tokenpos; -void print_sentence_word_alternatives(Sentence sent, bool debugprint, - void (*display)(Dictionary, const char *), struct tokenpos *); -void print_with_subscript_dot(const char *); -void print_chosen_disjuncts_words(const Linkage); -void print_lwg_path(Gword **); -void print_wordgraph_pathpos(const Wordgraph_pathpos *); -#endif /* _PRINT_H */ diff -Nru link-grammar-5.3.16/link-grammar/print-util.c link-grammar-5.5.0/link-grammar/print-util.c --- link-grammar-5.3.16/link-grammar/print-util.c 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/print-util.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,134 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include -#include -#include -#include "print-util.h" -#include "utilities.h" - -/* This is a "safe" append function, used here to build up a link diagram - incrementally. 
Because the diagram is built up a few characters at - a time, we keep around a pointer to the end of string to prevent - the algorithm from being quadratic. */ - -struct String_s -{ - size_t allocated; /* Unsigned so VC++ doesn't complain about comparisons */ - size_t eos; /* offset to end of string */ - char * p; -}; - -String * string_new(void) -{ -#define INITSZ 30 - String * string; - string = (String *) exalloc(sizeof(String)); - string->allocated = INITSZ; - string->p = (char *) exalloc(INITSZ*sizeof(char)); - string->p[0] = '\0'; - string->eos = 0; - return string; -} - -void string_delete(String *s) -{ - exfree(s->p, s->allocated*sizeof(char)); - exfree(s, sizeof(String)); -} - -const char * string_value(String *s) -{ - return s->p; -} - -char * string_copy(String *s) -{ - char * p = (char *) exalloc(s->eos + 1); - strcpy(p, s->p); - return p; -} - -static void string_append_l(String *string, const char *a, size_t len) -{ - if (string->allocated <= string->eos + len) - { - string->allocated = 2 * string->allocated + len + 1; - string->p = (char *)realloc(string->p, string->allocated); - } - strcpy(string->p + string->eos, a); - string->eos += len; - return; -} - -/* Note: As in the rest of the LG library, we assume here C99 compliance. */ -void vappend_string(String * string, const char *fmt, va_list args) -{ -#define TMPLEN 1024 /* Big enough for a possible error message, see below */ - char temp_buffer[TMPLEN]; - char *temp_string = temp_buffer; - size_t templen; - va_list copy_args; - - va_copy(copy_args, args); - templen = vsnprintf(temp_string, TMPLEN, fmt, copy_args); - va_end(copy_args); - - if ((int)templen < 0) goto error; - // if (fmt[0] == '(') { errno=2; goto error;} /* Test the error reporting. */ - - if (templen >= TMPLEN) - { - /* TMPLEN is too small - use a bigger buffer. Couldn't actually - * find any example of entering this code with TMPLEN=1024... 
*/ - temp_string = alloca(templen+1); - templen = vsnprintf(temp_string, templen+1, fmt, args); - if ((int)templen < 0) goto error; - } - va_end(args); - - string_append_l(string, temp_string, templen); - return; - -error: - { - /* Some error has occurred */ - const char msg[] = "[vappend_string(): "; - strcpy(temp_buffer, msg); - strerror_r(errno, temp_buffer+sizeof(msg)-1, TMPLEN-sizeof(msg)); - strcat(temp_buffer, "]"); - string_append_l(string, temp_string, strlen(temp_buffer)); - return; - } -} - -void append_string(String * string, const char *fmt, ...) -{ - va_list args; - va_start(args, fmt); - - vappend_string(string, fmt, args); -} - -size_t append_utf8_char(String * string, const char * mbs) -{ - /* Copy exactly one multi-byte character to buf */ - char buf[10]; - size_t n = utf8_next(mbs); - - assert(n<10, "Multi-byte character is too long!"); - strncpy(buf, mbs, n); - buf[n] = 0; - append_string(string, "%s", buf); - return n; -} diff -Nru link-grammar-5.3.16/link-grammar/print-util.h link-grammar-5.5.0/link-grammar/print-util.h --- link-grammar-5.3.16/link-grammar/print-util.h 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/print-util.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ -#ifndef LG_PRINT_UTIL_H_ -#define LG_PRINT_UTIL_H_ - -#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4) -#define GNUC_PRINTF( format_idx, arg_idx ) \ - __attribute__((__format__ (__printf__, format_idx, arg_idx))) -#else -#define GNUC_PRINTF( format_idx, arg_idx ) -#endif - -#include -#include - -typedef struct String_s String; - -String * string_new(void); -void string_delete(String *); -const char * string_value(String *); -char * string_copy(String *); -void append_string(String * string, const char *fmt, ...) GNUC_PRINTF(2,3); -void vappend_string(String * string, const char *fmt, va_list args) - GNUC_PRINTF(2,0); -size_t append_utf8_char(String * string, const char * mbs); - -#endif - - diff -Nru link-grammar-5.3.16/link-grammar/prune.c link-grammar-5.5.0/link-grammar/prune.c --- link-grammar-5.3.16/link-grammar/prune.c 2017-02-03 00:13:22.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/prune.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1735 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2009, 2013, 2014 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#include "api-structures.h" -#include "count.h" -#include "dict-api.h" /* for print_expression when debugging */ -#include "disjunct-utils.h" -#include "externs.h" -#include "post-process.h" -#include "print.h" - -#include "prune.h" -#include "resources.h" -#include "string-set.h" -#include "wordgraph.h" -#include "word-utils.h" - -#define D_PRUNE 5 - -#define CONTABSZ 8192 -typedef Connector * connector_table; - -typedef struct disjunct_dup_table_s disjunct_dup_table; -struct disjunct_dup_table_s -{ - unsigned int dup_table_size; - Disjunct ** dup_table; -}; - -/* Indicator that this connector cannot be used -- that its "obsolete". */ -#define BAD_WORD (MAX_SENTENCE+1) - -typedef struct c_list_s C_list; -struct c_list_s -{ - C_list * next; - Connector * c; - bool shallow; -}; - -typedef struct power_table_s power_table; -struct power_table_s -{ - unsigned int power_table_size; - unsigned int *l_table_size; /* the sizes of the hash tables */ - unsigned int *r_table_size; - C_list *** l_table; - C_list *** r_table; -}; - -typedef struct cms_struct Cms; -struct cms_struct -{ - Cms * next; - const char * name; - int count; /* the number of times this is in the multiset */ -}; - -#define CMS_SIZE (2<<10) -typedef struct multiset_table_s multiset_table; -struct multiset_table_s -{ - Cms * cms_table[CMS_SIZE]; -}; - -typedef struct prune_context_s prune_context; -struct prune_context_s -{ - bool null_links; - int power_cost; - int N_changed; /* counts the number of changes - of c->nearest_word fields in a pass */ - power_table *pt; -#ifdef ALT_DISJUNCT_CONSISTENCY - const Connector *first_connector; /* for alt disjunct consistency */ -#endif - Sentence sent; -}; - -/* - - The algorithms in this file prune disjuncts from the disjunct list - of the sentence that can be elimininated by a simple checks. 
The first - check works as follows: - - A series of passes are made through the sentence, alternating - left-to-right and right-to-left. Consier the left-to-right pass (the - other is symmetric). A set S of connectors is maintained (initialized - to be empty). Now the disjuncts of the current word are processed. - If a given disjunct's left pointing connectors have the property that - at least one of them has no connector in S to which it can be matched, - then that disjunct is deleted. Now the set S is augmented by the right - connectors of the remaining disjuncts of that word. This completes - one word. The process continues through the words from left to right. - Alternate passes are made until no disjunct is deleted. - - It worries me a little that if there are some really huge disjuncts lists, - then this process will probably do nothing. (This fear turns out to be - unfounded.) - - Notes: Power pruning will not work if applied before generating the - "and" disjuncts. This is because certain of it's tricks don't work. - Think about this, and finish this note later.... - Also, currently I use the standard connector match procedure instead - of the pruning one, since I know power pruning will not be used before - and generation. Replace this to allow power pruning to work before - generating and disjuncts. - - Currently it seems that normal pruning, power pruning, and generation, - pruning, and power pruning (after "and" generation) and parsing take - about the same amount of time. This is why doing power pruning before - "and" generation might be a very good idea. - - New idea: Suppose all the disjuncts of a word have a connector of type - c pointing to the right. And further, suppose that there is exactly one - word to its right containing that type of connector pointing to the left. - Then all the other disjuncts on the latter word can be deleted. 
- (This situation is created by the processing of "either...or", and by - the extra disjuncts added to a "," neighboring a conjunction.) - -*/ - -/** - * This hash function only looks at the leading upper case letters of - * the connector string, and the label fields. This ensures that if two - * strings match (formally), then they must hash to the same place. - */ -static inline unsigned int hash_S(Connector * c) -{ - unsigned int h = connector_hash(c); - return (h & (CONTABSZ-1)); -} - -static void zero_connector_table(connector_table *ct) -{ - memset(ct, 0, sizeof(Connector *) * CONTABSZ); -} - -/** - * This function puts connector c into the connector table - * if one like it isn't already there. - */ -static void insert_connector(connector_table *ct, Connector * c) -{ - unsigned int h; - Connector * e; - - h = hash_S(c); - - for (e = ct[h]; e != NULL; e = e->tableNext) - { - if (string_set_cmp(c->string, e->string)) - return; - } - c->tableNext = ct[h]; - ct[h] = c; -} - -/* - The second algorithm eliminates disjuncts that are dominated by - another. It works by hashing them all, and checking for domination. -*/ - -#if 0 -/* ============================================================x */ - -/* - Consider the idea of deleting a disjunct if it is dominated (in terms of - what it can match) by some other disjunct on the same word. This has - been implemented below. There are three problems with it: - - (1) It is almost never the case that any disjuncts are eliminated. - - (2) connector_matches_alam may not be exactly correct. - - (3) The linkage that is eliminated by this, might just be the one that - passes post-processing, as the following example shows. - This is pretty silly, and should probably be changed. - -> telling John how our program works would be stupid -Accepted (2 linkages, 1 with no P.P. 
violations) - Linkage 1, cost vector = (0, 0, 7) - - +------------------G-----------------+ - +-----R-----+----CL----+ | - +---O---+ | +---D--+---S---+ +--I-+-AI-+ - | | | | | | | | | -telling.g John how our program.n works would be stupid - - ///// CLg <---CLg---> CL telling.g - (g) telling.g G <---G-----> G would - (g) (d) telling.g R <---R-----> R how - (g) (d) telling.g O <---O-----> O John - (g) (d) how CLe <---CLe---> CL program.n - (g) (d) (e) our D <---Ds----> Ds program.n - (g) (d) (e) program.n Ss <---Ss----> Ss works - (g) would I <---Ix----> Ix be - (g) be AI <---AIi---> AIi stupid - -(press return for another) -> - Linkage 2 (bad), cost vector = (0, 0, 7) - - +------------------G-----------------+ - +-----R-----+----CL----+ | - +---O---+ | +---D--+---S---+ +--I-+-AI-+ - | | | | | | | | | -telling.g John how our program.n works would be stupid - - ///// CLg <---CLg---> CL telling.g - (g) telling.g G <---G-----> G would - (g) (d) telling.g R <---R-----> R how - (g) (d) telling.g O <---O-----> O John - (g) (d) how CLe <---CLe---> CL program.n - (g) (d) (e) our D <---Ds----> Ds program.n - (g) (d) (e) program.n Ss <---Ss----> Ss works - (g) would I <---Ix----> Ix be - (g) be AI <---AI----> AI stupid - -P.P. violations: - Special subject rule violated -*/ - -/** - * hash function that takes a string and a seed value i - */ -static int string_hash(disjunct_dup_table *dt, const char * s, int i) -{ - for(;*s != '\0';s++) i = i + (i<<1) + randtable[(*s + i) & (RTSIZE-1)]; - return (i & (dt->dup_table_size-1)); -} - -/** - * This returns true if the connector a matches everything that b - * matches, and possibly more. (alam=at least as much) - * - * TRUE for equal connectors. - * remains TRUE if multi-match added to the first. - * remains TRUE if subsrcripts deleted from the first. 
- */ -static bool connector_matches_alam(Connector * a, Connector * b) -{ - char * s, * t, *u; - if (((!a->multi) && b->multi) || - (a->label != b->label)) return false; - s = a->string; - t = b->string; - - /* isupper -- connectors cannot be UTF8 at this time */ - while (isupper(*s) || isupper(*t)) - { - if (*s == *t) { - s++; - t++; - } else return false; - } - while ((*s != '\0') && (*t != '\0')) { - if ((*s == *t) || (*s == '*')) { - s++; - t++; - } else return false; - } - while ((*s != '\0') && (*s == '*')) s++; - return (*s == '\0'); -} - - -/** - * This hash function that takes a connector and a seed value i. - * It only looks at the leading upper case letters of - * the string, and the label. This ensures that if two connectors - * match, then they must hash to the same place. - */ -static int conn_hash(Connector * c, int i) -{ - int nb; - const char * s; - s = c->string; - - i = i + (i<<1) + randtable[(c->label + i) & (RTSIZE-1)]; - nb = is_utf8_upper(s); - while (nb) - { - i = i + (i<<1) + randtable[(*s + i) & (RTSIZE-1)]; - s += nb; - nb = is_utf8_upper(s); - } - return i; -} - -static inline int pconnector_hash(disjunct_dup_table *dt, Connector * c, int i) -{ - i = conn_hash(c, i); - return (i & (ct->dup_table_size-1)); -} - -/** - * This is a hash function for disjuncts - */ -static int hash_disjunct(disjunct_dup_table *dt, Disjunct * d) -{ - int i; - Connector *e; - i = 0; - for (e = d->left ; e != NULL; e = e->next) - { - i = pconnector_hash(dt, e, i); - } - for (e = d->right ; e != NULL; e = e->next) - { - i = pconnector_hash(dt, e, i); - } - return string_hash(dt, d->string, i); -} - -/** - * Returns TRUE if disjunct d1 can match anything that d2 can - * if this happens, it constitutes a proof that there is absolutely - * no use for d2. 
- */ -static bool disjunct_matches_alam(Disjunct * d1, Disjunct * d2) -{ - Connector *e1, *e2; - if (d1->cost > d2->cost) return false; - e1 = d1->left; - e2 = d2->left; - while ((e1!=NULL) && (e2!=NULL)) - { - if (!connector_matches_alam(e1,e2)) break; - e1 = e1->next; - e2 = e2->next; - } - if ((e1!=NULL) || (e2!=NULL)) return false; - e1 = d1->right; - e2 = d2->right; - while ((e1!=NULL) && (e2!=NULL)) - { - if (!connector_matches_alam(e1,e2)) break; - e1 = e1->next; - e2 = e2->next; - } - if ((e1!=NULL) || (e2!=NULL)) return false; - return (strcmp(d1->string, d2->string) == 0); -} - -/** - * Takes the list of disjuncts pointed to by d, eliminates all - * duplicates, and returns a pointer to a new list. - * It frees the disjuncts that are eliminated. - */ -Disjunct * eliminate_duplicate_disjuncts(Disjunct * d) -{ - int i, h, count; - Disjunct *dn, *dx, *dxn, *front; - count = 0; - disjunct_dup_table *dt; - - dt = disjunct_dup_table_new(next_power_of_two_up(2 * count_disjuncts(d))); - - for (;d!=NULL; d = dn) - { - dn = d->next; - h = hash_disjunct(d); - - front = NULL; - for (dx = dt->dup_table[h]; dx != NULL; dx = dxn) - { - dxn = dx->next; - if (disjunct_matches_alam(dx,d)) - { - /* we know that d should be killed */ - d->next = NULL; - free_disjuncts(d); - count++; - front = catenate_disjuncts(front, dx); - break; - } else if (disjunct_matches_alam(d,dx)) { - /* we know that dx should be killed off */ - dx->next = NULL; - free_disjuncts(dx); - count++; - } else { - /* neither should be killed off */ - dx->next = front; - front = dx; - } - } - if (dx == NULL) { - /* we put d in the table */ - d->next = front; - front = d; - } - dt->dup_table[h] = front; - } - - /* d is now NULL */ - for (i = 0; i < dt->dup_table_size; i++) - { - for (dx = dt->dup_table[i]; dx != NULL; dx = dxn) - { - dxn = dx->next; - dx->next = d; - d = dx; - } - } - - if ((verbosity > 2) && (count != 0)) printf("killed %d duplicates\n", count); - - disjunct_dup_table_delete(dt); - return d; 
-} - -/* ============================================================x */ -#endif - -/* ================================================================= */ -/** - * Here is expression pruning. This is done even before the expressions - * are turned into lists of disjuncts. - * - * This uses many of the same data structures and functions that are used - * by prune. - * - * The purge operations remove all irrelevant stuff from the expression, - * and free the purged stuff. A connector is deemed irrelevant if its - * string pointer has been set to NULL. The passes through the sentence - * have the job of doing this. - * - * If an OR or AND type expression node has one child, we can replace it - * by its child. This, of course, is not really necessary, except for - * performance(?) - */ - -static Exp* purge_Exp(Exp *); - -/** - * Get rid of the elements with null expressions - */ -static E_list * or_purge_E_list(E_list * l) -{ - E_list * el; - if (l == NULL) return NULL; - if ((l->e = purge_Exp(l->e)) == NULL) - { - el = or_purge_E_list(l->next); - xfree((char *)l, sizeof(E_list)); - return el; - } - l->next = or_purge_E_list(l->next); - return l; -} - -/** - * Returns 0 iff the length of the disjunct list is 0. - * If this is the case, it frees the structure rooted at l. - */ -static int and_purge_E_list(E_list * l) -{ - if (l == NULL) return 1; - if ((l->e = purge_Exp(l->e)) == NULL) - { - free_E_list(l->next); - xfree((char *)l, sizeof(E_list)); - return 0; - } - if (and_purge_E_list(l->next) == 0) - { - free_Exp(l->e); - xfree((char *)l, sizeof(E_list)); - return 0; - } - return 1; -} - -/** - * Must be called with a non-null expression. - * Return NULL iff the expression has no disjuncts. 
- */ -static Exp* purge_Exp(Exp *e) -{ - if (e->type == CONNECTOR_type) - { - if (e->u.string == NULL) - { - xfree((char *)e, sizeof(Exp)); - return NULL; - } - else - { - return e; - } - } - if (e->type == AND_type) - { - if (and_purge_E_list(e->u.l) == 0) - { - xfree((char *)e, sizeof(Exp)); - return NULL; - } - } - else /* if we are here, its OR_type */ - { - e->u.l = or_purge_E_list(e->u.l); - if (e->u.l == NULL) - { - xfree((char *)e, sizeof(Exp)); - return NULL; - } - } - -/* This code makes it kill off nodes that have just one child - (1) It's going to give an insignificant speed-up - (2) Costs have not been handled correctly here. - The code is excised for these reasons. -*/ -/* - if ((e->u.l != NULL) && (e->u.l->next == NULL)) - { - ne = e->u.l->e; - xfree((char *) e->u.l, sizeof(E_list)); - xfree((char *) e, sizeof(Exp)); - return ne; - } -*/ - return e; -} - -/** - * Returns TRUE if c can match anything in the set S (err. the connector table ct). - */ -static inline bool matches_S(connector_table *ct, Connector * c) -{ - Connector * e; - - for (e = ct[hash_S(c)]; e != NULL; e = e->tableNext) - { - if (easy_match(e->string, c->string)) return true; - } - return false; -} - -/** - * Mark as dead all of the dir-pointing connectors - * in e that are not matched by anything in the current set. - * Returns the number of connectors so marked. - */ -static int mark_dead_connectors(connector_table *ct, Exp * e, char dir) -{ - int count; - count = 0; - if (e->type == CONNECTOR_type) - { - if (e->dir == dir) - { - Connector dummy; - init_connector(&dummy); - dummy.string = e->u.string; - if (!matches_S(ct, &dummy)) - { - e->u.string = NULL; - count++; - } - } - } - else - { - E_list *l; - for (l = e->u.l; l != NULL; l = l->next) - { - count += mark_dead_connectors(ct, l->e, dir); - } - } - return count; -} - -/** - * Put into the set S all of the dir-pointing connectors still in e. - * Return a list of allocated dummy connectors; these will need to be - * freed. 
- */ -static Connector * insert_connectors(connector_table *ct, Exp * e, - Connector *alloc_list, int dir) -{ - if (e->type == CONNECTOR_type) - { - if (e->dir == dir) - { - Connector *dummy = connector_new(); - dummy->string = e->u.string; - insert_connector(ct, dummy); - dummy->next = alloc_list; - alloc_list = dummy; - } - } - else - { - E_list *l; - for (l=e->u.l; l!=NULL; l=l->next) - { - alloc_list = insert_connectors(ct, l->e, alloc_list, dir); - } - } - return alloc_list; -} - -/** - * This removes the expressions that are empty from the list corresponding - * to word w of the sentence. - */ -static void clean_up_expressions(Sentence sent, int w) -{ - X_node head_node, *d, *d1; - d = &head_node; - d->next = sent->word[w].x; - while (d->next != NULL) - { - if (d->next->exp == NULL) - { - d1 = d->next; - d->next = d1->next; - xfree((char *)d1, sizeof(X_node)); - } - else - { - d = d->next; - } - } - sent->word[w].x = head_node.next; -} - -/* #define DBG(X) X */ -#define DBG(X) - -void expression_prune(Sentence sent) -{ - int N_deleted; - X_node * x; - size_t w; - Connector *ct[CONTABSZ]; - Connector *dummy_list = NULL; - - zero_connector_table(ct); - - N_deleted = 1; /* a lie to make it always do at least 2 passes */ - - while (1) - { - /* Left-to-right pass */ - /* For every word */ - for (w = 0; w < sent->length; w++) - { - /* For every expression in word */ - for (x = sent->word[w].x; x != NULL; x = x->next) - { -DBG(printf("before marking: "); print_expression(x->exp); printf("\n");); - N_deleted += mark_dead_connectors(ct, x->exp, '-'); -DBG(printf(" after marking: "); print_expression(x->exp); printf("\n");); - } - for (x = sent->word[w].x; x != NULL; x = x->next) - { -DBG(printf("before purging: "); print_expression(x->exp); printf("\n");); - x->exp = purge_Exp(x->exp); -DBG(printf("after purging: "); print_expression(x->exp); printf("\n");); - } - - /* gets rid of X_nodes with NULL exp */ - clean_up_expressions(sent, w); - for (x = sent->word[w].x; x 
!= NULL; x = x->next) - { - dummy_list = insert_connectors(ct, x->exp, dummy_list, '+'); - } - } - - if (verbosity_level(D_PRUNE)) - { - printf("l->r pass removed %d\n", N_deleted); - print_expression_sizes(sent); - } - - /* Free the allocated dummy connectors */ - free_connectors(dummy_list); - dummy_list = NULL; - zero_connector_table(ct); - - if (N_deleted == 0) break; - - /* Right-to-left pass */ - N_deleted = 0; - for (w = sent->length-1; w != (size_t) -1; w--) - { - for (x = sent->word[w].x; x != NULL; x = x->next) - { -/* printf("before marking: "); print_expression(x->exp); printf("\n"); */ - N_deleted += mark_dead_connectors(ct, x->exp, '+'); -/* printf("after marking: "); print_expression(x->exp); printf("\n"); */ - } - for (x = sent->word[w].x; x != NULL; x = x->next) - { -/* printf("before perging: "); print_expression(x->exp); printf("\n"); */ - x->exp = purge_Exp(x->exp); -/* printf("after perging: "); print_expression(x->exp); printf("\n"); */ - } - clean_up_expressions(sent, w); /* gets rid of X_nodes with NULL exp */ - for (x = sent->word[w].x; x != NULL; x = x->next) - { - dummy_list = insert_connectors(ct, x->exp, dummy_list, '-'); - } - } - - if (verbosity_level(D_PRUNE)) - { - printf("r->l pass removed %d\n", N_deleted); - print_expression_sizes(sent); - } - - /* Free the allocated dummy connectors */ - free_connectors(dummy_list); - dummy_list = NULL; - zero_connector_table(ct); - if (N_deleted == 0) break; - N_deleted = 0; - } -} - - - -/* - Here is what you've been waiting for: POWER-PRUNE - - The kinds of constraints it checks for are the following: - - 1) successive connectors on the same disjunct have to go to - nearer and nearer words. - - 2) two deep connectors cannot attach to each other - (A connectors is deep if it is not the first in its list; it - is shallow if it is the first in its list; it is deepest if it - is the last on its list.) 
- - 3) on two adjacent words, a pair of connectors can be used - only if they're the deepest ones on their disjuncts - - 4) on two non-adjacent words, a pair of connectors can be used only - if not [both of them are the deepest]. - - The data structure consists of a pair of hash tables on every word. - Each bucket of a hash table has a list of pointers to connectors. - These nodes also store if the chosen connector is shallow. -*/ -/* - As with normal pruning, we make alternate left->right and right->left - passes. In the R->L pass, when we're on a word w, we make use of - all the left-pointing hash tables on the words to the right of w. - After the pruning on this word, we build the left-pointing hash table - this word. This guarantees idempotence of the pass -- after doing an - L->R, doing another would change nothing. - - Each connector has an integer c_word field. This refers to the closest - word that it could be connected to. These are initially determined by - how deep the connector is. For example, a deepest connector can connect - to the neighboring word, so its c_word field is w+1 (w-1 if this is a left - pointing connector). It's neighboring shallow connector has a c_word - value of w+2, etc. - - The pruning process adjusts these c_word values as it goes along, - accumulating information about any way of linking this sentence. - The pruning process stops only after no disjunct is deleted and no - c_word values change. - - The difference between RUTHLESS and GENTLE power pruning is simply - that GENTLE uses the deletable region array, and RUTHLESS does not. - So we can get the effect of these two different methods simply by - always unsuring that deletable[][] has been defined. With nothing - deletable, this is equivalent to RUTHLESS. --DS, 7/97 -*/ - -/** - * returns the number of connectors in the left lists of the disjuncts. 
- */ -static int left_connector_count(Disjunct * d) -{ - Connector *c; - int i=0; - for (;d!=NULL; d=d->next) { - for (c = d->left; c!=NULL; c = c->next) i++; - } - return i; -} - -static int right_connector_count(Disjunct * d) -{ - Connector *c; - int i=0; - for (;d!=NULL; d=d->next) { - for (c = d->right; c!=NULL; c = c->next) i++; - } - return i; -} - -static void free_C_list(C_list * t) -{ - C_list *xt; - for (; t!=NULL; t=xt) { - xt = t->next; - xfree((char *)t, sizeof(C_list)); - } -} - -/** - * free all of the hash tables and C_lists - */ -static void power_table_delete(power_table *pt) -{ - unsigned int w; - unsigned int i; - - for (w = 0; w < pt->power_table_size; w++) - { - for (i = 0; i < pt->l_table_size[w]; i++) - { - free_C_list(pt->l_table[w][i]); - } - xfree((char *)pt->l_table[w], pt->l_table_size[w] * sizeof (C_list *)); - - for (i = 0; i < pt->r_table_size[w]; i++) - { - free_C_list(pt->r_table[w][i]); - } - xfree((char *)pt->r_table[w], pt->r_table_size[w] * sizeof (C_list *)); - } - xfree(pt->l_table_size, 2 * pt->power_table_size * sizeof(unsigned int)); - xfree(pt->l_table, 2 * pt->power_table_size * sizeof(C_list **)); - xfree(pt, sizeof(power_table)); -} - -/** - * The disjunct d (whose left or right pointer points to c) is put - * into the appropriate hash table - */ -static void put_into_power_table(unsigned int size, C_list ** t, Connector * c, bool shal) -{ - unsigned int h; - C_list * m; - h = connector_hash(c) & (size-1); - m = (C_list *) xalloc (sizeof(C_list)); - m->next = t[h]; - t[h] = m; - m->c = c; - m->shallow = shal; -} - -/** - * Allocates and builds the initial power hash tables - */ -static power_table * power_table_new(Sentence sent) -{ - power_table *pt; - size_t w, len; - unsigned int i, size; - C_list ** t; - Disjunct * d; - Connector * c; - - pt = (power_table *) xalloc (sizeof(power_table)); - pt->power_table_size = sent->length; - pt->l_table_size = xalloc (2 * sent->length * sizeof(unsigned int)); - pt->r_table_size 
= pt->l_table_size + sent->length; - pt->l_table = xalloc (2 * sent->length * sizeof(C_list **)); - pt->r_table = pt->l_table + sent->length; - - for (w=0; wlength; w++) - { - /* The below uses variable-sized hash tables. This seems to - * provide performance that is equal or better than the best - * fixed-size performance. - * The best fixed-size performance seems to come at about - * a 1K table size, for both English and Russian. (Both have - * about 100 fixed link-types, and many thousands of auto-genned - * link types (IDxxx idioms for both, LLxxx suffix links for - * Russian). Pluses and minuses: - * + small fixed tables are faster to initialize. - * - small fixed tables have more collisions - * - variable-size tables require counting connectors. - * (and the more complex code to go with) - * CPU cache-size effects ... - * Strong depenence on the hashing algo! - */ - len = left_connector_count(sent->word[w].d); - size = next_power_of_two_up(len); -#define TOPSZ 32768 - if (TOPSZ < size) size = TOPSZ; - pt->l_table_size[w] = size; - t = pt->l_table[w] = (C_list **) xalloc(size * sizeof(C_list *)); - for (i=0; iword[w].d; d!=NULL; d=d->next) { - c = d->left; - if (c != NULL) { - put_into_power_table(size, t, c, true); - for (c=c->next; c!=NULL; c=c->next) { - put_into_power_table(size, t, c, false); - } - } - } - - len = right_connector_count(sent->word[w].d); - size = next_power_of_two_up(len); - if (TOPSZ < size) size = TOPSZ; - pt->r_table_size[w] = size; - t = pt->r_table[w] = (C_list **) xalloc(size * sizeof(C_list *)); - for (i=0; iword[w].d; d!=NULL; d=d->next) { - c = d->right; - if (c != NULL) { - put_into_power_table(size, t, c, true); - for (c=c->next; c!=NULL; c=c->next){ - put_into_power_table(size, t, c, false); - } - } - } - } - - return pt; -} - -/** - * This runs through all the connectors in this table, and eliminates those - * who are obsolete. The word fields of an obsolete one has been set to - * BAD_WORD. 
- */ -static void clean_table(unsigned int size, C_list ** t) -{ - unsigned int i; - C_list * m, * xm, * head; - for (i = 0; i < size; i++) { - head = NULL; - for (m = t[i]; m != NULL; m = xm) { - xm = m->next; - if (m->c->nearest_word != BAD_WORD) { - m->next = head; - head = m; - } else { - xfree((char *) m, sizeof(C_list)); - } - } - t[i] = head; - } -} - -/** - * Find if words w1 and w2 may become adjacent due to optional words. - * This may happen if they only contain optional words between them. - * - * Return true iff they may become adjacent (i.e. all the words - * between them are optional). - */ -static bool optional_gap_collapse(Sentence sent, int w1, int w2) -{ - for (int w = w1+1; w < w2; w++) - if (!sent->word[w].optional) return false; - - return true; -} - -#if defined(ALT_MUTUAL_CONSISTENCY) || defined(ALT_DISJUNCT_CONSISTENCY) -static bool alt_consistency(prune_context *pc, - Connector *lc, Connector *rc, - int lword, int rword, bool lr) -{ - bool same_alternative = false; - -#ifdef ALT_MUTUAL_CONSISTENCY - /* Validate that rc and lc are from the same alternative. */ - for (Gword **lg = (Gword **)lc->word; NULL != *lg; lg++) - { - for (Gword **rg = (Gword **)rc->word; NULL != *rg; rg++) - { - if (in_same_alternative(*lg, *rg)) - { - same_alternative = true; - break; - } - } - if (same_alternative) break; - } -#endif /* ALT_MUTUAL_CONSISTENCY */ - -#ifdef ALT_DISJUNCT_CONSISTENCY - /* Validate that the candidate connector is not from a different - * alternative than any of the already existing connectors of the - * checked disjunct (XXX at the same side). - */ - - if (same_alternative) - { - const Connector *remote_connecor = lr ? lc : rc; - const Gword **gword_c = remote_connecor->word; - const Connector *curr_connecor = lr ? rc : lc; - -#if 0 - printf("CHECK %s F%p=%s R%p=%s:", lr ? 
"rc" : "lc", - pc->first_connector, pc->first_connector->string, - remote_connecor, remote_connecor->string); -#endif - for (const Connector *i = pc->first_connector; curr_connecor != i; i = i->next) - { - printf(" I%p=%s", i, i->string); - bool alt_compatible = false; - for (Gword **gi = (Gword **)i->word; NULL != *gi; gi++) - { - for (Gword **gcp = (Gword **)gword_c; NULL != *gcp; gcp++) - { - if (in_same_alternative(*gi, *gcp)) - { - alt_compatible = true; - break; - } - } - if (alt_compatible) break; - } - if (!alt_compatible) - { - same_alternative = false; -#if 0 - printf(" FALSE\n"); -#endif - break; - } - } -#if 0 - printf("\n"); -#endif - } -#endif /* ALT_DISJUNCT_CONSISTENCY */ - - if (!same_alternative) - { - lgdebug(8, "w%d=%s and w%d=%s NSA\n", - lword, lc->word[0]->subword, - rword, rc->word[0]->subword); - - return false; - } - - return same_alternative; -} -#endif /* defined(ALT_MUTUAL_CONSISTENCY) || defined(ALT_DISJUNCT_CONSISTENCY)*/ - -/** - * This takes two connectors (and whether these are shallow or not) - * (and the two words that these came from) and returns TRUE if it is - * possible for these two to match based on local considerations. - */ -static bool possible_connection(prune_context *pc, - Connector *lc, Connector *rc, - bool lshallow, bool rshallow, - int lword, int rword, bool lr) -{ - int dist; - if ((!lshallow) && (!rshallow)) return false; - - /* Two deep connectors can't work */ - if ((lc->nearest_word > rword) || (rc->nearest_word < lword)) return false; - - dist = rword - lword; - // assert(0 < dist, "Bad word order in possible connection."); - - /* Word range constraints */ - if (1 == dist) - { - if ((lc->next != NULL) || (rc->next != NULL)) return false; - } - else - if (dist > lc->length_limit || dist > rc->length_limit) - { - return false; - } - /* If the words are NOT next to each other, then there must be - * at least one intervening connector (i.e. cannot have both - * lc->next amnd rc->next being null). 
But we only enforce this - * when we think its still possible to have a complete parse, - * i.e. before well allow null-linked words. - */ - else - if (!pc->null_links && - (lc->next == NULL) && - (rc->next == NULL) && - (!lc->multi) && (!rc->multi) && - !optional_gap_collapse(pc->sent, lword, rword)) - { - return false; - } - -#if defined(ALT_MUTUAL_CONSISTENCY) || defined(ALT_DISJUNCT_CONSISTENCY) - if (!alt_consistency(pc, lc, rc, lword, rword, lr)) return false; -#endif - - return easy_match(lc->string, rc->string); -} - -/** - * This returns TRUE if the right table of word w contains - * a connector that can match to c. shallow tells if c is shallow. - */ -static bool -right_table_search(prune_context *pc, int w, Connector *c, - bool shallow, int word_c) -{ - unsigned int size, h; - C_list *cl; - power_table *pt = pc->pt; - - size = pt->r_table_size[w]; - h = connector_hash(c) & (size-1); - for (cl = pt->r_table[w][h]; cl != NULL; cl = cl->next) - { - if (possible_connection(pc, cl->c, c, cl->shallow, shallow, w, word_c, true)) - return true; - } - return false; -} - -/** - * This returns TRUE if the right table of word w contains - * a connector that can match to c. shallows tells if c is shallow - */ -static bool -left_table_search(prune_context *pc, int w, Connector *c, - bool shallow, int word_c) -{ - unsigned int size, h; - C_list *cl; - power_table *pt = pc->pt; - - size = pt->l_table_size[w]; - h = connector_hash(c) & (size-1); - for (cl = pt->l_table[w][h]; cl != NULL; cl = cl->next) - { - if (possible_connection(pc, c, cl->c, shallow, cl->shallow, word_c, w, false)) - return true; - } - return false; -} - -/** - * Take this connector list, and try to match it with the words - * w-1, w-2, w-3... Returns the word to which the first connector of - * the list could possibly be matched. If c is NULL, returns w. If - * there is no way to match this list, it returns a negative number. 
- * If it does find a way to match it, it updates the c->nearest_word fields - * correctly. - */ -static int -left_connector_list_update(prune_context *pc, Connector *c, - int w, bool shallow) -{ - int n, lb; - bool foundmatch; - - if (c == NULL) return w; - n = left_connector_list_update(pc, c->next, w, false) - 1; - if (((int) c->nearest_word) < n) n = c->nearest_word; - - /* lb is now the leftmost word we need to check */ - lb = w - c->length_limit; - if (0 > lb) lb = 0; - - /* n is now the rightmost word we need to check */ - foundmatch = false; - for (; n >= lb ; n--) - { - pc->power_cost++; - if (right_table_search(pc, n, c, shallow, w)) - { - foundmatch = true; - break; - } - } - if (n < ((int) c->nearest_word)) - { - c->nearest_word = n; - pc->N_changed++; - } - return (foundmatch ? n : -1); -} - -/** - * Take this connector list, and try to match it with the words - * w+1, w+2, w+3... Returns the word to which the first connector of - * the list could possibly be matched. If c is NULL, returns w. If - * there is no way to match this list, it returns a number greater than - * N_words - 1. If it does find a way to match it, it updates the - * c->nearest_word fields correctly. - */ -static size_t -right_connector_list_update(prune_context *pc, Connector *c, - size_t w, bool shallow) -{ - size_t n, ub; - bool foundmatch; - Sentence sent = pc->sent; - - if (c == NULL) return w; - n = right_connector_list_update(pc, c->next, w, false) + 1; - if (c->nearest_word > n) n = c->nearest_word; - - /* ub is now the rightmost word we need to check */ - ub = w + c->length_limit; - if (ub > sent->length) ub = sent->length - 1; - - /* n is now the leftmost word we need to check */ - foundmatch = false; - for (; n <= ub ; n++) - { - pc->power_cost++; - if (left_table_search(pc, n, c, shallow, w)) - { - foundmatch = true; - break; - } - } - if (n > c->nearest_word) { - c->nearest_word = n; - pc->N_changed++; - } - return (foundmatch ? 
n : sent->length); -} - -/** The return value is the number of disjuncts deleted */ -int power_prune(Sentence sent, Parse_Options opts) -{ - power_table *pt; - prune_context *pc; - Disjunct *d, *free_later, *dx, *nd; - Connector *c; - size_t N_deleted, total_deleted; - size_t w; - - pc = (prune_context *) xalloc (sizeof(prune_context)); - pc->power_cost = 0; - pc->null_links = (opts->min_null_count > 0); - pc->N_changed = 1; /* forces it always to make at least two passes */ - - pc->sent = sent; - - pt = power_table_new(sent); - pc->pt = pt; - - free_later = NULL; - N_deleted = 0; - - total_deleted = 0; - - while (1) - { - /* left-to-right pass */ - for (w = 0; w < sent->length; w++) { - for (d = sent->word[w].d; d != NULL; d = d->next) { - if (d->left == NULL) continue; -#ifdef ALT_DISJUNCT_CONSISTENCY - pc->first_connector = d->left; -#endif - if (left_connector_list_update(pc, d->left, w, true) < 0) { - for (c=d->left; c != NULL; c = c->next) c->nearest_word = BAD_WORD; - for (c=d->right; c != NULL; c = c->next) c->nearest_word = BAD_WORD; - N_deleted++; - total_deleted++; - } - } - - clean_table(pt->r_table_size[w], pt->r_table[w]); - nd = NULL; - for (d = sent->word[w].d; d != NULL; d = dx) { - dx = d->next; - if ((d->left != NULL) && (d->left->nearest_word == BAD_WORD)) { - d->next = free_later; - free_later = d; - } else { - d->next = nd; - nd = d; - } - } - sent->word[w].d = nd; - } - if (verbosity_level(D_PRUNE)) - { - printf("l->r pass changed %d and deleted %zu\n", pc->N_changed, N_deleted); - } - - if (pc->N_changed == 0) break; - - pc->N_changed = N_deleted = 0; - /* right-to-left pass */ - - for (w = sent->length-1; w != (size_t) -1; w--) { - for (d = sent->word[w].d; d != NULL; d = d->next) { - if (d->right == NULL) continue; -#ifdef ALT_DISJUNCT_CONSISTENCY - pc->first_connector = d->right; -#endif - if (right_connector_list_update(pc, d->right, w, true) >= sent->length) { - for (c=d->right; c != NULL; c = c->next) c->nearest_word = BAD_WORD; - for 
(c=d->left; c != NULL; c = c->next) c->nearest_word = BAD_WORD; - N_deleted++; - total_deleted++; - } - } - clean_table(pt->l_table_size[w], pt->l_table[w]); - nd = NULL; - for (d = sent->word[w].d; d != NULL; d = dx) { - dx = d->next; - if ((d->right != NULL) && (d->right->nearest_word == BAD_WORD)) { - d->next = free_later; - free_later = d; - } else { - d->next = nd; - nd = d; - } - } - sent->word[w].d = nd; - } - - if (verbosity_level(D_PRUNE)) - { - printf("r->l pass changed %d and deleted %zu\n", - pc->N_changed, N_deleted); - } - - if (pc->N_changed == 0) break; - pc->N_changed = N_deleted = 0; - } - free_disjuncts(free_later); - power_table_delete(pt); - pt = NULL; - pc->pt = NULL; - - if (verbosity_level(D_PRUNE)) - printf("power prune cost: %d\n", pc->power_cost); - - print_time(opts, "power pruned"); - if (verbosity_level(D_PRUNE)) - { - printf("\nAfter power_pruning:\n"); - print_disjunct_counts(sent); - } - - xfree(pc, sizeof(prune_context)); - return total_deleted; -} - -/* =================================================================== - PP Pruning - - The "contains one" post-processing constraints give us a new way to - prune. Suppose there's a rule that says "a group that contains foo - must contain a bar or a baz." Here foo, bar, and baz are connector - types. foo is the trigger link, bar and baz are called the criterion - links. If, after considering the disjuncts we find that there is is - a foo, but neither a bar, nor a baz, then we can eliminte the disjuct - containing bar. - - Things are actually a bit more complex, because of the matching rules - and subscripts. The problem is that post-processing deals with link - names, while at this point all we have to work with is connector - names. Consider the foo part. Consider a connector C. When does - foo match C for our purposes? It matches it if every possible link - name L (that can result from C being at one end of that link) results - in post_process_match(foo,L) being true. 
Suppose foo contains a "*". - Then there is no C that has this property. This is because the *s in - C may be replaced by some other subscripts in the construction of L. - And the non-* chars in L will not post_process_match the * in foo. - - So let's assume that foo has no *. Now the result we want is simply - given by post_process_match(foo, C). Proof: L is the same as C but - with some *s replaced by some other letters. Since foo contains no * - the replacement in C of some * by some other letter could change - post_process_match from FALSE to TRUE, but not vice versa. Therefore - it's conservative to use this test. - - For the criterion parts, we need to determine if there is a - collection of connectors C1, C2,... such that by combining them you - can get a link name L that post_process_matches bar or baz. Here's a - way to do this. Say bar="Xabc". Then we see if there are connector - names that post_process_match "Xa##", "X#b#", and "X##c". They must - all be there in order for it to be possible to create a link name - "Xabc". A "*" in the criterion part is a little different. In this - case we can simply skip the * (treat it like an upper case letter) - for this purpose. So if bar="X*ab" then we look for "X*#b" and - "X*a#". (The * in this case could be treated the same as another - subscript without breaking it.) Note also that it's only necessary - to find a way to match one of the many criterion links that may be in - the rule. If none can match, then we can delete the disjunct - containing C. - - Here's how we're going to implement this. We'll maintain a multiset - of connector names. We'll represent them in a hash table, where the - hash function uses only the upper case letters of the connector name. - We'll insert all the connectors into the multiset. The multiset will - support the operation of deletion (probably simplest to just - decrement the count). Here's the algorithm. - - Insert all the connectors into M. 
- - While the previous pass caused a count to go to 0 do: - For each connector C do - For each rule R do - if C is a trigger for R and the criterion links - of the rule cannot be satisfied by the connectors in - M, Then: - We delete C's disjunct. But before we do, - we remove all the connectors of this disjunct - from the multiset. Keep tabs on whether or not - any of the counts went to 0. - - - - Efficiency hacks to be added later: - Note for a given rule can become less and less satisfiable. - That is, rule_satisfiable(r) for a given rule r can change from - TRUE to FALSE, but not vice versa. So once it's FALSE, we can just - remember that. - - Consider the effect of a pass p on the set of rules that are - satisfiable. Suppose this set does not change. Then pass p+1 - will do nothing. This is true even if pass p caused some - disjuncts to be deleted. (This observation will only obviate - the need for the last pass.) - - */ - -static multiset_table * cms_table_new(void) -{ - multiset_table *mt; - int i; - - mt = (multiset_table *) xalloc(sizeof(multiset_table)); - - for (i=0; icms_table[i] = NULL; - } - return mt; -} - -static void cms_table_delete(multiset_table *mt) -{ - Cms * cms, *xcms; - int i; - for (i=0; icms_table[i]; cms != NULL; cms = xcms) - { - xcms = cms->next; - xfree(cms, sizeof(Cms)); - } - } - xfree(mt, sizeof(multiset_table)); -} - -static unsigned int cms_hash(const char * s) -{ - unsigned int i = 5381; - if (islower((int) *s)) s++; /* skip head-dependent indicator */ - while (isupper((int) *s)) - { - i = ((i << 5) + i) + *s; - s++; - } - return (i & (CMS_SIZE-1)); -} - -/** - * This returns TRUE if there is a connector name C in the table - * such that post_process_match(pp_match_name, C) is TRUE - */ -static bool match_in_cms_table(multiset_table *cmt, const char * pp_match_name) -{ - Cms * cms; - for (cms = cmt->cms_table[cms_hash(pp_match_name)]; cms != NULL; cms = cms->next) - { - if (post_process_match(pp_match_name, cms->name)) return 
true; - } - return false; -} - -static Cms * lookup_in_cms_table(multiset_table *cmt, const char * str) -{ - Cms * cms; - for (cms = cmt->cms_table[cms_hash(str)]; cms != NULL; cms = cms->next) - { - if (string_set_cmp(str, cms->name)) return cms; - } - return NULL; -} - -static void insert_in_cms_table(multiset_table *cmt, const char * str) -{ - Cms * cms; - unsigned int h; - cms = lookup_in_cms_table(cmt, str); - if (cms != NULL) { - cms->count++; - } else { - cms = (Cms *) xalloc(sizeof(Cms)); - cms->name = str; /* don't copy the string...just keep a pointer to it. - we won't free these later */ - cms->count = 1; - h = cms_hash(str); - cms->next = cmt->cms_table[h]; - cmt->cms_table[h] = cms; - } -} - -/** - * Delete the given string from the table. Return TRUE if - * this caused a count to go to 0, return FALSE otherwise. - */ -static bool delete_from_cms_table(multiset_table *cmt, const char * str) -{ - Cms * cms = lookup_in_cms_table(cmt, str); - if (cms != NULL && cms->count > 0) - { - cms->count--; - return (cms->count == 0); - } - return false; -} - -static bool rule_satisfiable(multiset_table *cmt, pp_linkset *ls) -{ - unsigned int hashval; - const char * t; - char name[20], *s; - pp_linkset_node *p; - int bad, n_subscripts; - - for (hashval = 0; hashval < ls->hash_table_size; hashval++) - { - for (p = ls->hash_table[hashval]; p!=NULL; p=p->next) - { - /* ok, we've got our hands on one of the criterion links */ - strncpy(name, p->str, sizeof(name)-1); - /* could actually use the string in place because we change it back */ - name[sizeof(name)-1] = '\0'; - /* now we want to see if we can satisfy this criterion link */ - /* with a collection of the links in the cms table */ - - s = name; - if (islower((int)*s)) s++; /* skip head-dependent indicator */ - for (; isupper((int)*s); s++) {} - for (;*s != '\0'; s++) if (*s != '*') *s = '#'; - - s = name; - t = p->str; - if (islower((int)*s)) s++; /* skip head-dependent indicator */ - if (islower((int)*t)) t++; /* 
skip head-dependent indicator */ - for (; isupper((int) *s); s++, t++) {} - - /* s and t remain in lockstep */ - bad = 0; - n_subscripts = 0; - for (;*s != '\0' && bad==0; s++, t++) { - if (*s == '*') continue; - n_subscripts++; - /* after the upper case part, and is not a * so must be a regular subscript */ - *s = *t; - if (!match_in_cms_table(cmt, name)) bad++; - *s = '#'; - } - - if (n_subscripts == 0) { - /* now we handle the special case which occurs if there - were 0 subscripts */ - if (!match_in_cms_table(cmt, name)) bad++; - } - - /* now if bad==0 this criterion link does the job - to satisfy the needs of the trigger link */ - - if (bad == 0) return true; - } - } - return false; -} - -static int pp_prune(Sentence sent, Parse_Options opts) -{ - pp_knowledge * knowledge; - size_t i, w; - int total_deleted, N_deleted; - bool change, deleteme; - multiset_table *cmt; - - if (sent->postprocessor == NULL) return 0; - if (!opts->perform_pp_prune) return 0; - - knowledge = sent->postprocessor->knowledge; - - cmt = cms_table_new(); - - for (w = 0; w < sent->length; w++) - { - Disjunct *d; - for (d = sent->word[w].d; d != NULL; d = d->next) - { - char dir; - d->marked = true; - for (dir=0; dir < 2; dir++) - { - Connector *c; - for (c = ((dir) ? (d->left) : (d->right)); c != NULL; c = c->next) - { - insert_in_cms_table(cmt, c->string); - } - } - } - } - - total_deleted = 0; - change = true; - while (change) - { - change = false; - N_deleted = 0; - for (w = 0; w < sent->length; w++) - { - Disjunct *d; - for (d = sent->word[w].d; d != NULL; d = d->next) - { - char dir; - if (!d->marked) continue; - deleteme = false; - for (dir = 0; dir < 2; dir++) - { - Connector *c; - for (c = ((dir) ? 
(d->left) : (d->right)); c != NULL; c = c->next) - { - for (i = 0; i < knowledge->n_contains_one_rules; i++) - { - pp_rule* rule = &knowledge->contains_one_rules[i]; /* the ith rule */ - const char * selector = rule->selector; /* selector string for this rule */ - pp_linkset * link_set = rule->link_set; /* the set of criterion links */ - - if (strchr(selector, '*') != NULL) continue; /* If it has a * forget it */ - - if (!post_process_match(selector, c->string)) continue; - - /* - printf("pp_prune: trigger ok. selector = %s c->string = %s\n", selector, c->string); - */ - - /* We know c matches the trigger link of the rule. */ - /* Now check the criterion links */ - - if (!rule_satisfiable(cmt, link_set)) - { - deleteme = true; - rule->use_count++; - } - if (deleteme) break; - } - if (deleteme) break; - } - if (deleteme) break; - } - - if (deleteme) /* now we delete this disjunct */ - { - char dir; - N_deleted++; - total_deleted++; - d->marked = false; /* mark for deletion later */ - for (dir=0; dir < 2; dir++) - { - Connector *c; - for (c = ((dir) ? (d->left) : (d->right)); c != NULL; c = c->next) - { - change |= delete_from_cms_table(cmt, c->string); - } - } - } - } - } - - if (verbosity_level(D_PRUNE)) - printf("pp_prune pass deleted %d\n", N_deleted); - } - delete_unmarked_disjuncts(sent); - cms_table_delete(cmt); - - if (verbosity_level(D_PRUNE)) - { - printf("\nAfter pp_pruning:\n"); - print_disjunct_counts(sent); - } - - print_time(opts, "pp pruning"); - - return total_deleted; -} - - -/** - * Do the following pruning steps until nothing happens: - * power pp power pp power pp.... - * Make sure you do them both at least once. - */ -void pp_and_power_prune(Sentence sent, Parse_Options opts) -{ - power_prune(sent, opts); - pp_prune(sent, opts); - - return; - - // Not reached. We can actually gain a few percent of - // performance be skipping the loop below. Mostly, it just - // does a lot of work, and pretty much finds nothing. - // And so we skip it. 
-#ifdef ONLY_IF_YOU_THINK_THIS_IS_WORTH_IT - for (;;) { - if (pp_prune(sent, opts) == 0) break; - if (power_prune(sent, opts) == 0) break; - } -#endif -} diff -Nru link-grammar-5.3.16/link-grammar/prune.h link-grammar-5.5.0/link-grammar/prune.h --- link-grammar-5.3.16/link-grammar/prune.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/prune.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,22 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _PRUNE_H -#define _PRUNE_H - -#include "api-types.h" -#include "link-includes.h" - -int power_prune(Sentence, Parse_Options); -void pp_and_power_prune(Sentence, Parse_Options); -void expression_prune(Sentence); -#endif /* _PRUNE_H */ diff -Nru link-grammar-5.3.16/link-grammar/README.md link-grammar-5.5.0/link-grammar/README.md --- link-grammar-5.3.16/link-grammar/README.md 2017-03-14 05:43:05.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/README.md 2017-07-15 21:16:13.000000000 +0000 @@ -1,17 +1,32 @@ Directories =========== +Listed in rough processing order. -The minisat and sat-solver directories contain code for the Boolean-SAT -parser. + * dict-common - generic dictionary-handling code. + * dict-file - read dictionaries from files. + * dict-sql - read dictionaries from an SQL DB + (unfinished, under development!). -The corpus directory contains code to read word-sense disambiguation -data from an SQL file. 
+ * tokenize - Convert sentence strings into word sequences. + Actually, into a word-graph of possible words. -The dict-file directory contains code to read dictionaries from files. + * prepare - After tokenization, prepare sentence for parsing. -The dict-sql directory contains code to read dictionaries from an SQL DB - (unfinished, under development!). + * parse - Classic Sleator/Temperley/Lafferty parser. + + * minisat - Copy of the mini-SAT Boolean-SAT solver. + * sat-solver - Alternate parser, using boolean-SAT. + + * linkage - Construction and analysis of linkages from parsing. + + * post-process - Constraint rules applied after parsing. + Also, rules for constituent-tree construction. + + * corpus - word-sense disambiguation data from an SQL file. + + * print - functions that convert parses into + human-readable strings and printed output. Version 5.3.14 - Improved error notification facility @@ -98,6 +113,11 @@ This allows, for example, constructing a single message using a loop or conditionals. +7) `bool lg_error_flush(void);` +If a partial error message is buffered, flush it by adding a "\n" to it. +If no error message is buffered - do nothing. +Return **true** iff there was a message to flush. + See [link-includes.h](link-includes.h) for the definition of severity levels and the `lg_errinfo` structure. @@ -126,383 +146,3 @@ `class LG_Error` is also used as a general exception. See [tests.py](../bindings/python-examples/tests.py) for usage of all of these bindings. - - -Version 5.3.0 - Introduction of a word-graph for tokenizing -=========================================================== - -In this version the tokenizer code has been heavily modified once again. -Instead of tokenizing all the words in one pass directly into the -word-array of the parser, it now tokenizes them using a graph which its -nodes are "subwords". Initially the original sentence words are tokenized -to subwords only by whitespace. 
After that step, each subword is handled in -its turn, and gets split to further subwords if needed. A special care is -taken if one of the alternatives of a subword is the subword itself -(alone, which is usual, or with a dict-cap token - a new experimental -mechanism which is described below). - -The new way of tokenizing is much more flexible that the old one, and it -serves as an infrastructure on which new tokenizing and display features -can be implemented. One of them, that is implemented in this release, is -spelling for languages other then English. This is now possible because the -spell result can now pass further tokenization if needed. This also -enhances the spelling for English in case the spell result is a -contraction. In addition, the multi-level of tokenization, inherent to the -word-graph, allows multiple run-on and spell-correction fixes on the same -word at once. - -The tokenizing code is still based much on the old code and further -work is needed to clean it up (or to replace it, e.g. by a -regex-tokenizer). It still doesn't use the full power of the word-graph, -and there are constructs that need to be tokenized but they are not (they -are also not in the sentence test batches). E.g. `--` between words without -whitespace. - -There is still no API to get information from the word-graph. In particular, -it is not possible to find out the sentence words after punctuation -tokenization, as in previous releases. - -Since the parser cannot currently use the word-graph directly, there is a -need to convert it to the 2D-word-array that it uses. This is implemented -by the function `flatten_wordgraph()`, which uses a complex algorithm: It -scans all the word-graph paths in parallel, trying to advance to the next -words and to issue them into the 2D-word-array. - -It advances to the next word of a given word in rounds, using two passes, one -word per word-graph path on each round: -Pass 1. 
Next words which are in the same alternative of the given word. -Pass 2. Next words which are in a different alternative (of the common -ancestor word) of words that has already been advanced to in pass 1 -and this pass. - -The words that got advanced to are issued into the 2D-word-array. It is -possible that the second pass above cannot get advance in particular -word-graph path, because the next word is in the same alternative of one of -the next words in other paths. In that case an empty word is issued. This -constraint arises because all the next words in each word-graph advance -round, are issued into the same 2D-word-array "slot", which cannot hold -words from the same alternative. - -As in the previous versions, due to the way alternatives are implemented, -morphemes from different word-tokenization alternatives can be mixed in a -linkage. Such linkages are of course useless and need to be detected and -discarded. This is done by the `sane_linkage_morphism()` function. In the -previous versions it validated that the chosen words (the words in the -linkage) that are subwords (e.g. morphemes) of a sentence word, all belong -to a single tokenization alternative of that word. -It now works in another way - it validates that the chosen words create a -path in the word-graph. In case of "null-words" - words with no linkage - -the first path which is encountered is used. It means that a word in the -word-graph path corresponding to a null-word, may be only one of the potential -possibilities. - -Word-graph display ------------------- -Another feature that has been implemented, mainly for debug (but it can -also be useful for inspiration and fun), is displaying a graphical -representation of the word graph. The graphs can be displayed in several -ways, controlled by one-letter flags. The command `!test=wg` enables the -display of a graphs which includes no "prev" links for clarity, but -includes "unsplit word" links. 
Other graphical representation modes can be -enabled by `!test=wg:FLAGS`, when FLAGS are lowercase letters as defined in -wordgraph.h. For example, `!test=wg:sl` displays unsplit words as subgraphs, -with a legend, and adding the `p` flag (i.e. `!test=wg:slp`) adds "prev" links. -The graphical display still needs improvements in order to be able to -display complex word-graph situations in a simple manner. The graphical -display code is not compiled by default because it involves invocation of -an external program (`dot`) and in addition, files are created, both things -may not be desired by some users. Use `--enable-wordgraph-display` to enable -this feature. - -On Windows this feature is enabled when compiled with `USE_WORDGRAPH_DISPLAY`. -See "../msvcNN/RDADME" (NN is the MSVC version) for further details. - -Quote handling --------------- -Quotes now are not discarded, but are considered to be regular dict tokens. -In this version they have no significant linkage and always attach to the word -before them (or to the LEFT-WALL). In order to compare detailed batch runs with -previous versions of the library, a `!test=removeZZZ` can be used to remove the -quote display. - -Handling capitalized words --------------------------- -Not as in previous releases, capital letters which got downcased are not -restored for display if the affected words have a linkage. - -A new experimental handling of capital words using the dictionary has been -introduced. It inserts the token `1stCAP` before the uc version, and `nonCAP` -before the lc one, as discussed in: -https://groups.google.com/forum/?hl=en#!topic/link-grammar/hmK5gjXYWbk -It is enabled by `!test=dictcap`. The special "dictcap" tokens are not yet -discarded, so in order to compare results to previous library versions, the -following can be used: `!test=dictcap,removeZZZ`. - - -HOWTO use the new regex tokenizer/splitter -========================================== -It's new, experimental code. 
- -To compile: `../configure --enable-regex-tokenizer` - - - -- At the linkparser> prompt, enter: -`!/REGEX/,tokentosplit` - -Currently, if tokentosplit contains white space, command-line.c discards -it. -Also, case is currently observed. - -The regex syntax is designed so the regex is a valid one (although -meaningless) as written, so compiling it would reveal syntax errors in -it (the result of this initial compilation is not used). - -- All the /regexes/ are anchored at their start and end, as if `^` and `$` - were used. -- Mentioning a word class (x is an optional constraint, defaults to - `.*`): - -`(?x)` - -CLASS can be: - * DICTWORD, to match a word from `4.0.dict`. - * An affix class name (takes priority if there is a regex with the same - name). - * A regex name from `4.0.regex` (prefix it with `r` if there is such an - affix class). - -For regexes from `4.0.regex`, the code combine the ones with the same -name, taking care to omit the `^` and `$` from each, if exist (constraints -are said to be supported (not tested) and can be added if needed, but I -could not find an example of anything useful). - -DICTWORD can be optionally followed by a word mark, which is taken from -the affix file: - - * DICTWORDaM append M to DICTWORD before looking it up. - * DICTWORDpM prepend M to DICTWORD before looking it up. - -If M contains more than one word (in the affix file), only the first one -is currently used. - - -Examples: - * `(?)` match a suffix from the affix file - * `(?)` match the regex `NUMBER`. - * `(?)` match `UNITS` from the affix file. - * `(?)` match `UNITS` from the regex file. - * `(?)` match a dictionary word. - * `)` match word.= (if `STEMSUBSCR` is ^C=). - * `x)` ): -Matching single letters by DISTWORD (because they are in the dict) may -note be desired. -In such a case _x_ can be constrained to include 2 letters at least, plus -the desired 1-letter words. -E.g.: `(?.{2,}|a)` , which matches words of 2 letters and more, -plus the word `a`. 
- -- Currently the outer part of the regex should not contain alternations. - This is because I was too lazy to add code for adding `(?:...)` over it -in such cases. So in order to contain alternations the `(?:...)` should -currently be added by hand, as in: - -`/(?:(?)|(?))*/,dfs,dsfadsdsa,.?!sfads` - -- Holes are not supported. For example, this is not fine (and not - tested): - -`/(?)-(?)/,khasdkflhdsfa` - -because the `-` character would create a hole in the result. -But this is fine (and also not tested...): - -`/(?)(-)(?)/,asdfkjajfahlad` - -Currently, since named capturing groups are used for classes, if the same -class name is used more than once, there may be a need to start the regex -by `(?J)`. This will be fixed later. - -- The regex cannot include nested capture groups, so inner groups, if - needed, should be non-capturing ones. - -This is because currently the matching groups create a linear string, -without holes. -If you will find a use for internal capture groups, I can use them. -Because of that, backreferences in regexes from the regex file are not -supported (but there are currently none...). - -So this is not valid (a DICTWORD which matches a `NUMBER`): - -`/(?))/,qazwsx` - -and this too (a nonsense constraint for demo): - -`/(?([A-Z][0-9])*)/,qazwsx` - -but this should be fine: - -`/(?(?:[A-Z][0-9])*)/,qazwsx` - - -Some fun examples: - -``` -!/(.*)*/,test -Modified pattern: (?:(.*)(?C))*$(?C1) -Alternative 1: - 0 (1): test (0,4) - 1 (1): (4,4) -Alternative 2: - 0 (1): test (0,4) -Alternative 3: - 0 (1): tes (0,3) - 1 (1): t (3,4) - 2 (1): (4,4) -Alternative 4: - 0 (1): tes (0,3) - 1 (1): t (3,4) -Alternative 5: - 0 (1): te (0,2) - 1 (1): st (2,4) - 2 (1): (4,4) -[...] 
-Alternative 14: - 0 (1): t (0,1) - 1 (1): e (1,2) - 2 (1): st (2,4) -Alternative 15: - 0 (1): t (0,1) - 1 (1): e (1,2) - 2 (1): s (2,3) - 3 (1): t (3,4) - 4 (1): (4,4) -Alternative 16: - 0 (1): t (0,1) - 1 (1): e (1,2) - 2 (1): s (2,3) - 3 (1): t (3,4) -``` - -(Some appear "twice" due to the terminating null match. I think I will -discard such matches.). - -With splits to 2 parts only: -``` -linkparser> !/(.*){2}/,test -Modified pattern: (?:(.*)(?C)){2}$(?C1) -Alternative 1: - 0 (1): test (0,4) - 1 (1): (4,4) -Alternative 2: - 0 (1): tes (0,3) - 1 (1): t (3,4) -Alternative 3: - 0 (1): te (0,2) - 1 (1): st (2,4) -Alternative 4: - 0 (1): t (0,1) - 1 (1): est (1,4) -Alternative 5: - 0 (1): test (0,4) -linkparser> -``` - -``` -!/(?:(?.{2,}|a)(?)?)+/,theriver,dangeroustonavigatebutimportantforcommerce,hasmanyshoals. -``` -(This is one long line, just test it...) - - -`!/(?)(?)*(?)*/,123.2milligram/terag/?!`
-(test it...) - -``` -!/(?)(?)/,there's -Modified pattern: (?:(?.*)(?C))(?:(?.*)(?C))$(?C1) -Alternative 1: - 0 (1): there (0,5) [DICTWORD] - 1 (2): 's (5,7) [SUF] -linkparser> -``` - - -In the next example, we get only whole word and double-dash because -it can only match wpwp (when w is DICTWORD and p is `--`). - -``` -!/(?:(?)(?))+/,this--is-- -Modified pattern: (?:(?:(?.*)(?C))(?:(?.*)(?C)))+$(?C1) -Alternative 1: - 0 (1): this (0,4) [DICTWORD] - 1 (2): -- (4,6) [LPUNC] - 2 (1): is (6,8) [DICTWORD] - 3 (2): -- (8,10) [LPUNC] - 4 (1): (10,10) [DICTWORD] -linkparser> -``` - -However, this breaks to single characters, as expected: -``` -!/(?:(?)(?:(?))*)+/,this--is-- -... -Alternative 360: - 0 (1): t (0,1) [DICTWORD] - 1 (1): h (1,2) [DICTWORD] - 2 (1): i (2,3) [DICTWORD] - 3 (1): s (3,4) [DICTWORD] - 4 (1): - (4,5) [DICTWORD] - 5 (1): - (5,6) [DICTWORD] - 6 (1): i (6,7) [DICTWORD] - 7 (1): s (7,8) [DICTWORD] - 8 (1): - (8,9) [DICTWORD] - 9 (1): - (9,10) [DICTWORD] -10 (1): (10,10) [DICTWORD] -linkparser> -``` - -But this stops after the first match: -``` -!/(?:(?)(?:(?)(*COMMIT))*)+/,this--is-- -Alternative 1: - 0 (1): this (0,4) [DICTWORD] - 1 (2): -- (4,6) [LPUNC] - 2 (1): is (6,8) [DICTWORD] - 3 (2): -- (8,10) [LPUNC] - 4 (1): (10,10) [DICTWORD] -linkparser> -```` - -And this is even more interesting: -``` -!/(?:(?)(*COMMIT)(?:(?))*)+/,this--is-- -Alternative 1: - 0 (1): this (0,4) [DICTWORD] - 1 (2): -- (4,6) [LPUNC] - 2 (1): is (6,8) [DICTWORD] - 3 (2): -- (8,10) [LPUNC] - 4 (1): (10,10) [DICTWORD] -Alternative 2: - 0 (1): this (0,4) [DICTWORD] - 1 (2): -- (4,6) [LPUNC] - 2 (1): is (6,8) [DICTWORD] - 3 (2): - (8,9) [LPUNC] - 4 (2): - (9,10) [LPUNC] - 5 (1): (10,10) [DICTWORD] -Alternative 3: - 0 (1): this (0,4) [DICTWORD] - 1 (2): -- (4,6) [LPUNC] - 2 (1): is (6,8) [DICTWORD] - 3 (2): - (8,9) [LPUNC] - 4 (1): - (9,10) [DICTWORD] - 5 (1): (10,10) [DICTWORD] -linkparser> -``` - -It seems as if conditional matching using (?(condition)yes-pattern|no-pattern) -or 
`(*THEN)` can do some fun things, but I don't have useful examples yet. - -The question is how to use this code for tokenization. I have some -ideas, more on that later. diff -Nru link-grammar-5.3.16/link-grammar/regex-morph.c link-grammar-5.5.0/link-grammar/regex-morph.c --- link-grammar-5.3.16/link-grammar/regex-morph.c 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/regex-morph.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,154 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2005 Sampo Pyysalo */ -/* Copyright (c) 2009 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -/* On MS Windows, regex.h fails to pull in size_t, so work around this by - * including before ( is not enough) */ -#include -#include -#include "api-structures.h" -#include "error.h" /* verbosity */ -#include "externs.h" /* lgdebug() */ -#include "dict-api.h" -#include "link-includes.h" -#include "regex-morph.h" -#include "structures.h" - - -/** - * Support for the regular-expression based token matching system - * using standard POSIX regex. - */ - -/** - * Notify an error message according to the error code. 
- */ -static void prt_regerror(const char *msg, const Regex_node *re, int rc) -{ - const size_t errbuf_size = regerror(rc, re->re, NULL, 0); - char * const errbuf = malloc(errbuf_size); - - /* - prt_error("Error: Failed to compile regex '%s' (%s) at %d: %s\n", - re->pattern, re->name, erroroffset, error); - */ - regerror(rc, re->re, errbuf, errbuf_size); - prt_error("Error: %s: \"%s\" (%s): %s\n", msg, re->pattern, re->name, errbuf); - free(errbuf); -} - -/** - * Compiles all the given regexs. Returns 0 on success, - * else an error code. - */ -int compile_regexs(Regex_node *re, Dictionary dict) -{ - regex_t *preg; - int rc; - - while (re != NULL) - { - /* If re->re non-null, assume compiled already. */ - if(re->re == NULL) - { - /* Compile with default options (0) and default character - * tables (NULL). */ - /* re->re = pcre_compile(re->pattern, 0, &error, &erroroffset, NULL); */ - preg = (regex_t *) malloc (sizeof(regex_t)); - re->re = preg; - - /* REG_ENHANCED is needed for OS X to support \w etc. */ -#ifndef REG_ENHANCED -#define REG_ENHANCED 0 -#endif - rc = regcomp(preg, re->pattern, REG_EXTENDED|REG_ENHANCED); - if (rc) - { - prt_regerror("Failed to compile regex", re, rc); - return rc; - } - - /* Check that the regex name is defined in the dictionary. */ - if ((NULL != dict) && !boolean_dictionary_lookup(dict, re->name)) - { - /* TODO: better error handing. Maybe remove the regex? */ - prt_error("Error: Regex name %s not found in dictionary!\n", - re->name); - } - } - re = re->next; - } - return 0; -} - -/** - * Tries to match each regex in turn to word s. - * On match, returns the name of the first matching regex. - * If no match is found, returns NULL. - */ -#define D_MRE 6 -const char *match_regex(const Regex_node *re, const char *s) -{ - int rc; - const char *nre_name; - - while (re != NULL) - { - /* Make sure the regex has been compiled. 
*/ - assert(re->re); - -#if 0 - /* Try to match with no extra data (NULL), whole str - * (0 to strlen(s)), and default options (second 0). */ - int rc = pcre_exec(re->re, NULL, s, strlen(s), 0, - 0, ovector, PCRE_OVEC_SIZE); -#endif - - rc = regexec((regex_t*) re->re, s, 0, NULL, 0); - if (0 == rc) - { - lgdebug(+D_MRE, "%s%s %s\n", &"!"[!re->neg], re->name, s); - if (!re->neg) - return re->name; /* Match found - return--no multiple matches. */ - - /* Negative match - skip this regex name. */ - for (nre_name = re->name; re->next != NULL; re = re->next) - { - if (strcmp(nre_name, re->next->name) != 0) break; - } - } - else if (rc != REG_NOMATCH) - { - /* We have an error. */ - prt_regerror("Regex matching error", re, rc); - } - re = re->next; - } - return NULL; /* No matches. */ -} -#undef D_MRE - -/** - * Delete associated storage - */ -void free_regexs(Regex_node *re) -{ - while (re != NULL) - { - Regex_node *next = re->next; - regfree((regex_t *)re->re); - free(re->re); - free(re->name); - free(re->pattern); - free(re); - re = next; - } -} diff -Nru link-grammar-5.3.16/link-grammar/regex-morph.h link-grammar-5.5.0/link-grammar/regex-morph.h --- link-grammar-5.3.16/link-grammar/regex-morph.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/regex-morph.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,20 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2005 Sampo Pyysalo */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _REGEX_MORPH_H -#define _REGEX_MORPH_H - -#include "api-structures.h" - -int compile_regexs(Regex_node *, Dictionary); -const char *match_regex(const Regex_node *, const char *); -void free_regexs(Regex_node *); -#endif /* _REGEX_MORPH_H */ diff -Nru link-grammar-5.3.16/link-grammar/regex-tokenizer.c link-grammar-5.5.0/link-grammar/regex-tokenizer.c --- link-grammar-5.3.16/link-grammar/regex-tokenizer.c 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/regex-tokenizer.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1114 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2014 Amir Plivatsky */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -/* FIXME: Fold long lines. */ - -#ifdef USE_REGEX_TOKENIZER - -#include -#include -#include -#include -#include -#include -#include - -#include "regex-tokenizer.h" -#include "dict-api.h" -#include "dict-common.h" -#include "error.h" -#include "regex-morph.h" -#include "structures.h" -#include "tokenize.h" -#include "utilities.h" -#include "word-utils.h" -#include "dict-file/read-dict.h" - -#include "pcre.h" - -/* Tokenizer flags. */ -#define MARK_TOKENS 0x1 /* stem/affix marks in the result tokens (not implemented) */ - -/* Debug signature for the initial end of sub-pattern 0 (subp[0].e). 
If we find - * it while printing an alternative, it means we have a logic failure, because - * we should not reach the end of the pattern without a match which assigns - * there the matched sub-pattern string end. */ -#define SUBP0END_DEBUG_SIGNATURE -2 - -//extern const char const * afdict_classname[]; - -typedef enum -{ - CALLBACK_REP, - CALLBACK_END, - CALLBACK_CONSTANT_START, /* UNUSED */ - CALLBACK_CONSTANT_END /* UNUSED */ -} callback_num; - -typedef struct ov -{ - int s; - int e; -} ov; - -#ifdef REGEX_TOKENIZER_CACHE -/* TODO */ -/* Match cache bit vector. */ -typedef struct bitvec -{ - int len; /* current vector length, in bytes */ - char *vec; - int get; /* cache get counter */ - int set; /* cache set counter */ -} bitvec; -#endif - -/* info per capture group number */ -typedef struct cgnum -{ - Dictionary dict; /* dictionary to use */ - const char *afclass; /* affix class, or NULL for main dict */ - const char *lookup_mark; /* potential stem or infix marks */ - char lookup_mark_pos; /* "+" (append) or "-" (prepend) */ - const char *name; /* currently only for result tagging printout */ -#ifdef REGEX_TOKENIZER_CACHE /* TODO */ - bitvec *mcache; /* substring match cache */ -#endif - /* FIXME: Maybe add formatting function for SUF, PRE, STEM */ -} cgnum; - -#define MAX_SUBP 100 -typedef struct callout_data -{ - int function; /* callout function multiplexing */ - const char *pattern; - int test; - ov subp[MAX_SUBP]; /* sub-pattern array */ - int capture_level[MAX_SUBP]; - int subp_i; /* current sub-pattern index */ - bool subp_ovfl; /* subp array overflow */ - int capture_last; /* UNUSED */ - const char ***wordlist; - cgnum **cgnum; - // bool is_constant; /* a constant alternation - don't lookup (FIXME. UNUSED)*/ - int alt_counter; /* counter for number of alternatives */ -} callout_data; - -/** - * Get a regex (of 4.0.regex) by name. - * Replace all capturing groups by non-capturing ones, since the invoking - * function cannot currently handle them. 
Hence back references are not - * supported. This can be fixed if needed. - * - * If a regex name appears multiple times, concatenate them using an alternation - * bar. Remove anchors ^ and $ if exist (suppose they can only appear at the - * start and end of the regex, as currently in 4.0.regex). - */ -static char *get_regex_by_name(Dictionary const dict, const char * const name) -{ - dyn_str * const pat = dyn_str_new(); - char *result = NULL; - Regex_node *re = dict->regex_root; - const char *p; - - while (NULL != re) - { - if (0 == strcmp(re->name, name)) - { - /* re analyze state */ - bool insqb = false; /* in square brackets */ - bool qn = false; /* quote next character */ - - p = re->pattern; - if ('\0' != pat->str[0]) dyn_strcat(pat, "|"); - if ('^' == *p) p++; - - /* Change groups in POSIX regex to PCRE non-capturing groups. - * FIXME: Add support for PCRE syntax, - * especially, skip (?...) and (*...). - * The following code supports backslash and square brackets. - * It supposes the regex is valid. */ - for (; '\0' != *p; p++) - { - char c0[2] = "\0\0"; - - if (qn) - { - qn = false; - } - else - { - switch (*p) - { - case '\\': - qn = true; - break; - case '[': - insqb = true; - break; - case ']': - if (p > re->pattern && '[' == p[-1]) break; - insqb = false; - break; - case '(': - if (insqb) break; - dyn_strcat(pat, "(?:"); - continue; - } - } - if ('$' != *p || '\0' != p[1]) - { - c0[0] = *p; - dyn_strcat(pat, c0); - } - } - } - re = re->next; - } - - if ('\0' != pat->str[0]) result = strdup(pat->str); - dyn_str_delete(pat); - return result; -} - -static void printov(const char *str, ov *ov, int top, callout_data *cd, bool is_pcreov) -{ - int i; - const cgnum *cgnump = NULL; - - for (i = 0; i < top; i++) - { - printf("%2d", i); - if (!is_pcreov && (NULL != cd) && (NULL != cd->capture_level)) - printf(" (%d)", (ov[i].e < 0) ? 
0 : cd->capture_level[i]); - printf(": "); - if (ov[i].s < 0) - { - printf(" "); - } else - { - if (ov[i].e < 0) - printf(" END<0 (%d,%d)", ov[i].s, ov[i].e); - else - printf(" %.*s (%d,%d)", ov[i].e - ov[i].s, str + ov[i].s, ov[i].s, ov[i].e); - } - - /* Find the tokenizer capture group info for the current OV element: - * - For PCRE OV, use its index (if > 0) as capture group. - * - For the tokenizer OV, use the recorded capture level. - * Since the cgnum array is 0-based and the first parenthesized capture - * group is 1, subtract 1 to get the actual index. */ - if ((NULL != cd) && (NULL != cd->capture_level) && (NULL != cd->cgnum) && - (!is_pcreov || (i > 0)) && ov[i].e >= 0) - cgnump = cd->cgnum[(is_pcreov ? i : cd->capture_level[i]) - 1]; - - if (NULL != cgnump) - { - const char *a = "", *p = ""; - char lookup_mark[10]; - char *sm; - - if (NULL != cgnump->lookup_mark) - { - if ('a' == cgnump->lookup_mark_pos) - { - safe_strcpy(lookup_mark, cgnump->lookup_mark, sizeof(lookup_mark)); - sm = strrchr(lookup_mark, SUBSCRIPT_MARK); - if (NULL != sm) *sm = '.'; - a = lookup_mark; - } - else - { - p = cgnump->lookup_mark; - } - } - printf(" [%s%s%s]", p, cgnump->name, a); - } - - printf("\n"); - } -} - -/** - * Compare a portion of the tokenized string, starting at word_stat with length - * of numchar, to the dictionary or affix class word that is defined in the - * capture group whose info is pointed to by cgnump. - * - * FIXME: Return int instead of bool, see the comment at E1 below. - */ -static bool is_word(const char *word_start, int numchar, cgnum *cgnump) -{ - Dictionary const dict = cgnump->dict; - const char * const afclass = cgnump->afclass; - const int lookup_mark_len = - (NULL != cgnump->lookup_mark) ? strlen(cgnump->lookup_mark) : 0; - char * const word = alloca(numchar+lookup_mark_len+1); -#ifdef AFFIX_DICTIONARY_TREE - const Dict_node *dn; -#endif - const Afdict_class *ac; - size_t i; - - /* Append/prepend stem/infix marks. 
*/ - if (NULL == cgnump->lookup_mark) - { - strncpy(word, word_start, numchar); - word[numchar] = '\0'; - } - else - { - switch (cgnump->lookup_mark_pos) - { - case 'p': /* prepend a mark */ - strcpy(word, cgnump->lookup_mark); - strncat(word, word_start, numchar); - word[numchar+lookup_mark_len] = '\0'; - break; - case 'a': /* append a mark */ - strncpy(word, word_start, numchar); - strcpy(word+numchar, cgnump->lookup_mark); - break; - default: - printf("is_word:E3('%x' %s)", cgnump->lookup_mark_pos, cgnump->lookup_mark); - strncpy(word, word_start, numchar); - word[numchar] = '\0'; - } - } - - lgdebug(7, "LOOKUP '%s' in %s: ", word, dict->name); - if (0 == afclass) return boolean_dictionary_lookup(dict, word); - - /* We don't have for now a tree representation of the affix file, only lists */ -#ifdef AFFIX_DICTIONARY_TREE - dn = lookup_list(dict, word); - printf("WORD %s afclass %s dn %p\n", word, afclass, dn); - if (NULL == dn) return false; - - for (; NULL != dn; dn = dn->left) - { - const char *con = word_only_connector(dn); - if (NULL == con) - { - /* Internal error - nothing else to do for now unless we don't - * rerun bool, but return an int so -1 signifies an error. */ - printf("is_word(%s):E1 ", word); - } - printf("CON '%s'\n", con); - if (0 == strcmp(afclass, con)) return true; - } -#else - /* Make it the hard way. */ - ac = afdict_find(dict, afclass, /*notify_err*/false); - if (NULL == ac) - { - /* Internal error - nothing else to do for now unless we don't - * rerun bool, but return an int so -1 signifies an error. 
*/ - printf("is_word(%s):E2 ", word); - } - - for (i = 0; i < ac->length; i++) - { - if (0 == strcmp(ac->string[i], word)) return true; - } -#endif - - return false; -} - -static int callout(pcre_callout_block *cb) -{ - callout_data *cd = cb->callout_data; - ov *cb_ov = (ov *)&cb->offset_vector[2*cb->capture_last]; - -#if 0 - const char **wordlist = NULL; -#endif - cgnum *cgnum = NULL; - const char *openp; - const char *endname; - bool subp_updated = false; - - if ((NULL != cd->cgnum) && (-1 != cb->capture_last)) - { - cgnum = cd->cgnum[cb->capture_last-1]; - } - lgdebug(6, "Callout %d: capture_last %d cgnum %p\n", - cb->callout_number, cb->capture_last, cgnum); - - if (verbosity >= 6) - printov(cb->subject, (ov *)cb->offset_vector, cb->capture_top, cd, /*is_pcreov*/true); - - switch(cb->callout_number) - { - case CALLBACK_REP: - if (cb->capture_last > 0) - { - int subp_i = cd->subp_i; - ov *subp = &cd->subp[subp_i]; - - lgdebug(2, "Current capture %d: s=%d, e=%d\n", - cb->capture_last, cb_ov->s, cb_ov->e); - assert(cb_ov->s>=0 && cb_ov->e>=0, "Bad start/end in capture group %d: s=%d e=%d", - cb->capture_last, cb_ov->s, cb_ov->e); - - if (verbosity >= 6) - { - printf("INITIAL subp:\n"); - if (cd->subp_ovfl) printf("OVERFLOW\n"); /* shouldn't happen */ - printov(cb->subject, cd->subp, cd->subp_i+1, cd, /*is_pcreov*/false); - } - - /* Record all the captures into the subp (sub-pattern) vector. - * If we capture a continuation to another capture then it is a new - * capture. Else we update a previous position in subp. There should be - * no gaps between the capture strings. - * FIXME: Handled null matches properly. Need to use cd->capture_level - * to remember at which level a null match has been captured. - * FIXME: Move after the word lookup (efficiency). - * FIXME: Increment subp instead of cd->subp_i (cosmetic fix). 
*/ - - if (cb_ov->s > subp->s) - { - if (cb_ov->s == subp->e) - { - cd->subp_i++; - if (cd->subp_i == MAX_SUBP) - { - cd->subp_ovfl = true; - return PCRE_ERROR_CALLOUT; - } - lgdebug(2, "OV start gt, update next sub-pattern %d\n", cd->subp_i); - cd->subp[cd->subp_i] = *cb_ov; - subp_updated = true; - } - else - { - printf("Capture group %d (s=%d e=%d) makes a hole (subp_i %d: s=%d e=%d)\n", - cb->capture_last, subp->s, subp->e, subp_i, cb_ov->s, cb_ov->e); - return PCRE_ERROR_CALLOUT; - } - } - else - { - /* A backtrack occurred. */ - for (subp_i = cd->subp_i; subp_i >= 0; subp_i--) - { - subp = &cd->subp[subp_i]; - - lgdebug(2, "Checking recorded sub-pattern %d: s=%d e=%d: ", - subp_i, subp->s, subp->e); - - if (cb_ov->s == subp->s) - { - lgdebug(2, "OV start eq, update sub-pattern %d\n", subp_i); - *subp = *cb_ov; - cd->subp_i = subp_i; - subp_updated = true; - break; - } - lgdebug(2, "Backtrack handling\n"); - } - } - assert(subp_i >= 0, "Recorded sub-pattern index"); - assert(subp_updated); - cd->capture_level[cd->subp_i] = cb->capture_last; - - if (verbosity >= 6) - { - printf("AFTER: subp:\n"); - if (cd->subp_ovfl) printf("OVERFLOW\n"); /* shouldn't happen */ - printov(cb->subject, cd->subp, cd->subp_i+1, cd, /*is_pcreov*/false); - } - - /* Make a dictionary lookup for NAME in capture groups (?x) - * (x is a constraint for the initial pattern-match comparison done by - * PCRE). */ - // if (cgnum && * cd->is_constant) printf("is_constant\n"); - - /* If we have a cgnum structure with a dict, check if the string to be - * matched is in the dict or belongs to the given affix class. - * A NULL cgnum->dict means this is a regex from the regex file. */ - - if (cgnum && cgnum->dict) - { /* && !cd->is_constant */ - int numchar = cb_ov->e - cb_ov->s; - - /* Debug: Sanity check. 
*/ - assert(numchar>=0, "numchar=%d", numchar); - endname = NULL; - for (openp = &cd->pattern[cb->pattern_position-5]; *openp; openp--) - { - if (*openp == '>') endname = openp; - if (*openp == '(' && openp[1] == '?' && openp[2] == '<' && openp[3] != '=') break; - } - if (NULL != openp && *openp == '(' && NULL != endname && strncmp(openp, "(?<", 3) == 0 && endname > openp) - ; /* Everything is OK. */ - else - { - assert(0, "Error: Not in a named group!"); - } - lgdebug(6, "GROUP NAME %.*s, cgnum %d, ptr %p, numchar %d\n", - (int)(endname - openp - 3), openp+3, cb->capture_last-1, cgnum, numchar); - /* End of debug sanity check. */ - - lgdebug(2, "Try match '%.*s': ", numchar, cb->subject+cb_ov->s); - -#if 0 - if (0 == numchar) - { - lgdebug(2, "Zero match denied\n"); - return 1; - } -#endif - - if (!is_word(cb->subject+cb_ov->s, numchar, cgnum)) - { - lgdebug(2, "NO MATCH\n"); - return 1; - } - lgdebug(6, "MATCH\n"); - } - } -#if 0 - if (verbosity >= 6) - { - printf("DEBUG subp:\n"); - if (cd->subp_ovfl) printf("OVERFLOW\n"); /* shouldn't happen */ - printov(cb->subject, cd->subp, cd->subp_i+1, cd); - } -#endif - - // cd->is_constant = false; - return 0; /* continue to match the rest of the regex */ - break; - -#if 0 - case CALLBACK_CONSTANT_START: - // cd->is_constant = true; - return 0; - break; - - case CALLBACK_CONSTANT_END: - // cd->is_constant = false; - return 0; - break; -#endif - - case CALLBACK_END: - cd->alt_counter++; - printf("Alternative %d:\n", cd->alt_counter); - /* See the comment for SUBP0END_DEBUG_SIGNATURE. */ - assert(cd->subp[0].e>=0, "subp[0].e is %d!", cd->subp[0].e); - printov(cb->subject, cd->subp, cd->subp_i+1, cd, /*is_pcreov*/false); - - /* Remove the last sub-pattern, in case it is a null string (no need to - * check, it can be removed anyway since if it is not a null string it is - * going to be replaced on the next match). 
Else the next match, which - * will be without this null string, we emit it again as the last - * sub-pattern component. FIXME: It doesn't always help. */ - - if (cd->subp_i > 0) - { - cd->capture_level[cd->subp_i] = -3; /* mark as invalid, for debug */ - cd->subp_i--; - } - - // cd->is_constant = false; - return 1; /* signify a backtrack in order to find the next alternative */ - break; - - default: - assert("Callout: Unreached" && 0); - } - - return 0; /* Really unreached. */ - -/* - printf("Callout %d, data test %d\n" - "version %d\n" - "subject '%s\n" - "subject_length %d\n" - "start_match %d\n" - "current_position %d\n" - "capture_top %d\n" - "capture_last %d\n" - "pattern_position %d\n" - "next_item_length %d\n", - cb->callout_number, ((callout_data *)cb->callout_data)->test, - cb->version, cb->subject, cb->subject_length, cb->start_match, - - cb->current_position, - cb->capture_top, - cb->capture_last, - - cb->pattern_position, - cb->next_item_length); - return 0; -*/ - -} - -/* Was main() of the test program... */ -static int regex_split(const char *inpat, int flags, const char *str, Dictionary dict) -{ - const char *p; - dyn_str *pat; - int plevel; /* paren level */ - int cglevel; /* capture group level */ - int nplevel; /* paren level within named capture group */ - int cgnum; /* capture group number*/ - int options; - const char *errptr; - int erroffset; - pcre *pcre; - const char * const prog = "regex_tokenizer_test"; - int rc; - pcre_extra *extra = NULL; -#define OVCNT 15 - int ovector[OVCNT]; - callout_data callout_data; - -#if 0 - const char **wordlist; -#endif - bool word_compare_flag = true; -#ifdef notdef - dyn_str *wordalts; -#endif - const char *group_name = NULL; - char *word_classname; - char c0[2] = "\0\0"; - - /* FIXME: validate we use PCRE version 2 at least. */ - - /* Find the number of capturing groups in the input pattern. */ - cgnum = 0; - for (p = inpat; '\0' != *p; p++) - { - /* Count as capture groups only (string) or (?). 
Especially, avoid - * counting (?<=...) (positive look behind) and (?(condition)...) (the - * (condition) part). - * FIXME: support () inside []. - * FIXME: support \. */ - if ((*p == '(') && (*p != '*') && - ((p[1] != '?') || ((p[2] == '<') && (p[3] != '='))) && - ((p-inpat < 2) || (p[-2] != '(') || (p[-1] != '?'))) - { - cgnum++; - } - } - if (0 == cgnum) - { - printf("%s: pattern must include at least one () group (was: %s)\n", prog, inpat); - return 9; - } -#if 0 - if (p[-1] != '$') - { - /* FIXME: add $ if needed */ - printf("%s: pattern must end with $ (was: %s)\n", prog, inpat); - return 9; - } -#endif - - /* Regex syntax check of the pattern. - * FIXME: Add support for "(?J)" */ - options = PCRE_UTF8; - pcre = pcre_compile(inpat, options, &errptr, &erroffset, NULL); - if (NULL == pcre) - { - printf("%s: pcre_compile: Error in pattern '%s' at offset %d: %s\n", - prog, inpat, erroffset, errptr); - return 2; - } - - callout_data.wordlist = NULL; - callout_data.cgnum = NULL; - if (word_compare_flag) - { - int i; -#if 0 - callout_data.wordlist = malloc(sizeof(*callout_data.wordlist)*cgnum); -#endif - callout_data.cgnum = malloc(sizeof(*callout_data.cgnum)*cgnum); - //printf("ALLOCATED callout_data.cgnum %ld for %d groups\n", sizeof(*callout_data.wordlist)*cgnum, cgnum); - for (i = 0; i < cgnum; i++) - { -#if 0 - callout_data.wordlist[i] = NULL; -#endif - callout_data.cgnum[i] = NULL; - - } - } - - /* Build the pattern that finds all possible matches. */ - pat = dyn_str_new(); - plevel = 0; - cglevel = 0; - cgnum = -1; /* First capture group (plevel==1) is cgnum==0. */ - - /* Convert the input regex to the tokenizer regex. - * cglevel counts named capture groups - * plevel counts all groups - * - * FIXME: Add support for: - * (?x) - comment mode. - * (?i) - ignore case. - * \ - backslash for ()<>?* . - * [] - () inside it - * FIXME: Add "(?: ... )" over the result pattern. 
- */ - //dyn_strcat(pat, "(?J)"); - for (p = inpat; '\0' != *p; p++) - { - char *re = NULL; /* a regex from the 4.0.regex file */ - - switch (*p) - { - const char *c; - - case '(': - if (cglevel > 0) - { - printf("Error at position %ld: Tokenizer capture groups cannot have nested groups\n", p-inpat); - } - plevel++; - if ((p[1] == '*') || - ((p[1] == '?') && ((p[2] != '<') || (p[3] == '='))) || - ((p-inpat > 1) && (p[-2] == '(') && (p[-1] == '?'))) - { - break; - } - cglevel++; - if (cglevel > 1) - { - printf("Error at position %ld: Tokenizer aregex cannot have capture group level > 1\n", p-inpat); - free(callout_data.cgnum); - return 199; - } - cgnum++; - dyn_strcat(pat, "(?:"); - group_name = NULL; - break; - case ')': - plevel--; - if (cglevel > 0) - { - cglevel--; - /* Add the dict lookup and capturing callback. */ - dyn_strcat(pat, ")(?C)"); - } - group_name = NULL; - break; - case '<': - /* Remember it as a potential start of a named group. */ - if ((p-2 >= inpat) && (p[-2] == '(') && (p[-1] == '?') && (p[1] != '=')) - { - group_name = p + 1; - } - else - group_name = NULL; - break; - case '>': - if (NULL != group_name) - { - /* Check if this is actually a group name */ - for (c = group_name; c < p; c++) - { - /* FIXME: 'a' and 'p' are part of a hack for lookup_mark. - * FIXME: 'r' is part of a hack for regex names that match affix - * class names. The fix is not to use matching names. 
*/ - if ((*c > 'Z' || *c < 'A') && *c != 'a' && *c != 'p' && *c != 'r') break; - } - if (c == p) - { - word_classname = malloc(p-group_name+1); - strncpy(word_classname, group_name, p-group_name); - word_classname[p-group_name] = '\0'; - } else - { - printf("%s: Invalid class name in group name found at '%s'\n", - prog, group_name-4); - word_classname = NULL; - } - } else - { - word_classname = NULL; - } - if (!word_classname) - { - group_name = NULL; - break; - } - dyn_strcat(pat, ">"); - - lgdebug(6, "Found word-class %s\n", word_classname); -#if 0 - wordlist = readwords(word_classname); - if (NULL == wordlist) - { - printf("i%s: Invalid class name %s in group name\n", prog, word_classname); - return 100; - } - - if (!word_compare_flag) - { - printf("Invocation without -w is not supported\n"); - return 103; - } -#endif - - if (word_compare_flag) - { - char *t; - const char *lookup_mark = NULL; -#if 0 - callout_data.wordlist[cgnum] = wordlist; - printf("WORDLIST %p at cgnum %d\n", wordlist, cgnum); -#endif - /* Allocate per group info */ - callout_data.cgnum[cgnum] = malloc(sizeof(*(callout_data.cgnum)[0])); - callout_data.cgnum[cgnum]->name = NULL; - //printf("ALLOCATED cgnum[%d]=%p\n", cgnum, callout_data.cgnum[cgnum]); - - /* A hack for testing: Handle WORDpX or WORDaX. - * The above a/p marks mean append/prepend X to word before making - * the lookup. - * FIXME: Find another way to specify that, maybe in the affix file - * or in a tokenizer definition file. */ - t = strpbrk(word_classname, "pa"); - if (NULL != t) - { - Afdict_class *ac; - - callout_data.cgnum[cgnum]->lookup_mark_pos = *t; - *t = '\0'; - ac = afdict_find(dict->affix_table, t+1, /*notify_err*/false); - if (NULL == ac) - { - printf("%s: Unknown afclass '%s'\n", prog, t+1); - return 253; - } - - /* Check if the requested affix class is defined and is not an - * empty string (like the default INFIXMARK). 
*/ - if (0 == ac->length || '\0' == ac->string[0][0]) - { - printf("%s: No value for afclass '%s'\n", prog, t+1); - return 252; - } - lookup_mark = ac->string[0]; /* FIXME: support more than one value. */ - } - - callout_data.cgnum[cgnum]->lookup_mark = lookup_mark; - callout_data.cgnum[cgnum]->name = word_classname; - - if (0 == strcmp(word_classname, "DICTWORD")) - { - /* Assign data for looking up a word in the main dict. */ - callout_data.cgnum[cgnum]->dict = dict; - callout_data.cgnum[cgnum]->afclass = NULL; - } - else - if (afdict_find(dict->affix_table, word_classname, /*notify_err*/false)) - { - callout_data.cgnum[cgnum]->dict = dict->affix_table; - callout_data.cgnum[cgnum]->afclass = word_classname; - } - else - { - if ('r' == word_classname[0]) word_classname++; - re = get_regex_by_name(dict, word_classname); - if (re) - { - lgdebug(6, "Regex %s with modified groups: '%s'\n", word_classname, re); - callout_data.cgnum[cgnum]->dict = NULL; - /* FIXME: No need to allocate callout_data.cgnum[cgnum] in this - * case. */ - } - else - { - printf("%s: Unknown word classname '%s'\n", prog, word_classname); - return 254; - } - } - /* TODO: Assign flags, e.g. for emitting the words with stem/infix marks. */ - - } else - { -#if 0 - wordalts = make_wordalts(wordlist); - dyn_strcat(pat, wordalts->str); - dyn_str_delete(wordalts); - free(wordlist); -#else - printf("%s: Invocation without -w is not supported\n", prog); - return 103; -#endif - } - /* Default match for dictionary lookup is ".*". - * Allow replacing it by something else. - * E.g: .{2,}|a */ - if (')' == p[1]) - { - if (NULL == re) - { - dyn_strcat(pat, ".*"); - } - else - { - dyn_strcat(pat, re); - free(re); - re = NULL; - } - } - else - { - nplevel = 1; - /* FIXME: Add support for: - * (?x) - comment mode. - * \ - backslash for ()<>?* . - * [] - () inside it - */ - for (; p[1] != '\0' && nplevel > 0; p++) - { - switch (p[1]) - { - case '(': - if (('?' 
!= p[2]) && ('*' != p[2]) && - ((p[-1] != '(') || (p[0] != '?'))) - { - printf("%s: Capture_group %d: Nested capture group is not supported\n", - prog, cgnum+1); - return 250; - } - nplevel++; - break; - case ')': - nplevel--; - if (0 == nplevel) continue; /* we are done */ - break; - } - - c0[0] = p[1]; - dyn_strcat(pat, c0); - } - p--; - } - - word_classname = NULL; - group_name = NULL; - continue; - } - - c0[0] = *p; - dyn_strcat(pat, c0); - } - - /* Add '$' at the end if needed. */ - if ('$' != pat->str[pat->end-1]) dyn_strcat(pat, "$"); - /* Add the backtracking callback. */ - dyn_strcat(pat, "(?C1)"); - - printf("Modified pattern: %s", pat->str); - lgdebug(2, " (len %zu/%zu)", pat->end, pat->len); - printf("\n"); - - pcre_callout = callout; - - callout_data.function = 1; - callout_data.subp_i = 0; - callout_data.subp[0].s = 0; - callout_data.subp[0].e = SUBP0END_DEBUG_SIGNATURE; - callout_data.subp_ovfl = false; - callout_data.capture_last = 0; - callout_data.pattern = pat->str; - callout_data.alt_counter = 0; - - options = PCRE_UTF8; - pcre = pcre_compile(pat->str, options, &errptr, &erroffset, NULL); - if (NULL == pcre) - { - printf("%s: Internal error: pcre_compile: Error in pattern '%s' at offset %d: %s\n", - prog, pat->str, erroffset, errptr); - return 99; - } - - /* TODO: Check if using JIT may optimize out some needed callouts. */ - options = 0; //PCRE_STUDY_JIT_COMPILE; - extra = pcre_study(pcre, options, &errptr); - if (NULL == extra) - { - if (NULL != errptr) - { - printf("%s: pcre_study: Error for pattern '%s': %s\n", prog, pat->str, errptr); - return 3; - } - extra = malloc(sizeof(*extra)); - memset(extra, 0, sizeof(*extra)); - } else - { - /* For some reason JIT is sometimes done even though it was not requested. - * But the callouts are still invoked as expected in such cases. 
*/ - lgdebug(6, "%s: pcre_study: JIT %ld\n", prog, extra->flags & PCRE_STUDY_JIT_COMPILE); - } - -#if 0 - extra->match_limit = 10000; - extra->match_limit_recursion = 10000; - extra->flags |= PCRE_EXTRA_MATCH_LIMIT|PCRE_EXTRA_MATCH_LIMIT_RECURSION; -#endif - - extra->callout_data = (void *)&callout_data; - extra->flags |= PCRE_EXTRA_CALLOUT_DATA; - -#if 0 - printf("CGNUM %d\n", cgnum); - if (NULL != callout_data.cgnum) - { - int i; - - for (i = 0; i <= cgnum; i++) - { - printf("callout_data.cgnum[%d] %p\n", i, callout_data.cgnum[i]); - } - } else - printf("CGNUM %p\n", callout_data.cgnum); -#endif - - options = PCRE_ANCHORED; /* XXX Maybe PCRE_NO_START_OPTIMIZE is needed too */ - rc = pcre_exec(pcre, extra, str, strlen(str), 0, options, ovector, OVCNT); - if (rc < 0) - { - if (PCRE_ERROR_NOMATCH == rc) - { - lgdebug(2, "No match (must always happen)\n"); - } else - { - printf("%s: pcre_exec: Error %d\n", prog, rc); - } - } else - { - printf("Internal error: Unexpected match, rc=%d\n", rc); - } - - if (0 == rc) - { - rc = OVCNT/3; - printf("ovector only has room for %d captured substrings\n", rc - 1); - } - - printov(str, (ov *)ovector, rc, NULL, /*is_pcreov*/true); - - if (verbosity > 6) - { - if (0 != callout_data.subp_i) - { - printf("Callout stack:\n"); - printov(str, callout_data.subp, callout_data.subp_i, &callout_data, /*is_pcreov*/false); - } - } - - /* Free everything. */ - dyn_str_delete(pat); /* note - callback_data uses parts of pat */ - pcre_free_study(extra); /* safe even if malloc'ed */ - free(pcre); - - if (NULL != callout_data.cgnum) - { - int i; - - for (i = 0; i <= cgnum; i++) - { - if (callout_data.cgnum[i]) - { - /* FIXME: Free also word_classname. 
*/ - free(callout_data.cgnum[i]); - } - } - free(callout_data.cgnum); - } - -#if 0 - if (NULL != callout_data.wordlist) - { - int i; - - for (i = 0; i < cgnum; i++) - { - free(callout_data.wordlist[i]); - } - free(callout_data.wordlist); - } -#endif - - return 0; -} - -/** - * Test the RegEx tokenizer. - * line - REGEX/,token - */ -int regex_tokenizer_test(Dictionary dict, const char *line) -{ - int linelen = strlen(line); - char *regex = alloca(linelen+1); - char *token = alloca(linelen); - char *regex_end; - int tokenizer_flags; - - strcpy(regex, line); - regex_end = index(regex, '/'); - if (NULL == regex_end) - { - printf("Missing terminating '/' in regex.\nUsage: /REGEX/,token\n"); - return 101; - } - *regex_end = '\0'; - regex_end++; - /* FIXME: Add iterations for more flags if needed. */ - switch (*regex_end) - { - case 'M': - tokenizer_flags = MARK_TOKENS; - regex_end++; - break; - } - if (',' != *regex_end) - { - printf("Missing terminating ',' after regex end.\nUsage: /REGEX/,token\n"); - return 102; - } - strcpy(token, regex_end + 1); - if ('\0' == token[0]) - { - printf("Missing token\nUsage: /REGEX/,token\n"); - return 103; - } - - return regex_split(regex, tokenizer_flags, token, dict); -} - -#else /* USE_REGEX_TOKENIZER */ - -/* Mac OSX will fail to link if this dummy is not defined. - * But why is it needed? Because it shows up in the exported - * symbols list (link-grammar.def) and if its there, it must - * also be in the code. Thus, the below. 
- */ -#include "regex-tokenizer.h" -int regex_tokenizer_test(Dictionary dict, const char *line) -{ - return 0; -} -#endif /* USE_REGEX_TOKENIZER */ - diff -Nru link-grammar-5.3.16/link-grammar/regex-tokenizer.h link-grammar-5.5.0/link-grammar/regex-tokenizer.h --- link-grammar-5.3.16/link-grammar/regex-tokenizer.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/regex-tokenizer.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,4 +0,0 @@ -#include "link-includes.h" - -int regex_tokenizer_test(Dictionary, const char *); - diff -Nru link-grammar-5.3.16/link-grammar/resources.c link-grammar-5.5.0/link-grammar/resources.c --- link-grammar-5.3.16/link-grammar/resources.c 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/resources.c 2017-09-25 04:28:41.000000000 +0000 @@ -131,11 +131,11 @@ #define RES_COL_WIDTH sizeof(" ") /** print out the cpu ticks since this was last called */ -static void resources_print_time(int verbosity, Resources r, const char * s) +static void resources_print_time(int verbosity_opt, Resources r, const char * s) { double now; now = current_usage_time(); - if (verbosity >= D_USER_TIMES) + if (verbosity_opt >= D_USER_TIMES) { prt_error("++++ %-36s %7.2f seconds\n", s, now - r->when_last_called); } @@ -143,12 +143,12 @@ } /** print out the cpu ticks since this was last called */ -static void resources_print_total_time(int verbosity, Resources r) +static void resources_print_total_time(int verbosity_opt, Resources r) { double now; now = current_usage_time(); r->cumulative_time += (now - r->time_when_parse_started) ; - if (verbosity >= D_USER_BASIC) + if (verbosity_opt >= D_USER_BASIC) { prt_error("++++ %-36s %7.2f seconds (%.2f total)\n", "Time", now - r->time_when_parse_started, r->cumulative_time); @@ -156,9 +156,9 @@ r->time_when_parse_started = now; } -static void resources_print_total_space(int verbosity, Resources r) +static void resources_print_total_space(int verbosity_opt, Resources r) { - if (verbosity 
>= D_USER_TIMES) + if (verbosity_opt >= D_USER_TIMES) { prt_error("++++ %-36s %zu bytes (%zu max)\n", "Total space", get_space_in_use(), get_max_space_used()); diff -Nru link-grammar-5.3.16/link-grammar/resources.h link-grammar-5.5.0/link-grammar/resources.h --- link-grammar-5.3.16/link-grammar/resources.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/resources.h 2017-06-17 01:29:32.000000000 +0000 @@ -4,7 +4,7 @@ /* All rights reserved */ /* */ /* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ +/* license set forth in the LICENSE file included with this software. */ /* This license allows free redistribution and use in source and binary */ /* forms, with or without modification, subject to certain conditions. */ /* */ @@ -23,6 +23,6 @@ bool resources_timer_expired(Resources r); bool resources_memory_exhausted(Resources r); bool resources_exhausted(Resources r); -Resources resources_create(void); +Resources resources_create(void); void resources_delete(Resources ti); #endif /* _RESOURCES_H */ diff -Nru link-grammar-5.3.16/link-grammar/sat-solver/guiding.hpp link-grammar-5.5.0/link-grammar/sat-solver/guiding.hpp --- link-grammar-5.3.16/link-grammar/sat-solver/guiding.hpp 2017-01-27 22:02:55.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/sat-solver/guiding.hpp 2018-01-23 17:05:08.000000000 +0000 @@ -2,10 +2,15 @@ #define __GUIDING_HPP__ #include -#include +#include #undef assert #include "util.hpp" +extern "C" +{ +#include "api-structures.h" // For definition of Sentence +} + #ifndef HAVE_SETPOLARITY_BOOL // setPolarity has lbool argument, not bool #define setPolarity(v, b) setPolarity(v, toLbool(b)) #endif diff -Nru link-grammar-5.3.16/link-grammar/sat-solver/Makefile.in link-grammar-5.5.0/link-grammar/sat-solver/Makefile.in --- link-grammar-5.3.16/link-grammar/sat-solver/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ 
link-grammar-5.5.0/link-grammar/sat-solver/Makefile.in 2018-04-29 21:40:03.000000000 +0000 @@ -195,6 +195,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -223,6 +224,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -231,13 +233,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -304,12 +309,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/link-grammar/sat-solver/sat-encoder.cpp link-grammar-5.5.0/link-grammar/sat-solver/sat-encoder.cpp --- link-grammar-5.3.16/link-grammar/sat-solver/sat-encoder.cpp 2017-01-27 22:02:55.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/sat-solver/sat-encoder.cpp 2018-04-22 14:57:33.000000000 +0000 @@ -19,7 +19,7 @@ extern "C" { #include "sat-encoder.h" } -#include "core/Solver.h" +#include "minisat/core/Solver.h" #undef assert #include "sat-encoder.hpp" @@ -30,12 +30,18 @@ #include "fast-sprintf.hpp" extern "C" { -#include "analyze-linkage.h" // for compute_link_names() -#include "build-disjuncts.h" // for build_disjuncts_for_exp() -#include "dict-api.h" 
// for print_expression() -#include "linkage.h" -#include "post-process.h" -#include "score.h" // for linkage_score() +#include "dict-common/dict-api.h" // for print_expression() +#include "dict-common/dict-utils.h" // for free_Exp() +#include "disjunct-utils.h" +#include "linkage/analyze-linkage.h" // for compute_link_names() +#include "linkage/linkage.h" +#include "linkage/sane.h" // for sane_linkage_morphism() +#include "linkage/score.h" // for linkage_score() +#include "prepare/build-disjuncts.h" // for build_disjuncts_for_exp() +#include "post-process/post-process.h" +#include "post-process/pp-structures.h" +#include "tokenize/word-structures.h" // for Word_struct +#include "tokenize/tok-structures.h" // got Gword internals } // Macro DEBUG_print is used to dump to stdout information while debugging @@ -461,7 +467,7 @@ Exp* exp; E_list* or_list = NULL;; for (X_node* x = _sent->word[w].x; x != NULL; x = x->next) { - E_list* new_node = (E_list*) xalloc(sizeof(E_list)); + E_list* new_node = (E_list*) malloc(sizeof(E_list)); new_node->e = x->exp; new_node->next = NULL; if (or_list == NULL) { @@ -473,7 +479,7 @@ y->next = new_node; } } - exp = (Exp*) xalloc(sizeof(Exp)); + exp = (Exp*) malloc(sizeof(Exp)); exp->type = OR_type; exp->u.l = or_list; exp->cost = 0.0; @@ -486,17 +492,17 @@ E_list *l = exp->u.l; while (l != NULL) { E_list* next = l->next; - xfree(l, sizeof(E_list)); + free(l); l = next; } - xfree(exp, sizeof(exp)); + free(exp); } void SATEncoder::generate_link_cw_ordinary_definition(size_t wi, int pi, Exp* e, size_t wj) { - const char* Ci = e->u.string; + const char* Ci = e->u.condesc->string; char dir = e->dir; double cost = e->cost; Lit lhs = Lit(_variables->link_cw(wj, wi, pi, Ci)); @@ -513,12 +519,12 @@ if (dir == '+') { rhs.push(Lit(_variables->link_cost(wi, pi, Ci, e, (*i)->word, (*i)->position, - (*i)->connector.string, + connector_string(&(*i)->connector), (*i)->exp, cost + (*i)->cost))); } else if (dir == '-'){ 
rhs.push(Lit(_variables->link((*i)->word, (*i)->position, - (*i)->connector.string, + connector_string(&(*i)->connector), (*i)->exp, wi, pi, Ci, e))); } @@ -715,8 +721,8 @@ for (mw1i = mw1.begin(); mw1i != mw1.end(); mw1i++) { for (mw2i = mw2.begin(); mw2i != mw2.end(); mw2i++) { if (*mw1i >= *mw2i) { - clause[0] = ~Lit(_variables->link_cw(*mw1i, w, (*i)->position, (*i)->connector.string)); - clause[1] = ~Lit(_variables->link_cw(*mw2i, w, (*j)->position, (*j)->connector.string)); + clause[0] = ~Lit(_variables->link_cw(*mw1i, w, (*i)->position, connector_string(&(*i)->connector))); + clause[1] = ~Lit(_variables->link_cw(*mw2i, w, (*j)->position, connector_string(&(*j)->connector))); add_clause(clause); } } @@ -760,8 +766,8 @@ for (mw1i = mw1.begin(); mw1i != mw1.end(); mw1i++) { for (mw2i = mw2.begin(); mw2i != mw2.end(); mw2i++) { if (*mw1i <= *mw2i) { - clause[0] = ~Lit(_variables->link_cw(*mw1i, w, (*i)->position, (*i)->connector.string)); - clause[1] = ~Lit(_variables->link_cw(*mw2i, w, (*j)->position, (*j)->connector.string)); + clause[0] = ~Lit(_variables->link_cw(*mw1i, w, (*i)->position, connector_string(&(*i)->connector))); + clause[1] = ~Lit(_variables->link_cw(*mw2i, w, (*j)->position, connector_string(&(*j)->connector))); add_clause(clause); } } @@ -1366,10 +1372,10 @@ vec triggers; - for (size_t i = 0; i < link_variables.size(); i++) { - const Variables::LinkVar* var = _variables->link_variable(link_variables[i]); + for (size_t vi = 0; vi < link_variables.size(); vi++) { + const Variables::LinkVar* var = _variables->link_variable(link_variables[vi]); if (post_process_match(rule.selector, var->label)) { - triggers.push(Lit(link_variables[i])); + triggers.push(Lit(link_variables[vi])); } } @@ -1392,8 +1398,8 @@ DEBUG_print("---pp_pruning--"); for (int k = 0; k < triggers.size(); k++) { vec clause(criterions.size() + 1); - for (int i = 0; i < criterions.size(); i++) - clause[i] = criterions[i]; + for (int ci = 0; ci < criterions.size(); ci++) + clause[ci] 
= criterions[ci]; clause[criterions.size()] = (~triggers[k]); add_clause(clause); } @@ -1415,14 +1421,12 @@ */ Linkage SATEncoder::create_linkage() { - /* Using exalloc since this is external to the parser itself. */ - Linkage linkage = (Linkage) exalloc(sizeof(struct Linkage_s)); + Linkage linkage = (Linkage) malloc(sizeof(struct Linkage_s)); memset(linkage, 0, sizeof(struct Linkage_s)); partial_init_linkage(_sent, linkage, _sent->length); sat_extract_links(linkage); compute_link_names(linkage, _sent->string_set); - remove_empty_words(linkage); /* Discard optional words. */ return linkage; } @@ -1480,6 +1484,7 @@ free(linkage); continue; // skip this linkage } + remove_empty_words(linkage); /* Discard optional words. */ } if (!connected) { @@ -1509,7 +1514,7 @@ { _sent->num_linkages_alloced = _opts->linkage_limit; size_t nbytes = _sent->num_linkages_alloced * sizeof(struct Linkage_s); - _sent->lnkages = (Linkage)exalloc(nbytes); + _sent->lnkages = (Linkage) malloc(nbytes); _next_linkage_index = 0; } assert(_next_linkage_index<_sent->num_linkages_alloced, "_sent->lnkages ovl"); @@ -1517,7 +1522,7 @@ Linkage lkg = &_sent->lnkages[_next_linkage_index]; _next_linkage_index++; *lkg = *linkage; /* copy en-mass */ - exfree(linkage, sizeof(struct Linkage_s)); + free(linkage); /* The link-parser code checks the next linkage for num_violations * (to save calls to linkage_create()). Allow for that practice. 
*/ @@ -1525,10 +1530,10 @@ lkg[1].lifo.N_violations = 0; // Perform the rest of the post-processing - PP_node *ppn = do_post_process(_sent->postprocessor, lkg, false); - if (NULL != ppn->violation) { + do_post_process(_sent->postprocessor, lkg, false); + if (NULL != _sent->postprocessor->violation) { lkg->lifo.N_violations++; - lkg->lifo.pp_violation_msg = ppn->violation; + lkg->lifo.pp_violation_msg = _sent->postprocessor->violation; lgdebug(+D_SAT, "Postprocessing error: %s\n", lkg->lifo.pp_violation_msg); } else { // XXX We cannot maintain num_valid_linkages, because it starts from @@ -1536,14 +1541,11 @@ // would report "Unique linkage". //_sent->num_valid_linkages++; } - - build_type_array(_sent->postprocessor); - linkage_set_domain_names(_sent->postprocessor, lkg); post_process_free_data(&_sent->postprocessor->pp_data); linkage_score(lkg, _opts); - //if (NULL == ppn->violation && verbosity > 1) - // _solver->printStats(); + // if (NULL == _sent->postprocessor->violation && verbosity > 1) + // _solver->printStats(); return lkg; } @@ -1569,7 +1571,7 @@ void SATEncoderConjunctionFreeSentences::generate_satisfaction_for_connector( int wi, int pi, Exp *e, char* var) { - const char* Ci = e->u.string; + const char* Ci = e->u.condesc->string; char dir = e->dir; bool multi = e->multi; double cost = e->cost; @@ -1631,7 +1633,7 @@ for (c = w1_connectors.begin(); c != w1_connectors.end(); c++) { assert(c->word == w1, "Connector word must match"); if (_word_tags[w2].match_possible(c->word, c->position)) { - rhs.push(Lit(_variables->link_cw(w2, c->word, c->position, c->connector.string))); + rhs.push(Lit(_variables->link_cw(w2, c->word, c->position, connector_string(&c->connector)))); } } @@ -1746,11 +1748,11 @@ Exp* SATEncoderConjunctionFreeSentences::PositionConnector2exp(const PositionConnector* pc) { - Exp* e = (Exp*)xalloc(sizeof(Exp)); + Exp* e = (Exp*) malloc(sizeof(Exp)); e->type = CONNECTOR_type; e->dir = pc->dir; e->multi = pc->connector.multi; - e->u.string = 
pc->connector.string; + e->u.condesc = (condesc_t *)pc->connector.desc; // FIXME - const e->cost = pc->cost; return e; @@ -1761,10 +1763,10 @@ Disjunct *d; int current_link = 0; - Exp **exp_word = (Exp **)alloca(_sent->length * sizeof(Exp)); - memset(exp_word, 0, _sent->length * sizeof(Exp)); - const X_node **xnode_word = (const X_node **)alloca(_sent->length * sizeof(X_node)); - memset(xnode_word, 0, _sent->length * sizeof(X_node)); + Exp **exp_word = (Exp **)alloca(_sent->length * sizeof(Exp *)); + memset(exp_word, 0, _sent->length * sizeof(Exp *)); + const X_node **xnode_word = (const X_node **)alloca(_sent->length * sizeof(X_node *)); + memset(xnode_word, 0, _sent->length * sizeof(X_node *)); const std::vector& link_variables = _variables->link_variables(); std::vector::const_iterator i; @@ -1790,8 +1792,8 @@ // Allocate memory for the connectors, because they should persist // beyond the lifetime of the sat-solver data structures. - clink.lc = connector_new(); - clink.rc = connector_new(); + clink.lc = connector_new(NULL, NULL); + clink.rc = connector_new(NULL, NULL); *clink.lc = lpc->connector; *clink.rc = rpc->connector; @@ -1844,7 +1846,10 @@ lgdebug(+0, "Warning: No expression for word %zu\n", wi); } - d = build_disjuncts_for_exp(de, xnode_word[wi]->string, UNLIMITED_LEN); +#ifndef MAX_CONNECTOR_COST +#define MAX_CONNECTOR_COST 1000.0f +#endif + d = build_disjuncts_for_exp(de, xnode_word[wi]->string, MAX_CONNECTOR_COST, _opts); word_record_in_disjunct(xnode_word[wi]->word, d); lkg->chosen_disjuncts[wi] = d; free_Exp(de); diff -Nru link-grammar-5.3.16/link-grammar/sat-solver/util.cpp link-grammar-5.5.0/link-grammar/sat-solver/util.cpp --- link-grammar-5.3.16/link-grammar/sat-solver/util.cpp 2017-02-10 20:16:17.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/sat-solver/util.cpp 2018-01-22 20:39:46.000000000 +0000 @@ -1,8 +1,10 @@ #include "util.hpp" extern "C" { +#include "api-structures.h" +#include "disjunct-utils.h" +#include "linkage/linkage.h" 
#include "utilities.h" -#include "linkage.h" }; /** @@ -52,19 +54,20 @@ if (orig == NULL) { orig = addit; - } else { + } else + { // flist is orig - E_list* flist = (E_list*)xalloc(sizeof(E_list)); + E_list* flist = (E_list*) malloc(sizeof(E_list)); flist->e = orig; flist->next = NULL; // elist is addit, orig - E_list* elist = (E_list*)xalloc(sizeof(E_list)); + E_list* elist = (E_list*) malloc(sizeof(E_list)); elist->next = flist; elist->e = addit; // The updated orig is addit & orig - orig = (Exp*)xalloc(sizeof(Exp)); + orig = (Exp*) malloc(sizeof(Exp)); orig->type = AND_type; orig->cost = 0.0; orig->u.l = elist; diff -Nru link-grammar-5.3.16/link-grammar/sat-solver/util.hpp link-grammar-5.5.0/link-grammar/sat-solver/util.hpp --- link-grammar-5.3.16/link-grammar/sat-solver/util.hpp 2016-10-14 23:03:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/sat-solver/util.hpp 2017-05-31 07:34:56.000000000 +0000 @@ -1,9 +1,10 @@ #ifndef __UTIL_HPP__ #define __UTIL_HPP__ -extern "C" { +extern "C" +{ #include "link-includes.h" -#include "disjunct-utils.h" +#include "dict-common/dict-structures.h" // For definition of Exp } bool isEndingInterpunction(const char* str); diff -Nru link-grammar-5.3.16/link-grammar/sat-solver/variables.hpp link-grammar-5.5.0/link-grammar/sat-solver/variables.hpp --- link-grammar-5.3.16/link-grammar/sat-solver/variables.hpp 2017-01-27 22:02:55.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/sat-solver/variables.hpp 2018-01-24 01:57:32.000000000 +0000 @@ -1,6 +1,7 @@ #ifndef __VARIABLES_HPP__ #define __VARIABLES_HPP__ +#include #include #include #include @@ -16,6 +17,10 @@ #define MAX_VARIABLE_NAME 256 +extern "C" +{ +#include "api-structures.h" // for definition of Sentence +} // #define SAT_DEBUG // #define _VARS @@ -34,6 +39,11 @@ char* result = (char*)xalloc((std::max(strlen(connector1), strlen(connector2)) + 1)* sizeof(char)); char* presult = result; + + /* Skip head-dependent indicator in each connector. 
*/ + if (islower(*connector1)) connector1++; + if (islower(*connector2)) connector2++; + while (*connector1 != '\0' && *connector2 != '\0') { if (*connector1 == '*') *presult++ = *connector2; @@ -64,7 +74,9 @@ ,_linked_min_variable_map(sent->length, -1) ,_linked_max_variable_map(sent->length, -1) ,_thin_link_variable_map(sent->length, -1) - ,_link_top_cw_variable_map(sent->length) +#if 0 + ,_link_top_cw_variable_map(sent->length) +#endif ,_link_cw_variable_map(sent->length) ,_guiding(new CostDistanceGuiding(sent)) ,_var(0) @@ -80,8 +92,8 @@ } } - for (size_t i = 0; i < _linked_variables.size(); i++) - delete _linked_variables[i]; + for (size_t vi = 0; vi < _linked_variables.size(); vi++) + delete _linked_variables[vi]; delete _guiding; } @@ -409,6 +421,7 @@ * link_top_cw((wi, pi), wj) */ +#if 0 // Returns indices of all link_top_cw variables const std::vector& link_top_cw_variables() const { return _link_top_cw_variables_indices; @@ -427,7 +440,6 @@ const char* connector; }; -#if 0 // Returns additional info about the given link_top_cw variable const LinkTopCWVar* link_top_cw_variable(int var) const { return _link_top_cw_variables[var]; @@ -533,6 +545,7 @@ // What is the number of the thin_link(i, j) variable? 
MatrixUpperTriangle _thin_link_variable_map; +#if 0 /* * Information about the link_top_cw(w, wj, pj) variables */ @@ -545,6 +558,7 @@ // Additional info about the link_top_cw(wi, wj, pj) variable with the given number std::vector _link_top_cw_variables; +#endif #if 0 // Set this additional info diff -Nru link-grammar-5.3.16/link-grammar/sat-solver/word-tag.cpp link-grammar-5.5.0/link-grammar/sat-solver/word-tag.cpp --- link-grammar-5.3.16/link-grammar/sat-solver/word-tag.cpp 2017-01-27 22:02:55.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/sat-solver/word-tag.cpp 2018-04-22 14:57:33.000000000 +0000 @@ -3,10 +3,11 @@ extern "C" { #ifdef DEBUG -#include // for prt_exp_mem() -#include // for print_expression() +#include "prepare/build-disjuncts.h" // prt_exp_mem() +#include "dict-common/dict-api.h" // print_expression() #endif #include "error.h" +#include "tokenize/tok-structures.h" #include "utilities.h" } @@ -31,13 +32,10 @@ if (exp->type == CONNECTOR_type) { dfs_position++; - const char* name = exp->u.string; - Connector connector; - init_connector(&connector); connector.multi = exp->multi; - connector.string = name; - set_connector_length_limit(&connector); + connector.desc = exp->u.condesc; + set_connector_length_limit(&connector, _opts); switch (exp->dir) { case '+': @@ -90,10 +88,10 @@ eps_right, eps_left, new_var, false, cost, parent_exp, word_xnode); #ifdef POWER_PRUNE_CONNECTORS - if (leading_right && var != NULL) { + if (leading_right) { eps_right.push_back(_variables->epsilon(new_var, '+')); } - if (leading_left && var != NULL) { + if (leading_left) { eps_left.push_back(_variables->epsilon(new_var, '-')); } #endif @@ -164,13 +162,12 @@ } #undef D_IC -void WordTag::find_matches(int w, const char* C, char dir, std::vector& matches) +void WordTag::find_matches(int w, const condesc_t* C, char dir, std::vector& matches) { // cout << "Look connection on: ." << _word << ". ." << w << ". 
" << C << dir << endl; Connector search_cntr; - init_connector(&search_cntr); - search_cntr.string = C; - set_connector_length_limit(&search_cntr); + search_cntr.desc = C; + set_connector_length_limit(&search_cntr, _opts); std::vector* connectors; switch(dir) { @@ -197,7 +194,7 @@ std::vector::iterator i; for (i = _right_connectors.begin(); i != _right_connectors.end(); i++) { std::vector connector_matches; - tag.find_matches(_word, (*i).connector.string, '+', connector_matches); + tag.find_matches(_word, (*i).connector.desc, '+', connector_matches); std::vector::iterator j; for (j = connector_matches.begin(); j != connector_matches.end(); j++) { i->matches.push_back(*j); diff -Nru link-grammar-5.3.16/link-grammar/sat-solver/word-tag.hpp link-grammar-5.5.0/link-grammar/sat-solver/word-tag.hpp --- link-grammar-5.3.16/link-grammar/sat-solver/word-tag.hpp 2017-01-27 22:02:55.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/sat-solver/word-tag.hpp 2018-02-15 03:03:12.000000000 +0000 @@ -6,7 +6,8 @@ #include extern "C" { -#include "word-utils.h" +#include "connectors.h" +#include "dict-common/dict-common.h" }; #include "variables.hpp" @@ -22,12 +23,12 @@ eps_right(er), eps_left(el), word_xnode(w_xnode) { // Initialize some fields in the connector struct. 
- connector.string = c->string; + connector.desc = c->desc; connector.multi = c->multi; connector.length_limit = c->length_limit; if (word_xnode == NULL) { - cerr << "Internal error: Word" << w << ": " << "; connector: '" << c->string << "'; X_node: " << (word_xnode?word_xnode->string: "(null)") << endl; + cerr << "Internal error: Word" << w << ": " << "; connector: '" << connector_string(c) << "'; X_node: " << (word_xnode?word_xnode->string: "(null)") << endl; } /* @@ -144,26 +145,12 @@ return NULL; } - void set_connector_length_limit(Connector* c) - { - unsigned int len = _opts->short_length; - Connector_set * conset = _sent->dict->unlimited_connector_set; - - if (len > UNLIMITED_LEN) len = UNLIMITED_LEN; - - if (_opts->all_short || - (conset != NULL && !match_in_connector_set(conset, c))) - { - c->length_limit = len; - } - } - bool match(int w1, Connector& cntr1, char dir, int w2, Connector& cntr2) { int dist = w2 - w1; assert(0 < dist, "match() did not receive words in the natural order."); if (dist > cntr1.length_limit || dist > cntr2.length_limit) return false; - return easy_match(cntr1.string, cntr2.string); + return easy_match_desc(cntr1.desc, cntr2.desc); } void insert_connectors(Exp* exp, int& dfs_position, @@ -181,7 +168,7 @@ void add_matches_with_word(WordTag& tag); // Find matches in this word tag with the connector (name, dir). - void find_matches(int w, const char* C, char dir, std::vector& matches); + void find_matches(int w, const condesc_t* C, char dir, std::vector& matches); // A simpler function: Can any connector in this word match a connector wi, pi? 
// It is assumed that diff -Nru link-grammar-5.3.16/link-grammar/score.c link-grammar-5.5.0/link-grammar/score.c --- link-grammar-5.3.16/link-grammar/score.c 2014-12-15 19:38:45.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/score.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,88 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2012, 2014 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - - -#include -#include "api-structures.h" -#include "score.h" -#include "string-set.h" -#include "structures.h" -#include "word-utils.h" - -/** - * This function defines the cost of a link as a function of its length. - */ -static inline int cost_for_length(int length) -{ - return length-1; -} - -/** - * Computes the cost of the current parse of the current sentence, - * due to the length of the links. - */ -static size_t compute_link_cost(Linkage lkg) -{ - size_t lcost, i; - lcost = 0; - for (i = 0; i < lkg->num_links; i++) - { - lcost += cost_for_length(lkg->link_array[i].rw - lkg->link_array[i].lw); - } - return lcost; -} - -static int unused_word_cost(Linkage lkg) -{ - int lcost; - size_t i; - lcost = 0; - for (i = 0; i < lkg->num_words; i++) - lcost += (lkg->chosen_disjuncts[i] == NULL); - return lcost; -} - -/** - * Computes the cost of the current parse of the current sentence - * due to the cost of the chosen disjuncts. 
- */ -static double compute_disjunct_cost(Linkage lkg) -{ - size_t i; - double lcost; - lcost = 0.0; - for (i = 0; i < lkg->num_words; i++) - { - if (lkg->chosen_disjuncts[i] != NULL) - lcost += lkg->chosen_disjuncts[i]->cost; - } - return lcost; -} - -/** Assign parse score (cost) to linkage, used for parse ranking. */ -void linkage_score(Linkage lkg, Parse_Options opts) -{ - lkg->lifo.unused_word_cost = unused_word_cost(lkg); - if (opts->use_sat_solver) - { - lkg->lifo.disjunct_cost = 0.0; - } - else - { - lkg->lifo.disjunct_cost = compute_disjunct_cost(lkg); - } - lkg->lifo.link_cost = compute_link_cost(lkg); - lkg->lifo.corpus_cost = -1.0; - - lg_corpus_score(lkg); -} diff -Nru link-grammar-5.3.16/link-grammar/score.h link-grammar-5.5.0/link-grammar/score.h --- link-grammar-5.3.16/link-grammar/score.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/score.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,21 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2012, 2014 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _SCORE_H -#define _SCORE_H - -#include "api-types.h" -#include "link-includes.h" - -void linkage_score(Linkage, Parse_Options); -#endif /* _SCORE_H */ diff -Nru link-grammar-5.3.16/link-grammar/spellcheck-aspell.c link-grammar-5.5.0/link-grammar/spellcheck-aspell.c --- link-grammar-5.3.16/link-grammar/spellcheck-aspell.c 2016-11-17 18:36:07.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/spellcheck-aspell.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,158 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2009 Vikas N. Kumar */ -/* Copyright (c) 2009 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifdef HAVE_ASPELL - -#include -#include -#include -#include - -#include "link-includes.h" -#include "spellcheck.h" - -#define ASPELL_LANG_KEY "lang" -/* FIXME: Move to a definition file (affix file?). 
*/ -static const char *spellcheck_lang_mapping[] = { -/* link-grammar language , Aspell language key */ - "en", "en_US", - "ru", "ru_RU", - "he", "he_IL", - "de", "de_DE", - "lt", "lt_LT", -}; - -struct linkgrammar_aspell { - AspellConfig *config; - AspellSpeller *speller; -}; - -/** - * create a neew spell-checker for the language 'lang' - */ -void * spellcheck_create(const char * lang) -{ - struct linkgrammar_aspell *aspell = NULL; - size_t i = 0; - AspellCanHaveError *spell_err = NULL; - - for (i = 0; i < sizeof(spellcheck_lang_mapping)/sizeof(char *); i += 2) - { - if (0 != strcmp(lang, spellcheck_lang_mapping[i])) continue; - aspell = (struct linkgrammar_aspell *)malloc(sizeof(struct linkgrammar_aspell)); - if (!aspell) { - prt_error("Error: out of memory. Aspell not used.\n"); - aspell = NULL; - break; - } - aspell->config = NULL; - aspell->speller = NULL; - aspell->config = new_aspell_config(); - if (aspell_config_replace(aspell->config, ASPELL_LANG_KEY, - spellcheck_lang_mapping[i]) == 0) { - prt_error("Error: failed to set language in aspell: %s\n", lang); - delete_aspell_config(aspell->config); - free(aspell); - aspell = NULL; - break; - } - spell_err = new_aspell_speller(aspell->config); - if (aspell_error_number(spell_err) != 0) { - prt_error("Error: Aspell: %s\n", aspell_error_message(spell_err)); - delete_aspell_can_have_error(spell_err); - delete_aspell_config(aspell->config); - free(aspell); - aspell = NULL; - break; - } - aspell->speller = to_aspell_speller(spell_err); - break; - } - return aspell; -} - -/** - * Free memory structures used wiith spell-checker 'chk' - */ -void spellcheck_destroy(void * chk) -{ - struct linkgrammar_aspell *aspell = (struct linkgrammar_aspell *)chk; - if (aspell) { - delete_aspell_speller(aspell->speller); - delete_aspell_config(aspell->config); - free(aspell); - aspell = NULL; - } -} - -/** - * Ask the spell-checker if the spelling looks good. - * Return true if the spelling is good, else false. 
- */ -bool spellcheck_test(void * chk, const char * word) -{ - int val = 0; - struct linkgrammar_aspell *aspell = (struct linkgrammar_aspell *)chk; - if (aspell && aspell->speller) { - /* this can return -1 on failure */ - val = aspell_speller_check(aspell->speller, word, -1); - } - return (val == 1); -} - -int spellcheck_suggest(void * chk, char ***sug, const char * word) -{ - struct linkgrammar_aspell *aspell = (struct linkgrammar_aspell *)chk; - if (!sug) { - prt_error("Error: Aspell. Corrupt pointer.\n"); - return 0; - } - if (aspell && aspell->speller) { - const AspellWordList *list = NULL; - AspellStringEnumeration *elem = NULL; - const char *aword = NULL; - unsigned int size, i; - char **array = NULL; - - list = aspell_speller_suggest(aspell->speller, word, -1); - elem = aspell_word_list_elements(list); - size = aspell_word_list_size(list); - /* allocate an array of char* for returning back to link-parser - */ - array = (char **)malloc(sizeof(char *) * size); - if (!array) { - prt_error("Error: Aspell. 
Out of memory.\n"); - delete_aspell_string_enumeration(elem); - return 0; - } - i = 0; - while ((aword = aspell_string_enumeration_next(elem)) != NULL) { - array[i++] = strdup(aword); - } - delete_aspell_string_enumeration(elem); - *sug = array; - return size; - } - return 0; -} - -void spellcheck_free_suggest(void *chk, char **sug, int size) -{ - int i = 0; - for (i = 0; i < size; ++i) { - free(sug[i]); - sug[i] = NULL; - } - free(sug); -} - -#endif /* #ifdef HAVE_ASPELL */ diff -Nru link-grammar-5.3.16/link-grammar/spellcheck.h link-grammar-5.5.0/link-grammar/spellcheck.h --- link-grammar-5.3.16/link-grammar/spellcheck.h 2016-11-17 18:36:07.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/spellcheck.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,32 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2009 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _SPELLCHECK_H -#define _SPELLCHECK_H - -#include "api-types.h" - -#if (defined HAVE_HUNSPELL) || (defined HAVE_ASPELL) -void * spellcheck_create(const char * lang); -void spellcheck_destroy(void *); -bool spellcheck_test(void *, const char * word); -int spellcheck_suggest(void * chk, char ***sug, const char * word); -void spellcheck_free_suggest(void * chk, char **sug, int size); - -#else -static inline void * spellcheck_create(const char * lang) { return NULL; } -static inline void spellcheck_destroy(void * chk) {} -static inline bool spellcheck_test(void * chk, const char * word) { return false; } -static inline int spellcheck_suggest(void * chk, char ***sug, const char * word) { return 0; } -static inline void spellcheck_free_suggest(void * chk, char **sug, int size) {} -#endif - -#endif /* _SPELLCHECK_H */ diff -Nru link-grammar-5.3.16/link-grammar/spellcheck-hun.c link-grammar-5.5.0/link-grammar/spellcheck-hun.c --- link-grammar-5.3.16/link-grammar/spellcheck-hun.c 2016-11-17 18:36:07.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/spellcheck-hun.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,138 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2009 Linas Vepstas */ -/* Copyright (c) 2009 Vikas N. Kumar */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#include -#include -#include "link-includes.h" -#include "spellcheck.h" - -#ifdef HAVE_HUNSPELL - -#ifndef HUNSPELL_DICT_DIR -#define HUNSPELL_DICT_DIR (char *)0 -#endif /* HUNSPELL_DICT_DIR */ - -static const char *hunspell_dict_dirs[] = { - "/usr/share/myspell/dicts", - "/usr/share/hunspell/dicts", - "/usr/local/share/myspell/dicts", - "/usr/local/share/hunspell/dicts", - "/usr/share/myspell", - "/usr/share/hunspell", - "/usr/local/share/myspell", - "/usr/local/share/hunspell", - HUNSPELL_DICT_DIR -}; - -static const char *spellcheck_lang_mapping[] = { -/* link-grammar language, Hunspell filename */ - "en", "en-US", - "en", "en_US", - "ru", "ru-RU", - "ru", "ru_RU", - "he", "he-IL", - "he", "he_IL", - "de", "de-DE", - "de", "de_DE", - "lt", "lt-LT", - "lt", "lt_LT", -}; - -#define FPATHLEN 256 -static char hunspell_aff_file[FPATHLEN]; -static char hunspell_dic_file[FPATHLEN]; - -#include -#include - -void * spellcheck_create(const char * lang) -{ - size_t i = 0, j = 0; - Hunhandle *h = NULL; - - memset(hunspell_aff_file, 0, FPATHLEN); - memset(hunspell_dic_file, 0, FPATHLEN); - for (i = 0; i < sizeof(spellcheck_lang_mapping)/sizeof(char *); i += 2) - { - if (0 != strcmp(lang, spellcheck_lang_mapping[i])) continue; - - /* check in each hunspell_dict_dir if the files exist */ - for (j = 0; j < sizeof(hunspell_dict_dirs)/sizeof(char *); ++j) - { - FILE *fh; - /* if the directory name is NULL then ignore */ - if (hunspell_dict_dirs[j] == NULL) continue; - - snprintf(hunspell_aff_file, FPATHLEN, "%s/%s.aff", hunspell_dict_dirs[j], - spellcheck_lang_mapping[i+1]); - snprintf(hunspell_dic_file, FPATHLEN, "%s/%s.dic", hunspell_dict_dirs[j], - spellcheck_lang_mapping[i+1]); - - /* Some versions of Hunspell_create() will succeed even if - * there are no dictionary files. So test for permissions. 
- */ - fh = fopen(hunspell_aff_file, "r"); - if (fh) fclose (fh); - else continue; - - fh = fopen(hunspell_dic_file, "r"); - if (fh) fclose (fh); - else continue; - - h = Hunspell_create(hunspell_aff_file, hunspell_dic_file); - /* if hunspell handle was created break from loop */ - if (h != NULL) - break; - } - /* if hunspell handle was created break from loop */ - if (h != NULL) break; - } - return h; -} - -void spellcheck_destroy(void * chk) -{ - Hunhandle *h = (Hunhandle *) chk; - Hunspell_destroy(h); -} - -/** - * Return boolean: 1 if spelling looks good, else zero - */ -bool spellcheck_test(void * chk, const char * word) -{ - if (NULL == chk) - { - prt_error("Error: no spell-check handle specified!\n"); - return 0; - } - - return (bool) Hunspell_spell((Hunhandle *)chk, word); -} - -int spellcheck_suggest(void * chk, char ***sug, const char * word) -{ - if (NULL == chk) - { - prt_error("Error: no spell-check handle specified!\n"); - return 0; - } - - return Hunspell_suggest((Hunhandle *)chk, sug, word); -} - -void spellcheck_free_suggest(void *chk, char **sug, int size) -{ - Hunspell_free_list((Hunhandle *)chk, &sug, size); -} - -#endif /* #ifdef HAVE_HUNSPELL */ diff -Nru link-grammar-5.3.16/link-grammar/string-set.c link-grammar-5.5.0/link-grammar/string-set.c --- link-grammar-5.3.16/link-grammar/string-set.c 2017-01-29 20:56:58.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/string-set.c 2018-01-22 20:45:41.000000000 +0000 @@ -45,15 +45,16 @@ { unsigned int accum = 0; for (;*str != '\0'; str++) - accum = ((7 * accum) + ((unsigned char) *str)) % (ss->size); - return accum; + accum = (7 * accum) + (unsigned char)*str; + return accum % (ss->size); } static unsigned int stride_hash_string(const char *str, const String_set *ss) { unsigned int accum = 0; for (;*str != '\0'; str++) - accum = ((17 * accum) + ((unsigned char) *str)) % (ss->size); + accum = (17 * accum) + (unsigned char)*str; + accum %= ss->size; /* This is the stride used, so we have to make sure 
that * its value is not 0 */ if (accum == 0) accum = 1; @@ -80,11 +81,11 @@ String_set * string_set_create(void) { String_set *ss; - ss = (String_set *) xalloc(sizeof(String_set)); + ss = (String_set *) malloc(sizeof(String_set)); // ss->size = 1013; /* 1013 is a prime number */ // ss->size = 211; /* 211 is a prime number */ ss->size = 419; /* 419 is a prime number */ - ss->table = (char **) xalloc(ss->size * sizeof(char *)); + ss->table = (char **) malloc(ss->size * sizeof(char *)); memset(ss->table, 0, ss->size*sizeof(char *)); ss->count = 0; return ss; @@ -96,12 +97,16 @@ */ static unsigned int find_place(const char * str, String_set *ss) { - unsigned int h, s, i; + unsigned int h, s; h = hash_string(str, ss); + + if ((ss->table[h] == NULL) || (strcmp(ss->table[h], str) == 0)) return h; s = stride_hash_string(str, ss); - for (i=h; true; i = (i + s)%(ss->size)) + while (true) { - if ((ss->table[i] == NULL) || (strcmp(ss->table[i], str) == 0)) return i; + h = h + s; + if (h >= ss->size) h %= ss->size; + if ((ss->table[h] == NULL) || (strcmp(ss->table[h], str) == 0)) return h; } } @@ -113,7 +118,7 @@ old = *ss; ss->size = next_prime_up(3 * old.size); /* at least triple the size */ - ss->table = (char **) xalloc(ss->size * sizeof(char *)); + ss->table = (char **) malloc(ss->size * sizeof(char *)); memset(ss->table, 0, ss->size*sizeof(char *)); ss->count = 0; for (i=0; isize); */ /* fflush(stdout); */ - xfree((char *) old.table, old.size * sizeof(char *)); + free(old.table); } const char * string_set_add(const char * source_string, String_set * ss) @@ -145,10 +150,10 @@ #ifdef DEBUG /* Store the String_set structure address for debug verifications */ len = ((len+1)&~(sizeof(ss)-1)) + 2*sizeof(ss); - str = (char *) xalloc(len); + str = (char *) malloc(len); *(String_set **)&str[len-sizeof(ss)] = ss; #else - str = (char *) xalloc(len+1); + str = (char *) malloc(len+1); #endif strcpy(str, source_string); ss->table[p] = str; @@ -177,8 +182,8 @@ if (ss == NULL) return; 
for (i=0; isize; i++) { - if (ss->table[i] != NULL) xfree(ss->table[i], strlen(ss->table[i]) + 1); + if (ss->table[i] != NULL) free(ss->table[i]); } - xfree((char *) ss->table, ss->size * sizeof(char *)); - xfree((char *) ss, sizeof(String_set)); + free(ss->table); + free(ss); } diff -Nru link-grammar-5.3.16/link-grammar/structures.h link-grammar-5.5.0/link-grammar/structures.h --- link-grammar-5.3.16/link-grammar/structures.h 2017-02-10 20:16:17.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/structures.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,494 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _STRUCTURES_H_ -#define _STRUCTURES_H_ - -#include - -#include "api-types.h" -#include "api-structures.h" -#include "dict-structures.h" /* For Exp, Exp_list */ -#include "histogram.h" /* Count_bin */ - -#define NEGATIVECOST -1000000 -/* This is a hack that allows one to discard disjuncts containing - connectors whose cost is greater than given a bound. This number plus - the cost of any connectors on a disjunct must remain negative, and - this number multiplied times the number of costly connectors on any - disjunct must fit into an integer. */ - -/* Upper bound on the cost of any connector. */ -#define MAX_CONNECTOR_COST 1000.0f - -/* The following define the names of the special strings in the dictionary. 
*/ -#define LEFT_WALL_WORD ("LEFT-WALL") -#define RIGHT_WALL_WORD ("RIGHT-WALL") - -/* Word subscripts come after the subscript mark (ASCII ETX) - * In the dictionary, a dot is used; but that dot interferes with dots - * in the input stream, and so we convert dictionary dots into the - * subscript mark, which we don't expect to see in user input. - */ -#define SUBSCRIPT_MARK '\3' -#define SUBSCRIPT_DOT '.' -#define EMPTY_CONNECTOR "ZZZ" - -/* Dictionary capitalization handling */ -#define CAP1st "1stCAP" /* Next word is capitalized */ -#define CAPnon "nonCAP" /* Next word the lc version of a capitalized word */ - -/* Stems, by definition, end with ".=x" (when x is usually an empty string, i.e. - * ".="). The STEMSUBSCR definition in the affix file may include endings with - * other x values, when x serves as a word subscript, e.g. ".=a". */ -#define STEM_MARK '=' - -/* Suffixes start with it. - * This is needed to distinguish suffixes that were stripped off from - * ordinary words that just happen to be the same as the suffix. - * Kind-of a weird hack, but I'm not sure what else to do... - * Similarly, prefixes end with it. - */ -#define INFIX_MARK(afdict) \ - ((NULL == afdict) ? '\0' : (AFCLASS(afdict, AFDICT_INFIXMARK)->string[0][0])) - -#define UNLIMITED_CONNECTORS_WORD ("UNLIMITED-CONNECTORS") - -#define UNKNOWN_WORD "UNKNOWN-WORD" - -#define MAX_PATH_NAME 200 /* file names (including paths) - should not be longer than this */ - -/* Some size definitions. 
Reduce these for small machines */ -/* MAX_WORD is large, because Unicode entries can use a lot of space */ -#define MAX_WORD 180 /* maximum number of bytes in a word */ -#define MAX_LINE 2500 /* maximum number of chars in a sentence */ - -/* conditional compiling flags */ -#define INFIX_NOTATION - /* If defined, then we're using infix notation for the dictionary */ - /* otherwise we're using prefix notation */ - -#define UNLIMITED_LEN 255 -#define SHORT_LEN 6 -#define NO_WORD 255 - -/* An ordered set of gword pointers, used to indicate the source gword - * (Wordgraph word) of disjuncts and connectors. Usually it contains only - * one element. However, when a duplicate disjunct is eliminated (see - * eliminate_duplicate_disjuncts()) and it originated from a different - * gword (a relatively rare event) its gword is added to the gword_set of - * the remaining disjunct. A set of 3 elements is extremely rare. The - * original order is preserved, in a hope for better caching on - * alternatives match checks in fast-match.c. - * - * Memory management: - * A copy-on-write semantics is used when constructing a new gword_set. It - * means that all the gword sets with one element are shared. These gword - * sets are part of the Gword structure. Copied and added element are - * alloc'ed and chained. The result is that the chain_next of the gword - * sets that are part of each gword contains the list of alloc'ed elements - - * to be used in gword_set_delete() called *only* in sentence_delete(). - * This ensures that the gword_set of connectors doesn't get stale when - * their disjuncts are deleted and later restored in one-parse when - * min_null_count=0 and max_null count>0 (see classic_parse()). - */ -typedef struct gword_set -{ - Gword *o_gword; - struct gword_set *next; - struct gword_set *chain_next; -} gword_set; - -/* On a 64-bit machine, this struct should be exactly 4*8=32 bytes long. - * Lets try to keep it that way. 
- */ -struct Connector_struct -{ - int16_t hash; - uint8_t length_limit; - /* If this is a length limited connector, this - gives the limit of the length of the link - that can be used on this connector. Since - this is strictly a function of the connector - name, efficiency is the only reason to store - this. If no limit, the value is set to 255. */ - uint8_t nearest_word; - /* The nearest word to my left (or right) that - this could ever connect to. Computed by - setup_connectors() */ - bool multi; /* TRUE if this is a multi-connector */ - uint8_t lc_start; /* lc start position (or 0) - for match speedup. */ - uint8_t uc_length; /* uc part length - for match speedup. */ - uint8_t uc_start; /* uc start position - for match speedup. */ - Connector * next; - const char * string; /* The connector name w/o the direction mark, e.g. AB */ - - /* Hash table next pointer, used only during pruning. */ - union - { - Connector * tableNext; - const gword_set *originating_gword; - }; -}; - -static inline void connector_set_string(Connector *c, const char *s) -{ - c->string = s; - c->hash = -1; -} -static inline const char * connector_get_string(Connector *c) -{ - return c->string; -} - -//#ifdef DEBUG -#define VERIFY_MATCH_LIST -//#endif -struct Disjunct_struct -{ - Disjunct *next; - Connector *left, *right; - double cost; - bool marked; /* unmarked disjuncts get deleted */ - /* match_left, right used only during parsing, for the match list. 
*/ - bool match_left, match_right; -#ifdef VERIFY_MATCH_LIST - int match_id; /* verify the match list integrity */ -#endif - gword_set *originating_gword; /* List of originating gwords */ - const char * string; /* subscripted dictionary word */ -}; - -typedef struct Match_node_struct Match_node; -struct Match_node_struct -{ - Match_node * next; - Disjunct * d; -}; - -typedef struct X_node_struct X_node; -struct X_node_struct -{ - const char * string; /* the word itself */ - Exp * exp; - X_node *next; - const Gword *word; /* originating Wordgraph word */ -}; - -/** - * Word, as represented shortly after tokenization, but before parsing. - * - * X_node* x: - * Contains a pointer to a list of expressions from the dictionary, - * Computed by build_sentence_expressions(). - * - * Disjunct* d: - * Contains a pointer to a list of disjuncts for this word. - * Computed by: prepare_to_parse(), but modified by pruning and power - * pruning. - */ -struct Word_struct -{ - const char *unsplit_word; - - X_node * x; /* Sentence starts out with these, */ - Disjunct * d; /* eventually these get generated. */ - bool optional; /* Linkage is optional. */ - - const char **alternatives; -}; - -typedef enum -{ - MT_INVALID, /* Zero, to be changed to the correct type */ - MT_WORD, /* Regular word */ - MT_FEATURE, /* Pseudo morpheme, currently capitalization marks */ - MT_INFRASTRUCTURE, /* Start and end Wordgraph pseudo-words */ - MT_WALL, /* The LEFT-WALL and RIGHT-WALL pseudo-words */ - MT_EMPTY, /* Empty word FIXME: Remove it. */ - MT_UNKNOWN, /* Unknown word (FIXME? Unused) */ - /* Experimental for Semitic languages (yet unused) */ - MT_TEMPLATE, - MT_ROOT, - /* Experimental - for display purposes. - * MT_CONTR is now used in the tokenization step, see the comments there. */ - MT_CONTR, /* Contracted part of a contraction (e.g. y', 's) */ - MT_PUNC, /* Punctuation (yet unused) */ - /* We are not going to have >63 types up to here. 
*/ - MT_STEM = 1<<6, /* Stem */ - MT_PREFIX = 1<<7, /* Prefix */ - MT_MIDDLE = 1<<8, /* Middle morpheme (yet unused) */ - MT_SUFFIX = 1<<9 /* Suffix */ -} Morpheme_type; -#define IS_REG_MORPHEME (MT_STEM|MT_PREFIX|MT_MIDDLE|MT_SUFFIX) - -/* Word status */ -/* - Tokenization */ -#define WS_UNKNOWN (1<<0) /* Unknown word */ -#define WS_REGEX (1<<1) /* Matches a regex */ -#define WS_SPELL (1<<2) /* Result of a spell guess */ -#define WS_RUNON (1<<3) /* Separated from words run-on */ -#define WS_HASALT (1<<4) /* Has alternatives (one or more)*/ -#define WS_UNSPLIT (1<<5) /* It's an alternative to itself as an unsplit word */ -#define WS_INDICT (1<<6) /* boolean_dictionary_lookup() is true */ -#define WS_FIRSTUPPER (1<<7) /* Subword is the lc version of its unsplit_word. - The idea of marking subwords this way, in order to - enable restoring their original capitalization, - may be wrong in general, since in some languages - the process is not always reversible. Instead, - the original word may be saved. */ -/* - Post linkage stage. XXX Experimental. */ -#define WS_PL (1<<14) /* Post-Linkage, not belonging to tokenization */ - -#define WS_GUESS (WS_SPELL|WS_RUNON|WS_REGEX) - -/* XXX Only TS_ANYSPLIT and TS_DONE are actually used. */ -typedef enum -{ - TS_INITIAL, - TS_LR_STRIP, - TS_AFFIX_SPLIT, - TS_REGEX, - TS_RUNON, - TS_SPELL, - TS_ANYSPLIT, /* After anysplit */ - TS_DONE /* Tokenization done */ -} Tokenizing_step; - -/* For the "guess" field of Gword_struct. */ -typedef enum -{ - GM_REGEX = '!', - GM_SPELL = '~', - GM_RUNON = '&', - GM_UNKNOWN = '?' -} Guess_mark; - -#define MAX_SPLITS 10 /* See split_counter below */ - -struct Gword_struct -{ - const char *subword; - - Gword *unsplit_word; /* Upward-going co-tree */ - Gword **next; /* Right-going tree */ - Gword **prev; /* Left-going tree */ - Gword *chain_next; /* Next word in the chain of all words */ - - /* Disjuncts and connectors point back to their originating Gword(s). 
*/ - gword_set gword_set_head; - - /* For debug and inspiration. */ - const char *label; /* Debug label - code locations of tokenization */ - size_t node_num; /* For differentiating words with identical subwords, - and for indicating the order in which word splits - have been done. Shown in the Wordgraph display and in - debug messages. Not used otherwise. Could have been - used for hier_position instead of pointers in order - to optimize its generation and comparison. */ - - /* Tokenizer state */ - Tokenizing_step tokenizing_step; - bool issued_unsplit; /* The word has been issued as an alternative to itself. - It will become an actual alternative to itself only - if it's not the sole alternative, in which case it - will be marked with WS_UNSPLIT. */ - size_t split_counter; /* Incremented on splits. A word cannot split more than - MAX_SPLITS times and a warning is issued then. */ - - unsigned int status; /* See WS_* */ - Morpheme_type morpheme_type; /* See MT_* */ - Gword *alternative_id; /* Alternative start - a unique identifier of - the alternative to which the word belongs. */ - const char *regex_name; /* Subword matches this regex. - FIXME? Extend for multiple regexes. */ - - /* Only used by wordgraph_flatten() */ - const Gword **hier_position; /* Unsplit_word/alternative_id pointer list, up - to the original sentence word. */ - size_t hier_depth; /* Number of pointer pairs in hier_position */ - - /* XXX Experimental. Only used after the linkage (by compute_chosen_words()) - * for an element in the linkage display wordgraph path that represents - * a block of null words that are morphemes of the same word. */ - Gword **null_subwords; /* Null subwords represented by this word */ -}; - -/* Wordgraph path word-positions, - * used in wordgraph_flatten() and sane_linkage_morphism(). - * FIXME Separate to two different structures. */ -struct Wordgraph_pathpos_s -{ - Gword *word; /* Position in the Wordgraph */ - /* Only for wordgraph_flatten(). 
*/ - bool same_word; /* Still the same word - issue an empty word */ - bool next_ok; /* OK to proceed to the next Wordgraph word */ - bool used; /* Debug - the word has been issued */ - /* Only for sane_morphism(). */ - const Gword **path; /* Linkage candidate wordgraph path */ -}; - -/* The regexes are stored as a linked list of the following nodes. */ -struct Regex_node_s -{ - char *name; /* The identifying name of the regex */ - char *pattern; /* The regular expression pattern */ - bool neg; /* Negate the match */ - void *re; /* The compiled regex. void * to avoid - having re library details invading the - rest of the LG system; regex-morph.c - takes care of all matching. - */ - Regex_node *next; -}; - -/* The parse_choice is used to extract links for a given parse */ -typedef struct Parse_choice_struct Parse_choice; - -struct Parse_choice_struct -{ - Parse_choice * next; - Parse_set * set[2]; - Link link[2]; /* the lc fields of these is NULL if there is no link used */ - Disjunct *ld, *md, *rd; /* the chosen disjuncts for the relevant three words */ -}; - -struct Parse_set_struct -{ - short lw, rw; /* left and right word index */ - unsigned short null_count; /* number of island words */ - Connector *le, *re; /* pending, unconnected connectors */ - - s64 count; /* The number of ways to parse. */ - /* s64 recount; Exactly the same as above, but counted at a later stage. */ - // s64 cut_count; /* Count only low-cost parses, i.e. below the cost cutoff */ - // double cost_cutoff; - Parse_choice * first; - Parse_choice * tail; -}; - -struct X_table_connector_struct -{ - Parse_set set; - X_table_connector *next; -}; - -/* ----------------------------------------------------------------- */ -/* Post-processing-related structures */ - -/* The following two structs comprise what is returned by post_process(). 
*/ -typedef struct D_type_list_struct D_type_list; -struct D_type_list_struct -{ - D_type_list * next; - int type; -}; - -struct PP_node_struct -{ - size_t dtsz; - D_type_list **d_type_array; - const char *violation; -}; - -/* Davy added these */ -struct List_o_links_struct -{ - size_t link; /* the link number */ - size_t word; /* the word at the other end of this link */ - List_o_links * next; -}; - -/* from pp_linkset.c */ -typedef struct pp_linkset_node_s -{ - const char *str; - struct pp_linkset_node_s *next; -} pp_linkset_node; - -struct pp_linkset_s -{ - unsigned int hash_table_size; - unsigned int population; - pp_linkset_node **hash_table; /* data actually lives here */ -}; - - -/* from pp_lexer.c */ -#define PP_LEXER_MAX_LABELS 512 - -typedef struct pp_label_node_s -{ - /* linked list of strings associated with a label in the table */ - const char *str; - struct pp_label_node_s *next; -} pp_label_node; /* next=NULL: end of list */ - - -/* from pp_knowledge.c */ -typedef struct StartingLinkAndDomain_s -{ - const char *starting_link; - int domain; /* domain which the link belongs to (-1: terminator)*/ -} StartingLinkAndDomain; - -typedef struct pp_rule_s -{ - /* Holds a single post-processing rule. 
Since rules come in many - flavors, not all fields of the following are always relevant */ - const char *selector; /* name of link to which rule applies */ - pp_linkset *link_set; /* handle to set of links relevant to rule */ - int link_set_size; /* size of this set */ - int domain; /* type of domain to which rule applies */ - const char **link_array; /* array holding the spelled-out names */ - const char *msg; /* explanation (NULL=end sentinel in array)*/ - int use_count; /* Number of times rule has been applied */ -} pp_rule; - -typedef struct PPLexTable_s PPLexTable; -struct pp_knowledge_s -{ - PPLexTable *lt; /* Internal rep'n of sets of strings from knowledge file */ - const char *path; /* Name of file we loaded from */ - - /* handles to sets of links specified in knowledge file. These constitute - auxiliary data, necessary to implement the rules, below. See comments - in post-process.c for a description of these. */ - pp_linkset *domain_starter_links; - pp_linkset *urfl_domain_starter_links; - pp_linkset *urfl_only_domain_starter_links; - pp_linkset *domain_contains_links; - pp_linkset *must_form_a_cycle_links; - pp_linkset *restricted_links; - pp_linkset *ignore_these_links; - pp_linkset *left_domain_starter_links; - - /* arrays of rules specified in knowledge file */ - pp_rule *form_a_cycle_rules; - pp_rule *contains_one_rules; - pp_rule *contains_none_rules; - pp_rule *bounded_rules; - - size_t n_form_a_cycle_rules; - size_t n_contains_one_rules; - size_t n_contains_none_rules; - size_t n_bounded_rules; - - size_t nStartingLinks; - pp_linkset *set_of_links_starting_bounded_domain; - StartingLinkAndDomain *starting_link_lookup_table; - String_set *string_set; -}; - -#endif diff -Nru link-grammar-5.3.16/link-grammar/tokenize/anysplit.c link-grammar-5.5.0/link-grammar/tokenize/anysplit.c --- link-grammar-5.3.16/link-grammar/tokenize/anysplit.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/anysplit.c 2018-04-22 
14:57:33.000000000 +0000 @@ -0,0 +1,629 @@ +/*************************************************************************/ +/* Copyright (c) 2014 Amir Plivatsky */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +/** + * anysplit.c -- code that splits words into random morphemes. + * This is used for the language-learning/morpheme-learning project. + */ + +/* General assumptions: + * - false is binary 0 (for memset()) + * - int is >= 32 bit (for random number) + */ + +#include "utilities.h" /* included first, for MSVC rand_s() */ + +#include +#include +#include +#include +#include +#include + +#include "api-structures.h" +#include "dict-common/dict-affix.h" +#include "dict-common/dict-common.h" +#include "print/print-util.h" // For patch_subscript_mark() +#include "dict-common/regex-morph.h" +#include "error.h" +#include "externs.h" +#include "tokenize.h" +#include "tok-structures.h" + +#include "anysplit.h" + + +#define MAX_WORD_TO_SPLIT 31 /* in codepoins */ + +extern const char * const afdict_classname[]; + +typedef int p_start; /* partition start in a word */ +typedef p_start *p_list; /* list of partitions in a word */ + +typedef struct split_cache /* split cached by word length */ +{ + size_t nsplits; /* number of splits */ + p_list sp; /* list of splits */ + bool *p_tried; /* list of tried splits */ + bool *p_selected; /* list of selected splits */ +} split_cache; + +typedef struct anysplit_params +{ + int nparts; /* maximum number of suffixes to split to */ + size_t altsmin; /* minimum number of alternatives to generate */ + size_t altsmax; /* maximum number of alternatives to generate */ + 
Regex_node *regpre, *regmid, *regsuf; /* issue matching combinations */ + split_cache scl[MAX_WORD_TO_SPLIT+1]; /* split cache according to word length */ +} anysplit_params; + +#define DEBUG_ANYSPLIT 0 + + +#if DEBUG_ANYSPLIT +static const char *gw; +/* print the current partitions */ +static void printsplit(int *ps, int n) +{ + static int sn = 0; /* split number */ + int pos = 0; + int p; + int l = strlen(gw); + + printf("split %d: ", sn++); + for (pos = 0, p = 0; pos < l && p <= n; pos++) + { + if (pos == ps[p]) + { + p++; + putchar(' '); + } + putchar(gw[pos]); + } + putchar('\n'); +} +static void printps(int *ps, int n) +{ + int i; + + printf("printps:"); + for (i = 0; i<=n; i++) printf(" ps[%d]=%d", i, ps[i]); + printf("\n"); +} +#endif + +static void cache_partitions(p_list pl, int *ps, int p) +{ + memcpy(pl, ps, sizeof(p_start) * p); +} + + /* p = 5 */ + /* 0 1 2 3 */ + /* | | | | */ + /* 123456789 */ + /* l = 9 */ + /* */ + /* n = 4 */ + /* ps[0] = 2 */ + /* ps[1] = 5 */ + /* ps[2] = 7 */ + /* ps[3] = 9 */ + +/** + * `scl`: If NULL, return the index of the last split, else cache the + * splits into scl. + */ +static int split_and_cache(int word_length, int nparts, split_cache *scl) +{ + + int n; + int maxindex; + p_list ps = alloca(sizeof(p_start)*nparts); /* partition start */ + + if (0 == word_length) return 0; + + /* The first partitioning is the whole word. + * (Using a first dummy partition would make this code unneeded.) + * But in any case the whole word here is unneeded, and I'm + * too lazy to change that. 
+ */ + ps[0] = word_length; + maxindex = 0; + if (scl) cache_partitions(&scl->sp[0], ps, nparts); + + /* Generate all possible partitions up to nparts partitions */ + for (n = 1; n < nparts; n++) + { + /* increase the number of partitions */ + int m = 0; + int t; + + ps[0] = 1; + ps[n] = word_length; /* set last partition end (dummy partition start) */ + + //printf("New number of partitions: n=%d\n", n); + do + { + /* set next initial partitions lengths to 1 */ + //printf("Initialize: m=%d\n", m); + for (t = m; t < n; t++) + { + ps[t] = ps[m] + (t-m); + //printf("ps[%d]=%d ", t, ps[t]); + } + //printf("\n"); + + /* move last partition */ + //printf("Moving m=%d ps[m]=%d ps[m+1]=%d\n", n-1, ps[n-1], ps[n]); + for (m = n-1; ps[m] < ps[m+1]; ps[m]++) + { + maxindex++; + if (scl) cache_partitions(&scl->sp[maxindex*nparts], ps, nparts); + +#if DEBUG_ANYSPLIT + printsplit(ps, n); + printps(ps, n); +#endif + } + + /* last partition got to size 1, backtrack */ + do + { + //printf("Backtrack m %d->%d\n", m, m-1); + m--; + /* continue as long as there is a place to move for partition m */ + } while (m >= 0 && ps[m] + 1 == ps[m+1]); + if (m >= 0) ps[m]++; + } while (m >= 0); /* we have still positions to move */ + //printf("End (n=%d)\n", n); + } + + return maxindex+1; +} + +void free_anysplit(Dictionary afdict) +{ + size_t i; + anysplit_params *as = afdict->anysplit; + + if (NULL == as) return; + + for (i = 0; i < ARRAY_SIZE(as->scl); i++) + { + if (NULL == as->scl[i].sp) continue; + free(as->scl[i].sp); + free(as->scl[i].p_selected); + free(as->scl[i].p_tried); + } + free_regexs(as->regpre); + free_regexs(as->regmid); + free_regexs(as->regsuf); + free(as); + afdict->anysplit = NULL; +} + +/* + * Returns: Number of splits. 
+ */ +static int split(int word_length, int nparts, split_cache *scl) +{ + size_t nsplits; + + if (NULL == scl->sp) + { + nsplits = split_and_cache(word_length, nparts, NULL); + //printf("nsplits %zu\n", nsplits); + if (0 == nsplits) + { + prt_error("Error: nsplits=0 (word_length=%d, nparts=%d)\n", + word_length, nparts); + return 0; + } + scl->sp = malloc(sizeof(p_start)*nparts * nsplits); + scl->p_selected = malloc(sizeof(*(scl->p_selected)) * nsplits); + scl->p_tried = malloc(sizeof(*(scl->p_tried)) * nsplits); + split_and_cache(word_length, nparts, scl); + scl->nsplits = nsplits; + } + + memset(scl->p_selected, false, sizeof(*(scl->p_selected)) * scl->nsplits); + memset(scl->p_tried, false, sizeof(*(scl->p_tried)) * scl->nsplits); + return scl->nsplits; +} + +/** + * Return a number between 0 and nsplits-1, including. + * No need for a good randomness; mediocre randomness is enough. + * We suppose int is 32 bit. + */ +static int rng_uniform(unsigned int *seedp, size_t nsplits) +{ + int res; + + res = rand_r(seedp); + + /* I don't mind the slight skew */ + return res % nsplits; + +} + +/* lutf is the length of the string, measured in code-points, + * blen is the length of the string, measured in bytes. + */ +#define D_MM 7 +static bool morpheme_match(Sentence sent, + const char *word, size_t lutf, p_list pl) +{ + Dictionary afdict = sent->dict->affix_table; + anysplit_params *as = afdict->anysplit; + size_t bos = 0, cpos = 0; /* byte offset, code-point offset */ + int p; + Regex_node *re; + size_t blen = strlen(word); + char *prefix_string = alloca(blen+1); + + lgdebug(+D_MM, "word=%s: ", word); + for (p = 0; p < as->nparts; p++) + { + size_t b = utf8_strncpy(prefix_string, &word[bos], pl[p]-cpos); + prefix_string[b] = '\0'; + bos += b; + + /* For flexibility, REGRPE is matched only to the prefix part, + * REGMID only to the middle suffixes, and REGSUF only to the + * suffix part - which cannot be the prefix. 
*/ + if (0 == p) re = as->regpre; + else if (pl[p] == (int) lutf) re = as->regsuf; + else re = as->regmid; + lgdebug(D_MM, "re=%s part%d=%s: ", re?re->name:"(nil)", p, prefix_string); + + /* A NULL regex always matches */ + if ((NULL != re) && (NULL == match_regex(re, prefix_string))) + { + lgdebug(D_MM, "No match\n"); + return false; + } + + cpos = pl[p]; + if (cpos == lutf) break; + } + + lgdebug(D_MM, "Match\n"); + return true; +} +#undef D_MM + +static Regex_node * regbuild(const char **regstring, int n, int classnum) +{ + Regex_node *regex_root = NULL; + Regex_node **tail = ®ex_root; /* Last Regex_node in list */ + Regex_node *new_re; + int i; + + for (i = 0; i < n; i++) + { + const char *r = regstring[i]; + + /* Create a new Regex_node and add to the list. */ + new_re = malloc(sizeof(*new_re)); + new_re->name = strdup(afdict_classname[classnum]); + new_re->re = NULL; + new_re->next = NULL; + new_re->neg = ('!' == r[0]); + if (new_re->neg || (0 == strncmp(r, "\\!", 2))) r++; + new_re->pattern = strdup(r); + /* read_entry() (read-dict.c) invokes patch_subscript() also for the affix + * file. As a result, if a regex contains a dot it is patched by + * SUBSCRIPT_MARK. We undo it here. */ + patch_subscript_mark(new_re->pattern); + + *tail = new_re; + tail = &new_re->next; + } + return regex_root; +} + + +/** + * Affix classes: + * REGPARTS Max number of word partitions. Value 0 disables anysplit. + * REGPRE Regex for prefix + * REGMID Regex for middle suffixes + * REGSUF Regex for suffix + * REGALTS Number of alternatives to issue for a word. + * Two values: minimum and maximum. + * If the word has more possibilities to split than the minimum, + * but less then the maximum, then issue them unconditionally. + */ + +/** + * Initialize the anysplit parameter and cache structure. + * Return true if initialization succeeded, or if dictionary does not use + * anysplit (its not an error to not use anysplit!). Return false if + * init failed. 
+ */ +#define D_AI (D_DICT+0) +bool anysplit_init(Dictionary afdict) +{ + anysplit_params *as; + size_t i; + + Afdict_class *regpre = AFCLASS(afdict, AFDICT_REGPRE); + Afdict_class *regmid = AFCLASS(afdict, AFDICT_REGMID); + Afdict_class *regsuf = AFCLASS(afdict, AFDICT_REGSUF); + + Afdict_class *regalts = AFCLASS(afdict, AFDICT_REGALTS); + Afdict_class *regparts = AFCLASS(afdict, AFDICT_REGPARTS); + + if (0 == regparts->length) + { + if (verbosity_level(+D_AI)) + prt_error("Warning: File %s: Anysplit disabled (%s not defined)\n", + afdict->name, afdict_classname[AFDICT_REGPARTS]); + return true; + } + if (1 != regparts->length) + { + prt_error("Error: File %s: Must have %s defined with one value\n", + afdict->name, afdict_classname[AFDICT_REGPARTS]); + return false; + } + + as = malloc(sizeof(anysplit_params)); + for (i = 0; i < ARRAY_SIZE(as->scl); i++) as->scl[i].sp = NULL; + afdict->anysplit = as; + + as->regpre = regbuild(regpre->string, regpre->length, AFDICT_REGPRE); + as->regmid = regbuild(regmid->string, regmid->length, AFDICT_REGMID); + as->regsuf = regbuild(regsuf->string, regsuf->length, AFDICT_REGSUF); + + if (compile_regexs(as->regpre, NULL) != 0) return false; + if (compile_regexs(as->regmid, NULL) != 0) return false; + if (compile_regexs(as->regsuf, NULL) != 0) return false; + + as->nparts = atoi(regparts->string[0]); + if (as->nparts < 0) + { + free_anysplit(afdict); + prt_error("Error: File %s: Value of %s must be a non-negative number\n", + afdict->name, afdict_classname[AFDICT_REGPARTS]); + return false; + } + if (0 == as->nparts) + { + free_anysplit(afdict); + prt_error("Warning: File %s: Anysplit disabled (0: %s)\n", + afdict->name, afdict_classname[AFDICT_REGPARTS]); + return true; + } + + if (2 != regalts->length) + { + free_anysplit(afdict); + prt_error("Error: File %s: Must have %s defined with 2 values\n", + afdict->name, afdict_classname[AFDICT_REGALTS]); + return false; + } + as->altsmin = atoi(regalts->string[0]); + as->altsmax = 
atoi(regalts->string[1]); + if ((atoi(regalts->string[0]) <= 0) || (atoi(regalts->string[1]) <= 0)) + { + free_anysplit(afdict); + prt_error("Error: File %s: Value of %s must be 2 positive numbers\n", + afdict->name, afdict_classname[AFDICT_REGALTS]); + return false; + } + + return true; +} +#undef D_AI + +/** + * Split randomly. + * Return true on success. + * Return false when: + * - disabled (i.e. when doing regular language processing). + * - an error occurs (the behavior then is undefined). + * Such an error has not been observed yet. + */ +#define D_AS 5 +bool anysplit(Sentence sent, Gword *unsplit_word) +{ + const char * word = unsplit_word->subword; + Dictionary afdict = sent->dict->affix_table; + anysplit_params *as; + Afdict_class * stemsubscr; + + size_t l = strlen(word); + size_t lutf = utf8_strlen(word); + p_list pl; + size_t bos, cpos; /* byte offset, codepoint offset */ + int p; + int sample_point; + size_t nsplits; + size_t rndtried = 0; + size_t rndissued = 0; + size_t i; + unsigned int seed = sent->rand_state; + char *affix = alloca(l+2+1); /* word + ".=" + NUL: Max. 
affix length */ + bool use_sampling = true; + + if (NULL == afdict) return false; + as = afdict->anysplit; + + if ((NULL == as) || (0 == as->nparts)) return false; /* Anysplit disabled */ + + if (lutf > MAX_WORD_TO_SPLIT) + { + Gword *alt = issue_word_alternative(sent, unsplit_word, "AS>", + 0,NULL, 1,&word, 0,NULL); + tokenization_done(sent, alt); + return true; + } + + if (0 == l) + { + prt_error("Warning: anysplit(): word length 0\n"); + return false; + } + + stemsubscr = AFCLASS(afdict, AFDICT_STEMSUBSCR); + + // seed = time(NULL)+(unsigned int)(long)&seed; + +#if DEBUG_ANYSPLIT + gw = word; +#endif + + nsplits = split(lutf, as->nparts, &as->scl[lutf]); + if (0 == nsplits) + { + prt_error("Warning: anysplit(): split() failed (shouldn't happen)\n"); + return false; + } + + if (as->altsmax >= nsplits) + { + /* Issue everything */ + sample_point = -1; + use_sampling = false; + } + + lgdebug(+D_AS, "Start%s sampling: word=%s, nsplits=%zu, maxsplits=%d, " + "as->altsmin=%zu, as->altsmax=%zu\n", use_sampling ? 
"" : " no", + word, nsplits, as->nparts, as->altsmin, as->altsmax); + + while (rndtried < nsplits && (!use_sampling || (rndissued < as->altsmax))) + { + if (use_sampling) + { + sample_point = rng_uniform(&seed, nsplits); + + if (sample_point < 0) /* Cannot happen with rand_r() */ + { + prt_error("Error: rng: %s\n", strerror(errno)); + return false; + } + } + else + { + sample_point++; + } + + lgdebug(D_AS, "Sample: %d ", sample_point); + if (as->scl[lutf].p_tried[sample_point]) + { + lgdebug(D_AS+1, "(repeated)\n"); + continue; + } + lgdebug(D_AS+1, "(new)"); + rndtried++; + as->scl[lutf].p_tried[sample_point] = true; + if (morpheme_match(sent, word, lutf, &as->scl[lutf].sp[sample_point*as->nparts])) + { + as->scl[lutf].p_selected[sample_point] = true; + rndissued++; + } + else + { + lgdebug(D_AS, "\n"); + } + } + + lgdebug(D_AS, "Results: word '%s' (byte-length=%zu utf-chars=%zu): %zu/%zu:\n", + word, lutf, l, rndissued, nsplits); + + for (i = 0; i < nsplits; i++) + { + const char **affixes = NULL; + int num_sufixes; + int num_affixes = 0; + + if (!as->scl[lutf].p_selected[i]) continue; + + pl = &as->scl[lutf].sp[i*as->nparts]; + bos = 0; + cpos = 0; + for (p = 0; p < as->nparts; p++) + { + size_t b = 0; + if (pl[0] == (int)lutf) /* This is the whole word */ + { + b = utf8_strncpy(affix, &word[bos], pl[p]-cpos); + affix[b] = '\0'; + } + else + if (0 == cpos) /* The first, but not the only morpheme */ + { + b = utf8_strncpy(affix, &word[bos], pl[p]-cpos); + affix[b] = '\0'; + } + else /* 2nd and subsequent morphemes */ + { + b = utf8_strncpy(affix, &word[bos], pl[p]-cpos); + affix[b] = '\0'; + num_affixes++; + } + altappend(sent, &affixes, affix); + + bos += b; + cpos = pl[p]; + // if (cpos == lutf) break; /* Same thing as below...*/ + if (bos == l) break; + } + + const char **prefix_position, **stem_position , **suffix_position; + switch (num_affixes) + { + case 0: + prefix_position = NULL; + stem_position = &affixes[0]; /* May be just a word here */ + 
suffix_position = NULL; + num_sufixes = 0; + break; + case 1: + prefix_position = NULL; + stem_position = &affixes[0]; + suffix_position = &affixes[1]; + num_sufixes = 1; + break; + default: + prefix_position =&affixes[0]; + stem_position = &affixes[1]; + suffix_position = &affixes[2]; + num_sufixes = num_affixes - 1; + break; + } + if (num_affixes > 0) + { + if (0 != stemsubscr->length) { + strcpy(affix, stem_position[0]); + strcat(affix, stemsubscr->string[0]); + stem_position[0] = affix; + } + } + + // XXX FIXME -- this is wrong - it assumes a + // variable number of suffixes. + /* Here a leading INFIX_MARK is added to the suffixes if needed. */ + Gword *alt = issue_word_alternative(sent, unsplit_word, "AS", + (NULL == prefix_position) ? 0 : 1, prefix_position, + 1, stem_position, + num_sufixes, suffix_position); + tokenization_done(sent, alt); + free(affixes); + } + + /* 0 == sent->rand_state denotes "repeatable rand". */ + if (0 != sent->rand_state) sent->rand_state = seed; + return true; +} +#undef D_AS diff -Nru link-grammar-5.3.16/link-grammar/tokenize/anysplit.h link-grammar-5.5.0/link-grammar/tokenize/anysplit.h --- link-grammar-5.3.16/link-grammar/tokenize/anysplit.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/anysplit.h 2017-05-28 11:14:43.000000000 +0000 @@ -0,0 +1,17 @@ +/*************************************************************************/ +/* Copyright (c) 2014 Amir Plivatsky */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#include "api-types.h" +#include "link-includes.h" + +bool anysplit(Sentence, Gword *); +void free_anysplit(Dictionary); +bool anysplit_init(Dictionary); diff -Nru link-grammar-5.3.16/link-grammar/tokenize/README.md link-grammar-5.5.0/link-grammar/tokenize/README.md --- link-grammar-5.3.16/link-grammar/tokenize/README.md 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/README.md 2018-04-25 17:04:50.000000000 +0000 @@ -0,0 +1,387 @@ +Version 5.5.0 - Official wordgraph support +========================================== + +As of version 5.5.0, the default configuration includes the word-graph +display. A new API function +`bool sentence_display_wordgraph(Sentence sent, const char *modestr);` +has been added, and `link-parser` can use it (controlled by the +`!wordgraph` user variable). + +Version 5.3.0 - Introduction of a word-graph for tokenizing +=========================================================== + +In this version the tokenizer code has been heavily modified once again. +Instead of tokenizing all the words in one pass directly into the +word-array of the parser, it now tokenizes them using a graph which its +nodes are "subwords". Initially the original sentence words are tokenized +to subwords only by whitespace. After that step, each subword is handled in +its turn, and gets split to further subwords if needed. A special care is +taken if one of the alternatives of a subword is the subword itself +(alone, which is usual, or with a dict-cap token - a new experimental +mechanism which is described below). + +The new way of tokenizing is much more flexible that the old one, and it +serves as an infrastructure on which new tokenizing and display features +can be implemented. One of them, that is implemented in this release, is +spelling for languages other then English. 
This is now possible because the +spell result can now pass further tokenization if needed. This also +enhances the spelling for English in case the spell result is a +contraction. In addition, the multi-level of tokenization, inherent to the +word-graph, allows multiple run-on and spell-correction fixes on the same +word at once. + +The tokenizing code is still based much on the old code and further +work is needed to clean it up (or to replace it, e.g. by a +regex-tokenizer). It still doesn't use the full power of the word-graph, +and there are constructs that need to be tokenized but they are not (they +are also not in the sentence test batches). E.g. `--` between words without +whitespace. + +There is still no API to get information from the word-graph. In particular, +it is not possible to find out the sentence words after punctuation +tokenization, as in previous releases. + +Since the parser cannot currently use the word-graph directly, there is a +need to convert it to the 2D-word-array that it uses. This is implemented +by the function `flatten_wordgraph()`, which uses a complex algorithm: It +scans all the word-graph paths in parallel, trying to advance to the next +words and to issue them into the 2D-word-array. + +It advances to the next word of a given word in rounds, using two passes, one +word per word-graph path on each round: +Pass 1. Next words which are in the same alternative of the given word. +Pass 2. Next words which are in a different alternative (of the common +ancestor word) of words that has already been advanced to in pass 1 +and this pass. + +The words that got advanced to are issued into the 2D-word-array. It is +possible that the second pass above cannot get advance in particular +word-graph path, because the next word is in the same alternative of one of +the next words in other paths. In that case an empty word is issued. 
This +constraint arises because all the next words in each word-graph advance +round, are issued into the same 2D-word-array "slot", which cannot hold +words from the same alternative. + +As in the previous versions, due to the way alternatives are implemented, +morphemes from different word-tokenization alternatives can be mixed in a +linkage. Such linkages are of course useless and need to be detected and +discarded. This is done by the `sane_linkage_morphism()` function. In the +previous versions it validated that the chosen words (the words in the +linkage) that are subwords (e.g. morphemes) of a sentence word, all belong +to a single tokenization alternative of that word. +It now works in another way - it validates that the chosen words create a +path in the word-graph. In case of "null-words" - words with no linkage - +the first path which is encountered is used. It means that a word in the +word-graph path corresponding to a null-word, may be only one of the potential +possibilities. + +Word-graph display +------------------ +Another feature that has been implemented, mainly for debug (but it can +also be useful for inspiration and fun), is displaying a graphical +representation of the word graph. The graphs can be displayed in several +ways, controlled by one-letter flags. The command `!test=wg` enables the +display of a graphs which includes no "prev" links for clarity, but +includes "unsplit word" links. Other graphical representation modes can be +enabled by `!test=wg:FLAGS`, when FLAGS are lowercase letters as defined in +wordgraph.h. For example, `!test=wg:sl` displays unsplit words as subgraphs, +with a legend, and adding the `p` flag (i.e. `!test=wg:slp`) adds "prev" links. +The graphical display still needs improvements in order to be able to +display complex word-graph situations in a simple manner. 
The graphical +display code is not compiled by default because it involves invocation of +an external program (`dot`) and in addition, files are created, both things +may not be desired by some users. Use `--enable-wordgraph-display` to enable +this feature. + +On Windows this feature is enabled when compiled with `USE_WORDGRAPH_DISPLAY`. +See "../msvcNN/RDADME" (NN is the MSVC version) for further details. + +Quote handling +-------------- +Quotes now are not discarded, but are considered to be regular dict tokens. +In this version they have no significant linkage and always attach to the word +before them (or to the LEFT-WALL). In order to compare detailed batch runs with +previous versions of the library, a `!test=removeZZZ` can be used to remove the +quote display. + +Handling capitalized words +-------------------------- +Not as in previous releases, capital letters which got downcased are not +restored for display if the affected words have a linkage. + +A new experimental handling of capital words using the dictionary has been +introduced. It inserts the token `1stCAP` before the uc version, and `nonCAP` +before the lc one, as discussed in: +https://groups.google.com/forum/?hl=en#!topic/link-grammar/hmK5gjXYWbk +It is enabled by `!test=dictcap`. The special "dictcap" tokens are not yet +discarded, so in order to compare results to previous library versions, the +following can be used: `!test=dictcap,removeZZZ`. + + +HOWTO use the new regex tokenizer/splitter +========================================== +It's new, experimental code. + +To compile: `../configure --enable-regex-tokenizer` + + + +- At the linkparser> prompt, enter: +`!/REGEX/,tokentosplit` + +Currently, if tokentosplit contains white space, command-line.c discards +it. +Also, case is currently observed. 
+ +The regex syntax is designed so the regex is a valid one (although +meaningless) as written, so compiling it would reveal syntax errors in +it (the result of this initial compilation is not used). + +- All the /regexes/ are anchored at their start and end, as if `^` and `$` + were used. +- Mentioning a word class (x is an optional constraint, defaults to + `.*`): + +`(?x)` + +CLASS can be: + * DICTWORD, to match a word from `4.0.dict`. + * An affix class name (takes priority if there is a regex with the same + name). + * A regex name from `4.0.regex` (prefix it with `r` if there is such an + affix class). + +For regexes from `4.0.regex`, the code combine the ones with the same +name, taking care to omit the `^` and `$` from each, if exist (constraints +are said to be supported (not tested) and can be added if needed, but I +could not find an example of anything useful). + +DICTWORD can be optionally followed by a word mark, which is taken from +the affix file: + + * DICTWORDaM append M to DICTWORD before looking it up. + * DICTWORDpM prepend M to DICTWORD before looking it up. + +If M contains more than one word (in the affix file), only the first one +is currently used. + + +Examples: + * `(?)` match a suffix from the affix file + * `(?)` match the regex `NUMBER`. + * `(?)` match `UNITS` from the affix file. + * `(?)` match `UNITS` from the regex file. + * `(?)` match a dictionary word. + * `)` match word.= (if `STEMSUBSCR` is ^C=). + * `x)` ): +Matching single letters by DISTWORD (because they are in the dict) may +note be desired. +In such a case _x_ can be constrained to include 2 letters at least, plus +the desired 1-letter words. +E.g.: `(?.{2,}|a)` , which matches words of 2 letters and more, +plus the word `a`. + +- Currently the outer part of the regex should not contain alternations. + This is because I was too lazy to add code for adding `(?:...)` over it +in such cases. 
So in order to contain alternations the `(?:...)` should +currently be added by hand, as in: + +`/(?:(?)|(?))*/,dfs,dsfadsdsa,.?!sfads` + +- Holes are not supported. For example, this is not fine (and not + tested): + +`/(?)-(?)/,khasdkflhdsfa` + +because the `-` character would create a hole in the result. +But this is fine (and also not tested...): + +`/(?)(-)(?)/,asdfkjajfahlad` + +Currently, since named capturing groups are used for classes, if the same +class name is used more than once, there may be a need to start the regex +by `(?J)`. This will be fixed later. + +- The regex cannot include nested capture groups, so inner groups, if + needed, should be non-capturing ones. + +This is because currently the matching groups create a linear string, +without holes. +If you will find a use for internal capture groups, I can use them. +Because of that, backreferences in regexes from the regex file are not +supported (but there are currently none...). + +So this is not valid (a DICTWORD which matches a `NUMBER`): + +`/(?))/,qazwsx` + +and this too (a nonsense constraint for demo): + +`/(?([A-Z][0-9])*)/,qazwsx` + +but this should be fine: + +`/(?(?:[A-Z][0-9])*)/,qazwsx` + + +Some fun examples: + +``` +!/(.*)*/,test +Modified pattern: (?:(.*)(?C))*$(?C1) +Alternative 1: + 0 (1): test (0,4) + 1 (1): (4,4) +Alternative 2: + 0 (1): test (0,4) +Alternative 3: + 0 (1): tes (0,3) + 1 (1): t (3,4) + 2 (1): (4,4) +Alternative 4: + 0 (1): tes (0,3) + 1 (1): t (3,4) +Alternative 5: + 0 (1): te (0,2) + 1 (1): st (2,4) + 2 (1): (4,4) +[...] +Alternative 14: + 0 (1): t (0,1) + 1 (1): e (1,2) + 2 (1): st (2,4) +Alternative 15: + 0 (1): t (0,1) + 1 (1): e (1,2) + 2 (1): s (2,3) + 3 (1): t (3,4) + 4 (1): (4,4) +Alternative 16: + 0 (1): t (0,1) + 1 (1): e (1,2) + 2 (1): s (2,3) + 3 (1): t (3,4) +``` + +(Some appear "twice" due to the terminating null match. I think I will +discard such matches.). 
+ +With splits to 2 parts only: +``` +linkparser> !/(.*){2}/,test +Modified pattern: (?:(.*)(?C)){2}$(?C1) +Alternative 1: + 0 (1): test (0,4) + 1 (1): (4,4) +Alternative 2: + 0 (1): tes (0,3) + 1 (1): t (3,4) +Alternative 3: + 0 (1): te (0,2) + 1 (1): st (2,4) +Alternative 4: + 0 (1): t (0,1) + 1 (1): est (1,4) +Alternative 5: + 0 (1): test (0,4) +linkparser> +``` + +``` +!/(?:(?.{2,}|a)(?)?)+/,theriver,dangeroustonavigatebutimportantforcommerce,hasmanyshoals. +``` +(This is one long line, just test it...) + + +`!/(?)(?)*(?)*/,123.2milligram/terag/?!`
+(test it...) + +``` +!/(?)(?)/,there's +Modified pattern: (?:(?.*)(?C))(?:(?.*)(?C))$(?C1) +Alternative 1: + 0 (1): there (0,5) [DICTWORD] + 1 (2): 's (5,7) [SUF] +linkparser> +``` + + +In the next example, we get only whole word and double-dash because +it can only match wpwp (when w is DICTWORD and p is `--`). + +``` +!/(?:(?)(?))+/,this--is-- +Modified pattern: (?:(?:(?.*)(?C))(?:(?.*)(?C)))+$(?C1) +Alternative 1: + 0 (1): this (0,4) [DICTWORD] + 1 (2): -- (4,6) [LPUNC] + 2 (1): is (6,8) [DICTWORD] + 3 (2): -- (8,10) [LPUNC] + 4 (1): (10,10) [DICTWORD] +linkparser> +``` + +However, this breaks to single characters, as expected: +``` +!/(?:(?)(?:(?))*)+/,this--is-- +... +Alternative 360: + 0 (1): t (0,1) [DICTWORD] + 1 (1): h (1,2) [DICTWORD] + 2 (1): i (2,3) [DICTWORD] + 3 (1): s (3,4) [DICTWORD] + 4 (1): - (4,5) [DICTWORD] + 5 (1): - (5,6) [DICTWORD] + 6 (1): i (6,7) [DICTWORD] + 7 (1): s (7,8) [DICTWORD] + 8 (1): - (8,9) [DICTWORD] + 9 (1): - (9,10) [DICTWORD] +10 (1): (10,10) [DICTWORD] +linkparser> +``` + +But this stops after the first match: +``` +!/(?:(?)(?:(?)(*COMMIT))*)+/,this--is-- +Alternative 1: + 0 (1): this (0,4) [DICTWORD] + 1 (2): -- (4,6) [LPUNC] + 2 (1): is (6,8) [DICTWORD] + 3 (2): -- (8,10) [LPUNC] + 4 (1): (10,10) [DICTWORD] +linkparser> +```` + +And this is even more interesting: +``` +!/(?:(?)(*COMMIT)(?:(?))*)+/,this--is-- +Alternative 1: + 0 (1): this (0,4) [DICTWORD] + 1 (2): -- (4,6) [LPUNC] + 2 (1): is (6,8) [DICTWORD] + 3 (2): -- (8,10) [LPUNC] + 4 (1): (10,10) [DICTWORD] +Alternative 2: + 0 (1): this (0,4) [DICTWORD] + 1 (2): -- (4,6) [LPUNC] + 2 (1): is (6,8) [DICTWORD] + 3 (2): - (8,9) [LPUNC] + 4 (2): - (9,10) [LPUNC] + 5 (1): (10,10) [DICTWORD] +Alternative 3: + 0 (1): this (0,4) [DICTWORD] + 1 (2): -- (4,6) [LPUNC] + 2 (1): is (6,8) [DICTWORD] + 3 (2): - (8,9) [LPUNC] + 4 (1): - (9,10) [DICTWORD] + 5 (1): (10,10) [DICTWORD] +linkparser> +``` + +It seems as if conditional matching using (?(condition)yes-pattern|no-pattern) +or 
`(*THEN)` can do some fun things, but I don't have useful examples yet. + +The question is how to use this code for tokenization. I have some +ideas, more on that later. diff -Nru link-grammar-5.3.16/link-grammar/tokenize/regex-tokenizer.c link-grammar-5.5.0/link-grammar/tokenize/regex-tokenizer.c --- link-grammar-5.3.16/link-grammar/tokenize/regex-tokenizer.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/regex-tokenizer.c 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,1117 @@ +/*************************************************************************/ +/* Copyright (c) 2014 Amir Plivatsky */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +/* FIXME: Fold long lines. */ + +#ifdef USE_REGEX_TOKENIZER + +#include +#include +#include +#include +#include +#include + +#include "regex-tokenizer.h" +#include "dict-common/dict-api.h" +#include "dict-common/dict-common.h" +#include "dict-common/regex-morph.h" +#include "dict-common/dict-defines.h" +#include "dict-common/dict-affix.h" +#include "error.h" +#include "externs.h" +#include "tokenize.h" +#include "utilities.h" +#include "dict-file/read-dict.h" + +#include "pcre.h" + +/* Tokenizer flags. */ +#define MARK_TOKENS 0x1 /* stem/affix marks in the result tokens (not implemented) */ + +/* Debug signature for the initial end of sub-pattern 0 (subp[0].e). If we find + * it while printing an alternative, it means we have a logic failure, because + * we should not reach the end of the pattern without a match which assigns + * there the matched sub-pattern string end. 
*/ +#define SUBP0END_DEBUG_SIGNATURE -2 + +//extern const char const * afdict_classname[]; + +typedef enum +{ + CALLBACK_REP, + CALLBACK_END, + CALLBACK_CONSTANT_START, /* UNUSED */ + CALLBACK_CONSTANT_END /* UNUSED */ +} callback_num; + +typedef struct ov_s +{ + int s; + int e; +} ov_t; + +#ifdef REGEX_TOKENIZER_CACHE +/* TODO */ +/* Match cache bit vector. */ +typedef struct bitvec_s +{ + int len; /* current vector length, in bytes */ + char *vec; + int get; /* cache get counter */ + int set; /* cache set counter */ +} bitvec_t; +#endif + +/* info per capture group number */ +typedef struct cgnum_s +{ + Dictionary dict; /* dictionary to use */ + const char *afclass; /* affix class, or NULL for main dict */ + const char *lookup_mark; /* potential stem or infix marks */ + char lookup_mark_pos; /* "+" (append) or "-" (prepend) */ + const char *name; /* currently only for result tagging printout */ +#ifdef REGEX_TOKENIZER_CACHE /* TODO */ + bitvec *mcache; /* substring match cache */ +#endif + /* FIXME: Maybe add formatting function for SUF, PRE, STEM */ +} cgnum_t; + +#define MAX_SUBP 100 +typedef struct callout_data_s +{ + int function; /* callout function multiplexing */ + const char *pattern; + int test; + ov_t subp[MAX_SUBP]; /* sub-pattern array */ + int capture_level[MAX_SUBP]; + int subp_i; /* current sub-pattern index */ + bool subp_ovfl; /* subp array overflow */ + int capture_last; /* UNUSED */ + const char ***wordlist; + cgnum_t **cgnum; + // bool is_constant; /* a constant alternation - don't lookup (FIXME. UNUSED)*/ + int alt_counter; /* counter for number of alternatives */ +} callout_data_t; + +/** + * Get a regex (of 4.0.regex) by name. + * Replace all capturing groups by non-capturing ones, since the invoking + * function cannot currently handle them. Hence back references are not + * supported. This can be fixed if needed. + * + * If a regex name appears multiple times, concatenate them using an alternation + * bar. 
Remove anchors ^ and $ if exist (suppose they can only appear at the + * start and end of the regex, as currently in 4.0.regex). + */ +static char *get_regex_by_name(Dictionary const dict, const char * const name) +{ + dyn_str * const pat = dyn_str_new(); + char *result = NULL; + Regex_node *re = dict->regex_root; + const char *p; + + while (NULL != re) + { + if (0 == strcmp(re->name, name)) + { + /* re analyze state */ + bool insqb = false; /* in square brackets */ + bool qn = false; /* quote next character */ + + p = re->pattern; + if ('\0' != pat->str[0]) dyn_strcat(pat, "|"); + if ('^' == *p) p++; + + /* Change groups in POSIX regex to PCRE non-capturing groups. + * FIXME: Add support for PCRE syntax, + * especially, skip (?...) and (*...). + * The following code supports backslash and square brackets. + * It supposes the regex is valid. */ + for (; '\0' != *p; p++) + { + char c0[2] = "\0\0"; + + if (qn) + { + qn = false; + } + else + { + switch (*p) + { + case '\\': + qn = true; + break; + case '[': + insqb = true; + break; + case ']': + if (p > re->pattern && '[' == p[-1]) break; + insqb = false; + break; + case '(': + if (insqb) break; + dyn_strcat(pat, "(?:"); + continue; + } + } + if ('$' != *p || '\0' != p[1]) + { + c0[0] = *p; + dyn_strcat(pat, c0); + } + } + } + re = re->next; + } + + if ('\0' != pat->str[0]) result = strdup(pat->str); + dyn_str_delete(pat); + return result; +} + +static void printov(const char *str, ov_t *pov, int top, callout_data_t *cd, bool is_pcreov) +{ + int i; + const cgnum_t *cgnump = NULL; + + for (i = 0; i < top; i++) + { + printf("%2d", i); + if (!is_pcreov && (NULL != cd) && (NULL != cd->capture_level)) + printf(" (%d)", (pov[i].e < 0) ? 
0 : cd->capture_level[i]); + printf(": "); + if (pov[i].s < 0) + { + printf(" "); + } else + { + if (pov[i].e < 0) + printf(" END<0 (%d,%d)", pov[i].s, pov[i].e); + else + printf(" %.*s (%d,%d)", pov[i].e - pov[i].s, str + pov[i].s, pov[i].s, pov[i].e); + + } + + /* Find the tokenizer capture group info for the current OV element: + * - For PCRE OV, use its index (if > 0) as capture group. + * - For the tokenizer OV, use the recorded capture level. + * Since the cgnum array is 0-based and the first parenthesized capture + * group is 1, subtract 1 to get the actual index. */ + if ((NULL != cd) && (NULL != cd->capture_level) && (NULL != cd->cgnum) && + (!is_pcreov || (i > 0)) && pov[i].e >= 0) + cgnump = cd->cgnum[(is_pcreov ? i : cd->capture_level[i]) - 1]; + + if (NULL != cgnump) + { + const char *a = "", *p = ""; + char lookup_mark[10]; + char *sm; + + if (NULL != cgnump->lookup_mark) + { + if ('a' == cgnump->lookup_mark_pos) + { + safe_strcpy(lookup_mark, cgnump->lookup_mark, sizeof(lookup_mark)); + sm = strrchr(lookup_mark, SUBSCRIPT_MARK); + if (NULL != sm) *sm = '.'; + a = lookup_mark; + } + else + { + p = cgnump->lookup_mark; + } + } + printf(" [%s%s%s]", p, cgnump->name, a); + } + + printf("\n"); + } +} + +/** + * Compare a portion of the tokenized string, starting at word_stat with length + * of numchar, to the dictionary or affix class word that is defined in the + * capture group whose info is pointed to by cgnump. + * + * FIXME: Return int instead of bool, see the comment at E1 below. + */ +static bool is_word(const char *word_start, int numchar, cgnum_t *cgnump) +{ + Dictionary const dict = cgnump->dict; + const char * const afclass = cgnump->afclass; + const int lookup_mark_len = + (NULL != cgnump->lookup_mark) ? strlen(cgnump->lookup_mark) : 0; + char * const word = alloca(numchar+lookup_mark_len+1); +#ifdef AFFIX_DICTIONARY_TREE + const Dict_node *dn; +#endif + const Afdict_class *ac; + size_t i; + + /* Append/prepend stem/infix marks. 
*/ + if (NULL == cgnump->lookup_mark) + { + strncpy(word, word_start, numchar); + word[numchar] = '\0'; + } + else + { + switch (cgnump->lookup_mark_pos) + { + case 'p': /* prepend a mark */ + strcpy(word, cgnump->lookup_mark); + strncat(word, word_start, numchar); + word[numchar+lookup_mark_len] = '\0'; + break; + case 'a': /* append a mark */ + strncpy(word, word_start, numchar); + strcpy(word+numchar, cgnump->lookup_mark); + break; + default: + printf("is_word:E3('%x' %s)", cgnump->lookup_mark_pos, cgnump->lookup_mark); + strncpy(word, word_start, numchar); + word[numchar] = '\0'; + } + } + + lgdebug(7, "LOOKUP '%s' in %s: ", word, dict->name); + if (0 == afclass) return boolean_dictionary_lookup(dict, word); + + /* We don't have for now a tree representation of the affix file, only lists */ +#ifdef AFFIX_DICTIONARY_TREE + dn = lookup_list(dict, word); + printf("WORD %s afclass %s dn %p\n", word, afclass, dn); + if (NULL == dn) return false; + + for (; NULL != dn; dn = dn->left) + { + const char *con = word_only_connector(dn); + if (NULL == con) + { + /* Internal error - nothing else to do for now unless we don't + * rerun bool, but return an int so -1 signifies an error. */ + printf("is_word(%s):E1 ", word); + } + printf("CON '%s'\n", con); + if (0 == strcmp(afclass, con)) return true; + } +#else + /* Make it the hard way. */ + ac = afdict_find(dict, afclass, /*notify_err*/false); + if (NULL == ac) + { + /* Internal error - nothing else to do for now unless we don't + * rerun bool, but return an int so -1 signifies an error. 
*/ + printf("is_word(%s):E2 ", word); + } + + for (i = 0; i < ac->length; i++) + { + if (0 == strcmp(ac->string[i], word)) return true; + } +#endif + + return false; +} + +static int callout(pcre_callout_block *cb) +{ + callout_data_t *cd = cb->callout_data; + ov_t *cb_ov = (ov_t *)&cb->offset_vector[2*cb->capture_last]; + +#if 0 + const char **wordlist = NULL; +#endif + cgnum_t *pcgnum = NULL; + const char *openp; + const char *endname; + bool subp_updated = false; + + if ((NULL != cd->cgnum) && (-1 != cb->capture_last)) + { + pcgnum = cd->cgnum[cb->capture_last-1]; + } + lgdebug(6, "Callout %d: capture_last %d cgnum %p\n", + cb->callout_number, cb->capture_last, pcgnum); + + if (verbosity >= 6) + printov(cb->subject, (ov_t *)cb->offset_vector, cb->capture_top, cd, /*is_pcreov*/true); + + switch(cb->callout_number) + { + case CALLBACK_REP: + if (cb->capture_last > 0) + { + int subp_i = cd->subp_i; + ov_t *subp = &cd->subp[subp_i]; + + lgdebug(2, "Current capture %d: s=%d, e=%d\n", + cb->capture_last, cb_ov->s, cb_ov->e); + assert(cb_ov->s>=0 && cb_ov->e>=0, "Bad start/end in capture group %d: s=%d e=%d", + cb->capture_last, cb_ov->s, cb_ov->e); + + if (verbosity >= 6) + { + printf("INITIAL subp:\n"); + if (cd->subp_ovfl) printf("OVERFLOW\n"); /* shouldn't happen */ + printov(cb->subject, cd->subp, cd->subp_i+1, cd, /*is_pcreov*/false); + } + + /* Record all the captures into the subp (sub-pattern) vector. + * If we capture a continuation to another capture then it is a new + * capture. Else we update a previous position in subp. There should be + * no gaps between the capture strings. + * FIXME: Handled null matches properly. Need to use cd->capture_level + * to remember at which level a null match has been captured. + * FIXME: Move after the word lookup (efficiency). + * FIXME: Increment subp instead of cd->subp_i (cosmetic fix). 
*/ + + if (cb_ov->s > subp->s) + { + if (cb_ov->s == subp->e) + { + cd->subp_i++; + if (cd->subp_i == MAX_SUBP) + { + cd->subp_ovfl = true; + return PCRE_ERROR_CALLOUT; + } + lgdebug(2, "OV start gt, update next sub-pattern %d\n", cd->subp_i); + cd->subp[cd->subp_i] = *cb_ov; + subp_updated = true; + } + else + { + printf("Capture group %d (s=%d e=%d) makes a hole (subp_i %d: s=%d e=%d)\n", + cb->capture_last, subp->s, subp->e, subp_i, cb_ov->s, cb_ov->e); + return PCRE_ERROR_CALLOUT; + } + } + else + { + /* A backtrack occurred. */ + for (subp_i = cd->subp_i; subp_i >= 0; subp_i--) + { + subp = &cd->subp[subp_i]; + + lgdebug(2, "Checking recorded sub-pattern %d: s=%d e=%d: ", + subp_i, subp->s, subp->e); + + if (cb_ov->s == subp->s) + { + lgdebug(2, "OV start eq, update sub-pattern %d\n", subp_i); + *subp = *cb_ov; + cd->subp_i = subp_i; + subp_updated = true; + break; + } + lgdebug(2, "Backtrack handling\n"); + } + } + assert(subp_i >= 0, "Recorded sub-pattern index"); + assert(subp_updated); + cd->capture_level[cd->subp_i] = cb->capture_last; + + if (verbosity >= 6) + { + printf("AFTER: subp:\n"); + if (cd->subp_ovfl) printf("OVERFLOW\n"); /* shouldn't happen */ + printov(cb->subject, cd->subp, cd->subp_i+1, cd, /*is_pcreov*/false); + } + + /* Make a dictionary lookup for NAME in capture groups (?x) + * (x is a constraint for the initial pattern-match comparison done by + * PCRE). */ + // if (pcgnum && * cd->is_constant) printf("is_constant\n"); + + /* If we have a cgnum structure with a dict, check if the string to be + * matched is in the dict or belongs to the given affix class. + * A NULL cgnum->dict means this is a regex from the regex file. */ + + if (pcgnum && pcgnum->dict) + { /* && !cd->is_constant */ + int numchar = cb_ov->e - cb_ov->s; + + /* Debug: Sanity check. 
*/ + assert(numchar>=0, "numchar=%d", numchar); + endname = NULL; + for (openp = &cd->pattern[cb->pattern_position-5]; *openp; openp--) + { + if (*openp == '>') endname = openp; + if (*openp == '(' && openp[1] == '?' && openp[2] == '<' && openp[3] != '=') break; + } + if (NULL != openp && *openp == '(' && NULL != endname && strncmp(openp, "(?<", 3) == 0 && endname > openp) + ; /* Everything is OK. */ + else + { + assert(0, "Error: Not in a named group!"); + } + lgdebug(6, "GROUP NAME %.*s, cgnum %d, ptr %p, numchar %d\n", + (int)(endname - openp - 3), openp+3, cb->capture_last-1, pcgnum, numchar); + /* End of debug sanity check. */ + + lgdebug(2, "Try match '%.*s': ", numchar, cb->subject+cb_ov->s); + +#if 0 + if (0 == numchar) + { + lgdebug(2, "Zero match denied\n"); + return 1; + } +#endif + + if (!is_word(cb->subject+cb_ov->s, numchar, pcgnum)) + { + lgdebug(2, "NO MATCH\n"); + return 1; + } + lgdebug(6, "MATCH\n"); + } + } +#if 0 + if (verbosity >= 6) + { + printf("DEBUG subp:\n"); + if (cd->subp_ovfl) printf("OVERFLOW\n"); /* shouldn't happen */ + printov(cb->subject, cd->subp, cd->subp_i+1, cd); + } +#endif + + // cd->is_constant = false; + return 0; /* continue to match the rest of the regex */ + break; + +#if 0 + case CALLBACK_CONSTANT_START: + // cd->is_constant = true; + return 0; + break; + + case CALLBACK_CONSTANT_END: + // cd->is_constant = false; + return 0; + break; +#endif + + case CALLBACK_END: + cd->alt_counter++; + printf("Alternative %d:\n", cd->alt_counter); + /* See the comment for SUBP0END_DEBUG_SIGNATURE. */ + assert(cd->subp[0].e>=0, "subp[0].e is %d!", cd->subp[0].e); + printov(cb->subject, cd->subp, cd->subp_i+1, cd, /*is_pcreov*/false); + + /* Remove the last sub-pattern, in case it is a null string (no need to + * check, it can be removed anyway since if it is not a null string it is + * going to be replaced on the next match). 
Else the next match, which + * will be without this null string, we emit it again as the last + * sub-pattern component. FIXME: It doesn't always help. */ + + if (cd->subp_i > 0) + { + cd->capture_level[cd->subp_i] = -3; /* mark as invalid, for debug */ + cd->subp_i--; + } + + // cd->is_constant = false; + return 1; /* signify a backtrack in order to find the next alternative */ + break; + + default: + assert("Callout: Unreached" && 0); + } + + return 0; /* Really unreached. */ + +/* + printf("Callout %d, data test %d\n" + "version %d\n" + "subject '%s\n" + "subject_length %d\n" + "start_match %d\n" + "current_position %d\n" + "capture_top %d\n" + "capture_last %d\n" + "pattern_position %d\n" + "next_item_length %d\n", + cb->callout_number, ((callout_data *)cb->callout_data)->test, + cb->version, cb->subject, cb->subject_length, cb->start_match, + + cb->current_position, + cb->capture_top, + cb->capture_last, + + cb->pattern_position, + cb->next_item_length); + return 0; +*/ + +} + +/* Was main() of the test program... */ +static int regex_split(const char *inpat, int flags, const char *str, Dictionary dict) +{ + const char *p; + dyn_str *pat; + int plevel; /* paren level */ + int cglevel; /* capture group level */ + int nplevel; /* paren level within named capture group */ + int icgnum; /* capture group number*/ + int options; + const char *errptr; + int erroffset; + pcre *pcre; + const char * const prog = "regex_tokenizer_test"; + int rc; + pcre_extra *extra = NULL; +#define OVCNT 15 + int ovector[OVCNT]; + callout_data_t callout_data; + +#if 0 + const char **wordlist; +#endif + bool word_compare_flag = true; +#ifdef notdef + dyn_str *wordalts; +#endif + const char *group_name = NULL; + char *word_classname; + char c0[2] = "\0\0"; + + /* FIXME: validate we use PCRE version 2 at least. */ + + /* Find the number of capturing groups in the input pattern. */ + icgnum = 0; + for (p = inpat; '\0' != *p; p++) + { + /* Count as capture groups only (string) or (?). 
Especially, avoid + * counting (?<=...) (positive look behind) and (?(condition)...) (the + * (condition) part). + * FIXME: support () inside []. + * FIXME: support \. */ + if ((*p == '(') && (*p != '*') && + ((p[1] != '?') || ((p[2] == '<') && (p[3] != '='))) && + ((p-inpat < 2) || (p[-2] != '(') || (p[-1] != '?'))) + { + icgnum++; + } + } + if (0 == icgnum) + { + printf("%s: pattern must include at least one () group (was: %s)\n", prog, inpat); + return 9; + } +#if 0 + if (p[-1] != '$') + { + /* FIXME: add $ if needed */ + printf("%s: pattern must end with $ (was: %s)\n", prog, inpat); + return 9; + } +#endif + + /* Regex syntax check of the pattern. + * FIXME: Add support for "(?J)" */ + options = PCRE_UTF8; + pcre = pcre_compile(inpat, options, &errptr, &erroffset, NULL); + if (NULL == pcre) + { + printf("%s: pcre_compile: Error in pattern '%s' at offset %d: %s\n", + prog, inpat, erroffset, errptr); + return 2; + } + + callout_data.wordlist = NULL; + callout_data.cgnum = NULL; + if (word_compare_flag) + { + int i; +#if 0 + callout_data.wordlist = malloc(sizeof(*callout_data.wordlist)*icgnum); +#endif + callout_data.cgnum = malloc(sizeof(*callout_data.cgnum)*icgnum); + //printf("ALLOCATED callout_data.cgnum %ld for %d groups\n", + //sizeof(*callout_data.wordlist)*cgnum, icgnum); + for (i = 0; i < icgnum; i++) + { +#if 0 + callout_data.wordlist[i] = NULL; +#endif + callout_data.cgnum[i] = NULL; + + } + } + + /* Build the pattern that finds all possible matches. */ + pat = dyn_str_new(); + plevel = 0; + cglevel = 0; + icgnum = -1; /* First capture group (plevel==1) is icgnum==0. */ + + /* Convert the input regex to the tokenizer regex. + * cglevel counts named capture groups + * plevel counts all groups + * + * FIXME: Add support for: + * (?x) - comment mode. + * (?i) - ignore case. + * \ - backslash for ()<>?* . + * [] - () inside it + * FIXME: Add "(?: ... )" over the result pattern. 
+ */ + //dyn_strcat(pat, "(?J)"); + for (p = inpat; '\0' != *p; p++) + { + char *re = NULL; /* a regex from the 4.0.regex file */ + + switch (*p) + { + const char *c; + + case '(': + if (cglevel > 0) + { + printf("Error at position %ld: Tokenizer capture groups cannot have nested groups\n", p-inpat); + } + plevel++; + if ((p[1] == '*') || + ((p[1] == '?') && ((p[2] != '<') || (p[3] == '='))) || + ((p-inpat > 1) && (p[-2] == '(') && (p[-1] == '?'))) + { + break; + } + cglevel++; + if (cglevel > 1) + { + printf("Error at position %ld: Tokenizer aregex cannot have capture group level > 1\n", p-inpat); + free(callout_data.cgnum); + return 199; + } + icgnum++; + dyn_strcat(pat, "(?:"); + group_name = NULL; + break; + case ')': + plevel--; + if (cglevel > 0) + { + cglevel--; + /* Add the dict lookup and capturing callback. */ + dyn_strcat(pat, ")(?C)"); + } + group_name = NULL; + break; + case '<': + /* Remember it as a potential start of a named group. */ + if ((p-2 >= inpat) && (p[-2] == '(') && (p[-1] == '?') && (p[1] != '=')) + { + group_name = p + 1; + } + else + group_name = NULL; + break; + case '>': + if (NULL != group_name) + { + /* Check if this is actually a group name */ + for (c = group_name; c < p; c++) + { + /* FIXME: 'a' and 'p' are part of a hack for lookup_mark. + * FIXME: 'r' is part of a hack for regex names that match affix + * class names. The fix is not to use matching names. 
*/ + if ((*c > 'Z' || *c < 'A') && *c != 'a' && *c != 'p' && *c != 'r') break; + } + if (c == p) + { + word_classname = malloc(p-group_name+1); + strncpy(word_classname, group_name, p-group_name); + word_classname[p-group_name] = '\0'; + } else + { + printf("%s: Invalid class name in group name found at '%s'\n", + prog, group_name-4); + word_classname = NULL; + } + } else + { + word_classname = NULL; + } + if (!word_classname) + { + group_name = NULL; + break; + } + dyn_strcat(pat, ">"); + + lgdebug(6, "Found word-class %s\n", word_classname); +#if 0 + wordlist = readwords(word_classname); + if (NULL == wordlist) + { + printf("i%s: Invalid class name %s in group name\n", prog, word_classname); + return 100; + } + + if (!word_compare_flag) + { + printf("Invocation without -w is not supported\n"); + return 103; + } +#endif + + if (word_compare_flag) + { + char *t; + const char *lookup_mark = NULL; +#if 0 + callout_data.wordlist[icgnum] = wordlist; + printf("WORDLIST %p at cgnum %d\n", wordlist, icgnum); +#endif + /* Allocate per group info */ + callout_data.cgnum[icgnum] = malloc(sizeof(*(callout_data.cgnum)[0])); + callout_data.cgnum[icgnum]->name = NULL; + //printf("ALLOCATED cgnum[%d]=%p\n", icgnum, + //callout_data.cgnum[icgnum]); + + /* A hack for testing: Handle WORDpX or WORDaX. + * The above a/p marks mean append/prepend X to word before making + * the lookup. + * FIXME: Find another way to specify that, maybe in the affix file + * or in a tokenizer definition file. */ + t = strpbrk(word_classname, "pa"); + if (NULL != t) + { + Afdict_class *ac; + + callout_data.cgnum[icgnum]->lookup_mark_pos = *t; + *t = '\0'; + ac = afdict_find(dict->affix_table, t+1, /*notify_err*/false); + if (NULL == ac) + { + printf("%s: Unknown afclass '%s'\n", prog, t+1); + return 253; + } + + /* Check if the requested affix class is defined and is not an + * empty string (like the default INFIXMARK). 
*/ + if (0 == ac->length || '\0' == ac->string[0][0]) + { + printf("%s: No value for afclass '%s'\n", prog, t+1); + return 252; + } + lookup_mark = ac->string[0]; /* FIXME: support more than one value. */ + } + + callout_data.cgnum[icgnum]->lookup_mark = lookup_mark; + callout_data.cgnum[icgnum]->name = word_classname; + + if (0 == strcmp(word_classname, "DICTWORD")) + { + /* Assign data for looking up a word in the main dict. */ + callout_data.cgnum[icgnum]->dict = dict; + callout_data.cgnum[icgnum]->afclass = NULL; + } + else + if (afdict_find(dict->affix_table, word_classname, /*notify_err*/false)) + { + callout_data.cgnum[icgnum]->dict = dict->affix_table; + callout_data.cgnum[icgnum]->afclass = word_classname; + } + else + { + if ('r' == word_classname[0]) word_classname++; + re = get_regex_by_name(dict, word_classname); + if (re) + { + lgdebug(6, "Regex %s with modified groups: '%s'\n", word_classname, re); + callout_data.cgnum[icgnum]->dict = NULL; + /* FIXME: No need to allocate callout_data.cgnum[icgnum] in this + * case. */ + } + else + { + printf("%s: Unknown word classname '%s'\n", prog, word_classname); + return 254; + } + } + /* TODO: Assign flags, e.g. for emitting the words with stem/infix marks. */ + + } else + { +#if 0 + wordalts = make_wordalts(wordlist); + dyn_strcat(pat, wordalts->str); + dyn_str_delete(wordalts); + free(wordlist); +#else + printf("%s: Invocation without -w is not supported\n", prog); + return 103; +#endif + } + /* Default match for dictionary lookup is ".*". + * Allow replacing it by something else. + * E.g: .{2,}|a */ + if (')' == p[1]) + { + if (NULL == re) + { + dyn_strcat(pat, ".*"); + } + else + { + dyn_strcat(pat, re); + free(re); + re = NULL; + } + } + else + { + nplevel = 1; + /* FIXME: Add support for: + * (?x) - comment mode. + * \ - backslash for ()<>?* . + * [] - () inside it + */ + for (; p[1] != '\0' && nplevel > 0; p++) + { + switch (p[1]) + { + case '(': + if (('?' 
!= p[2]) && ('*' != p[2]) && + ((p[-1] != '(') || (p[0] != '?'))) + { + printf("%s: Capture_group %d: Nested capture group is not supported\n", + prog, icgnum+1); + return 250; + } + nplevel++; + break; + case ')': + nplevel--; + if (0 == nplevel) continue; /* we are done */ + break; + } + + c0[0] = p[1]; + dyn_strcat(pat, c0); + } + p--; + } + + word_classname = NULL; + group_name = NULL; + continue; + } + + c0[0] = *p; + dyn_strcat(pat, c0); + } + + /* Add '$' at the end if needed. */ + if ('$' != pat->str[pat->end-1]) dyn_strcat(pat, "$"); + /* Add the backtracking callback. */ + dyn_strcat(pat, "(?C1)"); + + printf("Modified pattern: %s", pat->str); + lgdebug(2, " (len %zu/%zu)", pat->end, pat->len); + printf("\n"); + + pcre_callout = callout; + + callout_data.function = 1; + callout_data.subp_i = 0; + callout_data.subp[0].s = 0; + callout_data.subp[0].e = SUBP0END_DEBUG_SIGNATURE; + callout_data.subp_ovfl = false; + callout_data.capture_last = 0; + callout_data.pattern = pat->str; + callout_data.alt_counter = 0; + + options = PCRE_UTF8; + pcre = pcre_compile(pat->str, options, &errptr, &erroffset, NULL); + if (NULL == pcre) + { + printf("%s: Internal error: pcre_compile: Error in pattern '%s' at offset %d: %s\n", + prog, pat->str, erroffset, errptr); + return 99; + } + + /* TODO: Check if using JIT may optimize out some needed callouts. */ + options = 0; //PCRE_STUDY_JIT_COMPILE; + extra = pcre_study(pcre, options, &errptr); + if (NULL == extra) + { + if (NULL != errptr) + { + printf("%s: pcre_study: Error for pattern '%s': %s\n", prog, pat->str, errptr); + return 3; + } + extra = malloc(sizeof(*extra)); + memset(extra, 0, sizeof(*extra)); + } else + { + /* For some reason JIT is sometimes done even though it was not requested. + * But the callouts are still invoked as expected in such cases. 
*/ + lgdebug(6, "%s: pcre_study: JIT %ld\n", prog, extra->flags & PCRE_STUDY_JIT_COMPILE); + } + +#if 0 + extra->match_limit = 10000; + extra->match_limit_recursion = 10000; + extra->flags |= PCRE_EXTRA_MATCH_LIMIT|PCRE_EXTRA_MATCH_LIMIT_RECURSION; +#endif + + extra->callout_data = (void *)&callout_data; + extra->flags |= PCRE_EXTRA_CALLOUT_DATA; + +#if 0 + printf("CGNUM %d\n", icgnum); + if (NULL != callout_data.cgnum) + { + int i; + + for (i = 0; i <= icgnum; i++) + { + printf("callout_data.cgnum[%d] %p\n", i, callout_data.cgnum[i]); + } + } else + printf("CGNUM %p\n", callout_data.cgnum); +#endif + + options = PCRE_ANCHORED; /* XXX Maybe PCRE_NO_START_OPTIMIZE is needed too */ + rc = pcre_exec(pcre, extra, str, strlen(str), 0, options, ovector, OVCNT); + if (rc < 0) + { + if (PCRE_ERROR_NOMATCH == rc) + { + lgdebug(2, "No match (must always happen)\n"); + } else + { + printf("%s: pcre_exec: Error %d\n", prog, rc); + } + } else + { + printf("Internal error: Unexpected match, rc=%d\n", rc); + } + + if (0 == rc) + { + rc = OVCNT/3; + printf("ovector only has room for %d captured substrings\n", rc - 1); + } + + printov(str, (ov_t *)ovector, rc, NULL, /*is_pcreov*/true); + + if (verbosity > 6) + { + if (0 != callout_data.subp_i) + { + printf("Callout stack:\n"); + printov(str, callout_data.subp, callout_data.subp_i, &callout_data, /*is_pcreov*/false); + } + } + + /* Free everything. */ + dyn_str_delete(pat); /* note - callback_data uses parts of pat */ + pcre_free_study(extra); /* safe even if malloc'ed */ + free(pcre); + + if (NULL != callout_data.cgnum) + { + int i; + + for (i = 0; i <= icgnum; i++) + { + if (callout_data.cgnum[i]) + { + /* FIXME: Free also word_classname. 
*/ + free(callout_data.cgnum[i]); + } + } + free(callout_data.cgnum); + } + +#if 0 + if (NULL != callout_data.wordlist) + { + int i; + + for (i = 0; i < icgnum; i++) + { + free(callout_data.wordlist[i]); + } + free(callout_data.wordlist); + } +#endif + + return 0; +} + +/** + * Test the RegEx tokenizer. + * line - REGEX/,token + */ +int regex_tokenizer_test(Dictionary dict, const char *line) +{ + int linelen = strlen(line); + char *regex = alloca(linelen+1); + char *token = alloca(linelen); + char *regex_end; + int tokenizer_flags; + + strcpy(regex, line); + regex_end = index(regex, '/'); + if (NULL == regex_end) + { + printf("Missing terminating '/' in regex.\nUsage: /REGEX/,token\n"); + return 101; + } + *regex_end = '\0'; + regex_end++; + /* FIXME: Add iterations for more flags if needed. */ + switch (*regex_end) + { + case 'M': + tokenizer_flags = MARK_TOKENS; + regex_end++; + break; + } + if (',' != *regex_end) + { + printf("Missing terminating ',' after regex end.\nUsage: /REGEX/,token\n"); + return 102; + } + strcpy(token, regex_end + 1); + if ('\0' == token[0]) + { + printf("Missing token\nUsage: /REGEX/,token\n"); + return 103; + } + + return regex_split(regex, tokenizer_flags, token, dict); +} + +#else /* USE_REGEX_TOKENIZER */ + +/* Mac OSX will fail to link if this dummy is not defined. + * But why is it needed? Because it shows up in the exported + * symbols list (link-grammar.def) and if its there, it must + * also be in the code. Thus, the below. 
+ */ +#include "regex-tokenizer.h" +int regex_tokenizer_test(Dictionary dict, const char *line) +{ + return 0; +} +#endif /* USE_REGEX_TOKENIZER */ + diff -Nru link-grammar-5.3.16/link-grammar/tokenize/regex-tokenizer.h link-grammar-5.5.0/link-grammar/tokenize/regex-tokenizer.h --- link-grammar-5.3.16/link-grammar/tokenize/regex-tokenizer.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/regex-tokenizer.h 2017-05-28 11:14:43.000000000 +0000 @@ -0,0 +1,4 @@ +#include "link-includes.h" + +int regex_tokenizer_test(Dictionary, const char *); + diff -Nru link-grammar-5.3.16/link-grammar/tokenize/spellcheck-aspell.c link-grammar-5.5.0/link-grammar/tokenize/spellcheck-aspell.c --- link-grammar-5.3.16/link-grammar/tokenize/spellcheck-aspell.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/spellcheck-aspell.c 2018-01-24 01:19:58.000000000 +0000 @@ -0,0 +1,175 @@ +/*************************************************************************/ +/* Copyright (c) 2009 Vikas N. Kumar */ +/* Copyright (c) 2009 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifdef HAVE_ASPELL + +#include +#include +#include +#include +#include + +#include "link-includes.h" +#include "spellcheck.h" + +#define ASPELL_LANG_KEY "lang" +/* FIXME: Move to a definition file (affix file?). 
*/ +static const char *spellcheck_lang_mapping[] = +{ +/* link-grammar language , Aspell language key */ + "en", "en_US", + "ru", "ru_RU", + "he", "he_IL", + "de", "de_DE", + "lt", "lt_LT", +}; + +struct linkgrammar_aspell +{ + AspellConfig *config; + AspellSpeller *speller; +}; + +/** + * create a new spell-checker for the language 'lang' + */ +void * spellcheck_create(const char * lang) +{ + struct linkgrammar_aspell *aspell = NULL; + size_t i = 0; + AspellCanHaveError *spell_err = NULL; + + for (i = 0; i < sizeof(spellcheck_lang_mapping)/sizeof(char *); i += 2) + { + if (0 != strcmp(lang, spellcheck_lang_mapping[i])) continue; + aspell = (struct linkgrammar_aspell *)malloc(sizeof(struct linkgrammar_aspell)); + if (!aspell) + { + prt_error("Error: out of memory. Aspell not used.\n"); + aspell = NULL; + break; + } + aspell->config = NULL; + aspell->speller = NULL; + aspell->config = new_aspell_config(); + if (aspell_config_replace(aspell->config, ASPELL_LANG_KEY, + spellcheck_lang_mapping[i]) == 0) + { + prt_error("Error: failed to set language in aspell: %s\n", lang); + delete_aspell_config(aspell->config); + free(aspell); + aspell = NULL; + break; + } + spell_err = new_aspell_speller(aspell->config); + if (aspell_error_number(spell_err) != 0) + { + prt_error("Error: Aspell: %s\n", aspell_error_message(spell_err)); + delete_aspell_can_have_error(spell_err); + delete_aspell_config(aspell->config); + free(aspell); + aspell = NULL; + break; + } + aspell->speller = to_aspell_speller(spell_err); + break; + } + return aspell; +} + +/** + * Free memory structures used wiith spell-checker 'chk' + */ +void spellcheck_destroy(void * chk) +{ + struct linkgrammar_aspell *aspell = (struct linkgrammar_aspell *)chk; + if (aspell) + { + delete_aspell_speller(aspell->speller); + delete_aspell_config(aspell->config); + free(aspell); + aspell = NULL; + } +} + +/** + * Ask the spell-checker if the spelling looks good. + * Return true if the spelling is good, else false. 
+ */ +bool spellcheck_test(void * chk, const char * word) +{ + int val = 0; + struct linkgrammar_aspell *aspell = (struct linkgrammar_aspell *)chk; + if (aspell && aspell->speller) + { + /* this can return -1 on failure */ + val = aspell_speller_check(aspell->speller, word, -1); + } + return (val == 1); +} + +// Despite having a thread-compatible API, it appears that apsell +// is not actually thread-safe. Bummer. +static pthread_mutex_t aspell_lock = PTHREAD_MUTEX_INITIALIZER; + +int spellcheck_suggest(void * chk, char ***sug, const char * word) +{ + struct linkgrammar_aspell *aspell = (struct linkgrammar_aspell *)chk; + if (!sug) + { + prt_error("Error: Aspell. Corrupt pointer.\n"); + return 0; + } + + if (aspell && aspell->speller) + { + const AspellWordList *list = NULL; + AspellStringEnumeration *elem = NULL; + const char *aword = NULL; + unsigned int size, i; + char **array = NULL; + + pthread_mutex_lock(&aspell_lock); + list = aspell_speller_suggest(aspell->speller, word, -1); + elem = aspell_word_list_elements(list); + size = aspell_word_list_size(list); + + /* allocate an array of char* for returning back to link-parser */ + array = (char **)malloc(sizeof(char *) * size); + if (!array) + { + prt_error("Error: Aspell. 
Out of memory.\n"); + delete_aspell_string_enumeration(elem); + pthread_mutex_unlock(&aspell_lock); + return 0; + } + + i = 0; + while ((aword = aspell_string_enumeration_next(elem)) != NULL) + { + array[i++] = strdup(aword); + } + delete_aspell_string_enumeration(elem); + *sug = array; + pthread_mutex_unlock(&aspell_lock); + return size; + } + return 0; +} + +void spellcheck_free_suggest(void *chk, char **sug, int size) +{ + for (int i = 0; i < size; ++i) free(sug[i]); + free(sug); +} + +#endif /* #ifdef HAVE_ASPELL */ diff -Nru link-grammar-5.3.16/link-grammar/tokenize/spellcheck.h link-grammar-5.5.0/link-grammar/tokenize/spellcheck.h --- link-grammar-5.3.16/link-grammar/tokenize/spellcheck.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/spellcheck.h 2017-06-17 01:29:32.000000000 +0000 @@ -0,0 +1,32 @@ +/*************************************************************************/ +/* Copyright (c) 2009 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#ifndef _SPELLCHECK_H +#define _SPELLCHECK_H + +#include "api-types.h" + +#if (defined HAVE_HUNSPELL) || (defined HAVE_ASPELL) +void * spellcheck_create(const char * lang); +void spellcheck_destroy(void *); +bool spellcheck_test(void *, const char * word); +int spellcheck_suggest(void * chk, char ***sug, const char * word); +void spellcheck_free_suggest(void * chk, char **sug, int size); + +#else +static inline void * spellcheck_create(const char * lang) { return NULL; } +static inline void spellcheck_destroy(void * chk) {} +static inline bool spellcheck_test(void * chk, const char * word) { return false; } +static inline int spellcheck_suggest(void * chk, char ***sug, const char * word) { return 0; } +static inline void spellcheck_free_suggest(void * chk, char **sug, int size) {} +#endif + +#endif /* _SPELLCHECK_H */ diff -Nru link-grammar-5.3.16/link-grammar/tokenize/spellcheck-hun.c link-grammar-5.5.0/link-grammar/tokenize/spellcheck-hun.c --- link-grammar-5.3.16/link-grammar/tokenize/spellcheck-hun.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/spellcheck-hun.c 2017-05-28 11:14:43.000000000 +0000 @@ -0,0 +1,138 @@ +/*************************************************************************/ +/* Copyright (c) 2009 Linas Vepstas */ +/* Copyright (c) 2009 Vikas N. Kumar */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#include +#include +#include "link-includes.h" +#include "spellcheck.h" + +#ifdef HAVE_HUNSPELL + +#ifndef HUNSPELL_DICT_DIR +#define HUNSPELL_DICT_DIR (char *)0 +#endif /* HUNSPELL_DICT_DIR */ + +static const char *hunspell_dict_dirs[] = { + "/usr/share/myspell/dicts", + "/usr/share/hunspell/dicts", + "/usr/local/share/myspell/dicts", + "/usr/local/share/hunspell/dicts", + "/usr/share/myspell", + "/usr/share/hunspell", + "/usr/local/share/myspell", + "/usr/local/share/hunspell", + HUNSPELL_DICT_DIR +}; + +static const char *spellcheck_lang_mapping[] = { +/* link-grammar language, Hunspell filename */ + "en", "en-US", + "en", "en_US", + "ru", "ru-RU", + "ru", "ru_RU", + "he", "he-IL", + "he", "he_IL", + "de", "de-DE", + "de", "de_DE", + "lt", "lt-LT", + "lt", "lt_LT", +}; + +#define FPATHLEN 256 +static char hunspell_aff_file[FPATHLEN]; +static char hunspell_dic_file[FPATHLEN]; + +#include +#include + +void * spellcheck_create(const char * lang) +{ + size_t i = 0, j = 0; + Hunhandle *h = NULL; + + memset(hunspell_aff_file, 0, FPATHLEN); + memset(hunspell_dic_file, 0, FPATHLEN); + for (i = 0; i < sizeof(spellcheck_lang_mapping)/sizeof(char *); i += 2) + { + if (0 != strcmp(lang, spellcheck_lang_mapping[i])) continue; + + /* check in each hunspell_dict_dir if the files exist */ + for (j = 0; j < sizeof(hunspell_dict_dirs)/sizeof(char *); ++j) + { + FILE *fh; + /* if the directory name is NULL then ignore */ + if (hunspell_dict_dirs[j] == NULL) continue; + + snprintf(hunspell_aff_file, FPATHLEN, "%s/%s.aff", hunspell_dict_dirs[j], + spellcheck_lang_mapping[i+1]); + snprintf(hunspell_dic_file, FPATHLEN, "%s/%s.dic", hunspell_dict_dirs[j], + spellcheck_lang_mapping[i+1]); + + /* Some versions of Hunspell_create() will succeed even if + * there are no dictionary files. So test for permissions. 
+ */ + fh = fopen(hunspell_aff_file, "r"); + if (fh) fclose (fh); + else continue; + + fh = fopen(hunspell_dic_file, "r"); + if (fh) fclose (fh); + else continue; + + h = Hunspell_create(hunspell_aff_file, hunspell_dic_file); + /* if hunspell handle was created break from loop */ + if (h != NULL) + break; + } + /* if hunspell handle was created break from loop */ + if (h != NULL) break; + } + return h; +} + +void spellcheck_destroy(void * chk) +{ + Hunhandle *h = (Hunhandle *) chk; + Hunspell_destroy(h); +} + +/** + * Return boolean: 1 if spelling looks good, else zero + */ +bool spellcheck_test(void * chk, const char * word) +{ + if (NULL == chk) + { + prt_error("Error: no spell-check handle specified!\n"); + return 0; + } + + return (bool) Hunspell_spell((Hunhandle *)chk, word); +} + +int spellcheck_suggest(void * chk, char ***sug, const char * word) +{ + if (NULL == chk) + { + prt_error("Error: no spell-check handle specified!\n"); + return 0; + } + + return Hunspell_suggest((Hunhandle *)chk, sug, word); +} + +void spellcheck_free_suggest(void *chk, char **sug, int size) +{ + Hunspell_free_list((Hunhandle *)chk, &sug, size); +} + +#endif /* #ifdef HAVE_HUNSPELL */ diff -Nru link-grammar-5.3.16/link-grammar/tokenize/tokenize.c link-grammar-5.5.0/link-grammar/tokenize/tokenize.c --- link-grammar-5.3.16/link-grammar/tokenize/tokenize.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/tokenize.c 2018-04-27 00:24:32.000000000 +0000 @@ -0,0 +1,3508 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2009, 2012-2014 Linas Vepstas */ +/* Copyright (c) 2014 Amir Plivatsky */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. 
*/ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _WIN32 +#include +#endif +#include + +#include "anysplit.h" +#include "api-structures.h" +#include "dict-common/dict-affix.h" +#include "dict-common/dict-api.h" +#include "dict-common/dict-common.h" +#include "dict-common/dict-defines.h" // for MAX_WORD +#include "dict-common/dict-utils.h" +#include "dict-common/regex-morph.h" +#include "error.h" +#include "externs.h" +#include "print/print.h" +#include "print/print-util.h" +#include "spellcheck.h" +#include "string-set.h" +#include "tokenize.h" +#include "tok-structures.h" +#include "utilities.h" +#include "wordgraph.h" +#include "word-structures.h" + +#define MAX_STRIP 10 +#define MAX_STRIP_ALT 5 +#define SYNTHETIC_SENTENCE_MARK '>' /* A marking of a synthetic sentence. */ +#define D_SW 6 /* debug level for word splits */ +#define D_UN 6 /* debug level for units/punct */ + +typedef const char *stripped_t[MAX_STRIP]; + +/* These are no longer in use, but are read from the 4.0.affix file */ +/* I've left these here, as an example of what to expect. 
*/ +/*static char * strip_left[] = {"(", "$", "``", NULL}; */ +/*static char * strip_right[] = {")", "%", ",", ".", ":", ";", "?", "!", "''", "'", "'s", NULL};*/ +/* Single-quotes are used for abbreviations, don't mess with them */ +/*//const char * qs = "\"\'«»《》【】『』‘’`„“"; */ +/*const char* qs = "\"«»《》【】『』`„“"; */ + +#define ENTITY_MARKER "" +#define COMMON_ENTITY_MARKER "" +#define REPLACEMENT_MARK "~" /* a mark for a replacement word */ + +/* Dictionary capitalization handling */ +#define CAP1st "1stCAP" /* Next word is capitalized */ +#define CAPnon "nonCAP" /* Next word the lc version of a capitalized word */ + + +/** + * is_common_entity - Return true if word is a common noun or adjective + * Common nouns and adjectives are typically used in corporate entity + * names -- e.g. "Sun State Bank" -- "sun", "state" and "bank" are all + * common nouns. + */ +static bool is_common_entity(Dictionary dict, const char * str) +{ + if (word_contains(dict, str, COMMON_ENTITY_MARKER) == 1) + return true; + return false; +} + +static bool is_entity(Dictionary dict, const char * str) +{ + const char * regex_name; + if (word_contains(dict, str, ENTITY_MARKER) == 1) + return true; + regex_name = match_regex(dict->regex_root, str); + if (NULL == regex_name) return false; + return word_contains(dict, regex_name, ENTITY_MARKER); +} + +#if defined HAVE_HUNSPELL || defined HAVE_ASPELL +/** + * Return true if word is a proper name. + * XXX This is a cheap hack that works only in English, and is + * broken for German! We need to replace this with something + * language-specific. + * + * Basically, if word starts with upper-case latter, we assume + * its a proper name, and that's that. + */ +static bool is_proper_name(const char * word, locale_t dict_locale) +{ + return is_utf8_upper(word, dict_locale); +} + +/** + * Returns true if the word contains digits. 
+ */ +static bool contains_digits(const char * s, locale_t dict_locale) +{ + mbstate_t mbs; + int nb = 1; + wchar_t c; + + memset(&mbs, 0, sizeof(mbs)); + while ((*s != 0) && (0 < nb)) + { + nb = mbrtowc(&c, s, MB_CUR_MAX, &mbs); + if (nb < 0) return false; + if (iswdigit_l(c, dict_locale)) return true; + s += nb; + } + return false; +} + +#if 0 +/** + * Return true if an alternative has been issued for the given word. + * If there is an alternative, the previous word points to it. + * Its unsplit_word is the given word. + * + * Return true if an alternative is found, else false. + * XXX need to check correctness. + * XXX It seems this function is not needed any more. Remove if so. + */ +static bool word_has_alternative(const Gword *word) +{ + const Gword **n; + + for (n = word->prev[0]->next; *n; n++) + { + if ((*n)->unsplit_word == word) return true; + } + return false; +} +#endif +#endif /* defined HAVE_HUNSPELL || defined HAVE_ASPELL */ + +/** + * Find if a string is equal to a character in afdict class afdict_classnum. + * The parameter afdict_classnum is one of the following affix classes: + * AFDICT_BULLETS - anything that can be construed to be a bullet. + * AFDICT_QUOTES - anything that can be construed to be a quotation mark. + * + * Return TRUE if the character is in afdict_classnum. + */ + +/** + * Search in s the first character from utf-8 string xc. 
+ */ +static char *utf8_str1chr(const char *s, const char *xc) +{ + int len = utf8_charlen(xc); + if (len < 0) return NULL; /* Invalid UTF-8 */ + char *xc1 = strndupa(xc, len); + + return strstr(s, xc1); +} + +static bool in_afdict_class(Dictionary dict, afdict_classnum cn, const char *s) +{ + if (0 == AFCLASS(dict->affix_table, cn)->length) return false; + const char *classchars = AFCLASS(dict->affix_table, cn)->string[0]; + + return NULL != utf8_str1chr(classchars, s); +} + +/** + * Return TRUE if the character is white-space + */ +static bool is_space(wchar_t wc, locale_t dict_locale) +{ + if (iswspace_l(wc, dict_locale)) return true; + + /* 0xc2 0xa0 is U+00A0, c2 a0, NO-BREAK SPACE */ + /* For some reason, iswspace doesn't get this */ + if (0xa0 == wc) return true; + + /* iswspace seems to use somewhat different rules than what we want, + * so over-ride special cases in the U+2000 to U+206F range. + * Caution: this potentially screws with arabic, and right-to-left + * languages. + */ +/*** later, not now .. + if (0x2000 <= wc && wc <= 0x200f) return true; + if (0x2028 <= wc && wc <= 0x202f) return true; + if (0x205f <= wc && wc <= 0x206f) return true; +***/ + + return false; +} + +#if 0 +/** + * Returns true if the word can be interpreted as a number. + * The ":" is included here so we allow "10:30" to be a number. + * The "." and "," allow numbers in both US and European notation: + * e.g. 
American million: 1,000,000.00 Euro million: 1.000.000,00 + * We also allow U+00A0 "no-break space" + */ +static bool is_number(Dictionary dict, const char * s) +{ + mbstate_t mbs; + int nb = 1; + wchar_t c; + if (!is_utf8_digit(s, Dictionary dict)) return false; + + memset(&mbs, 0, sizeof(mbs)); + while ((*s != 0) && (0 < nb)) + { + nb = mbrtowc(&c, s, MB_CUR_MAX, &mbs); + /* XXX check nb < 0 */ + if (iswdigit_l(dict, c)) { s += nb; } + + /* U+00A0 no break space */ + else if (0xa0 == c) { s += nb; } + + else if ((*s == '.') || (*s == ',') || (*s == ':')) { s++; } + else return false; + } + return true; +} +#endif + +static void gwordqueue_add(const Sentence sent, Gword *const word) +{ + word_queue_t *wq_element = malloc(sizeof(word_queue_t)); + + assert((NULL == sent->word_queue) == (NULL == sent->word_queue_last)); + + if (NULL == sent->word_queue) + sent->word_queue = wq_element; + else + sent->word_queue_last->next = wq_element; + + wq_element->word = word; + wq_element->next = NULL; + sent->word_queue_last = wq_element; + +} + +/** + * Update the label of the word (for debug). + * The word label marks which part(s) of the program issued the word. + * This label accumulated if several parts touch the same word. + */ +static void word_label(Sentence sent, Gword *w, const char *op, + const char *label) +{ + const size_t s = (NULL == w->label) ? 0 : strlen(w->label); + char *new_label = alloca(s + strlen(label) + 1 + 2 + 1); /* len+op+()+NUL */ + + if (0 != s) + strcpy(new_label, w->label); + else + new_label[0] = '\0'; + + if (NULL == op) + strcat(new_label, "("); + else if ('\0' != new_label[0]) + strcat(new_label, op); + strcat(new_label, label); + if (NULL == op) strcat(new_label, ")"); + w->label = string_set_add(new_label, sent->string_set); +} + +#ifdef CHECK_DUPLICATE_ALTS // Not defined - apparently not a problem by now +#define D_WSAA 9 +/** + * Disallow unsplit_word alternatives with the same subword string path. + * (I.e. 
we are talking about preventing Wordgraph paths consisting of the same + * word strings in the same order, not consisting of same graph nodes.) + * + * The first subword of the candidate alternative is checked against the first + * subword of all the existing alternatives of the unsplit_word (this allows the + * first alternative unconditionally). + * + * Parameters: + * - unsplit_word: the unsplit_word that shouldn't have redundant paths. + * - altword0: the first word of the candidate alternative. + * Return true if the alternative is redundant, false if it is fine. + * + * Assumptions and heuristics: + * + * - This function is invoked only for subwords that we try to split in every + * possible combination, i.e. LR-split. It is not invoked for morpheme + * splits, because then an identical first subword may be valid due to + * a different way of performing the splits. + * + * - If the first subword of the checked candidate alternative is the same as an + * already existing alternative, it means the candidate alternative is + * redundant. This is because we are going to generate all the subword + * combinations for the other subwords, a thing that would generate an + * identical Wordgraph sub-path otherwise. So the alternative is disallowed. + * + * - The previous step may allow an unknown first subword to not split further + * to a known word plus punctuation, a thing that will leave an unwanted + * unknown word in the Wordgraph. To prevent this we also check if an existing + * first subword is a prefix of the first (unknown) subword of the candidate + * alternative. If it is, it means that a future split would be prevented by + * the previous step. + * + * Examples (the words and alternatives are shown in () for clarity): + * + * 1. An input word ('50s,) splits to (' 50s,). An additional split of this + * input word to ('50s ,) would be prevented if '50s was not a known word, as + * (') is a prefix of ('50s). 
The split of ('50s) to (' 50s) is prevented by + * this function because its first subword matches the first subword of an + * existing alternative (' 50s,). + * + * 2. The input word ("who") gets split to (" who"). Its split to ("who ") is + * then prevented, as (") is a prefix of ("who) which is unknown. + * + * XXX I don't have a mathematical proof of correctness, it just happens to work + * on the example sentences. [ap] + * + * FIXME XXX What if a non-first subword is unknown and cannot split further? + * For example, for ('50s,) we get an alternative (' 50s ,) in which (50s) + * cannot split further because another alternative also starts with (50), but + * (50s) is an unknown word (that doesn't usually cause a trouble because + * tokens in that alternative don't have a linkage). It will be a good idea to + * find out exactly how it happens and a way to avoid that. A fix "by force" + * may be to explicitly mark unknown-words in separate_word() (as originally + * designed) and in flatten_wordgraph() ignore alternatives that have + * so-unmarked unknown words. + */ +static bool word_start_another_alternative(Dictionary dict, + Gword *unsplit_word, + const char *altword0) +{ + Gword **n; + + lgdebug(+D_WSAA, "\n"); /* Terminate a previous partial trace message. 
*/ + lgdebug(+D_WSAA, "Checking %s in alternatives of %zu:%s (prev %zu:%s)\n", + altword0, unsplit_word->node_num, unsplit_word->subword, + unsplit_word->prev[0]->node_num, unsplit_word->prev[0]->subword); + + for (n = unsplit_word->prev[0]->next; NULL != *n; n++) + { + if ((*n)->unsplit_word != unsplit_word) continue; + lgdebug(D_WSAA, "Comparing alt %s\n\\", (*n)->subword); + if ((0 == strcmp((*n)->subword, altword0) || + ((0 == strncmp((*n)->subword, altword0, strlen((*n)->subword))) && + !find_word_in_dict(dict, altword0)))) + { + lgdebug(+D_UN, "Preventing alt starts with %s due to existing %zu:%s\n", + altword0, (*n)->node_num, (*n)->subword); + return true; + } + } + return false; +} +#undef D_WSAA +#endif /* CHECK_DUPLICATE_ALTS */ + +/** + * Find if a suffix is of a contraction. + * XXX This is appropriate for English and maybe for some other languages, and + * may need a generalization. + * FIXME? Try to work-around the current need of this functions. + */ +static char const *contraction_char[] = { "'", "’" }; + +#if 0 +static bool is_contraction_suffix(const char *s) +{ + size_t len = strlen(s); + + for (size_t i = 0; i < ARRAY_SIZE(contraction_char); i++) + { + size_t cclen = strlen(contraction_char[i]); + if (len < cclen) continue; + if (0 == strncmp(s+len-cclen, contraction_char[i], cclen)) return true; + } + + return false; +} + +static bool is_contraction_prefix(const char *s) +{ + for (size_t i = 0; i < ARRAY_SIZE(contraction_char); i++) + { + size_t cclen = strlen(contraction_char[i]); + if (0 == strncmp(s, contraction_char[i], cclen)) return true; + } + return false; +} +#endif + +static bool is_contraction_word(Dictionary dict, const char *s) +{ + if (dict->affix_table && dict->affix_table->anysplit) + return false; + + for (size_t i = 0; i < ARRAY_SIZE(contraction_char); i++) + { + if (NULL != strstr(s, contraction_char[i])) return true; + } + return false; +} + +/* + * Return true iff the given word is an AFDICT_xPUNC. 
+ * + * FIXME: + * We cannot directly find if a word is an AFDICT_xPUNC, because + * we have no way to mark that in strip_left()/strip_right()/split_mpunc(), + * and we don't have a direct search function in the afdict (since it + * doesn't have an in-memory tree structure). + */ +static bool is_afdict_punc(const Dictionary afdict, const char *word) +{ + if (NULL == afdict) return false; + int punc_types[] = { AFDICT_RPUNC, AFDICT_MPUNC, AFDICT_LPUNC, 0 }; + + for (int affix_punc = 0; punc_types[affix_punc] != 0; affix_punc++) + { + const Afdict_class * punc_list = AFCLASS(afdict, punc_types[affix_punc]); + size_t l_strippable = punc_list->length; + const char * const * punc = punc_list->string; + + for (size_t i = 0; i < l_strippable; i++) + if (0 == strcmp(word, punc[i])) return true; + } + + return false; +} + +static bool regex_guess(Dictionary dict, const char *word, Gword *gword) +{ + const char *regex_name = match_regex(dict->regex_root, word); + if ((NULL != regex_name) && boolean_dictionary_lookup(dict, regex_name)) + { + gword->status |= WS_REGEX; + gword->regex_name = regex_name; + return true; + } + return false; +} + +#define PER_GWORD_FUNC(f) Gword *(f)(Sentence sent, Gword *w, unsigned int *arg) +/** + * Perform gword_func() on each gword of the given alternative. + */ +static Gword *for_word_alt(Sentence sent, Gword *altp, + PER_GWORD_FUNC(*gword_func), unsigned int *arg) +{ + if (NULL == altp) return NULL; + Gword *alternative_id = altp->alternative_id; + + for (; altp->alternative_id == alternative_id; altp = altp->next[0]) + { + if (NULL == altp) break; /* Just in case this is a dummy word. */ + + Gword *gw = gword_func(sent, altp, arg); + if (NULL != gw) return gw; + + + /* The alternative ends on one of these conditions: + * 1. A different word alternative_id (checked in the loop conditional). + * 2. No next word. + * 3. 
The word has been issued alone as its own alternative + * (In that case its alternative_id may belong to a previous + * longer alternative). + */ + if ((NULL == altp->next) || altp->issued_unsplit) + break; /* Only one token in this alternative. */ + } + + return NULL; +} + +/** + * Return the Gword at the requested position in the given alternative. + * @param w A pointer to the Gword at the start of the alternative. + * @param arg The requested position. This parameter is destroyed. + * @return The Gword at position arg, or NULL if not enough Gwords in the + * alternative. + */ +static PER_GWORD_FUNC(gword_by_ordinal_position) + //(Sentence sent, Gword *w, int *arg) +{ + if (0 == arg[0]) return w; + arg[0]--; + + return NULL; +} + +/** + * Set WS_INDICT / WS_REGEX if the word is in the dict / regex files. + * The first one which is found is set. + */ +static PER_GWORD_FUNC(set_word_status)//(Sentence sent, Gword *w, int *arg) +{ + int status = *arg; + switch (status) + { + case WS_INDICT|WS_REGEX: + if (!(w->status & (WS_INDICT|WS_REGEX))) + { + if (boolean_dictionary_lookup(sent->dict, w->subword)) + { + w->status |= WS_INDICT; + } + else + { + regex_guess(sent->dict, w->subword, w); + } + } + break; + +#if defined HAVE_HUNSPELL || defined HAVE_ASPELL + case WS_RUNON: + case WS_SPELL: + /* Currently used to mark words that are a result of a spelling. 
*/ + if ((w->status & WS_INDICT) && + !boolean_dictionary_lookup(sent->dict, w->subword)) + { + status &= ~WS_INDICT; + } + w->status |= status; + break; +#endif /* HAVE_HUNSPELL */ + + default: + assert(0, "set_dict_word_status: Invalid status 0x%x\n", status); + } + + lgdebug(+D_SW, "Word %s: status=%s\n", w->subword, gword_status(sent, w)); + + return NULL; +} + +static PER_GWORD_FUNC(set_tokenization_step) + //(Sentence sent, Gword *w, int *arg) +{ + set_word_status(sent, w, (unsigned int []){WS_INDICT|WS_REGEX}); + w->tokenizing_step = *arg; + + lgdebug(+D_SW, "Word %s: status=%s tokenizing_step=%d\n", + w->subword, gword_status(sent, w), w->tokenizing_step); + + return NULL; +} + +/** + * Prevent a further tokenization. + * To be used on terminal alternatives. + */ +void tokenization_done(Sentence sent, Gword *altp) +{ + for_word_alt(sent, altp, set_tokenization_step, (unsigned int[]){TS_DONE}); +} + +/** + * Issue candidate subwords for unsplit_word (an "alternative"). + * Issue prefnum elements from prefix, stemnum elements from stem, and suffnum + * elements from suffix. Mark the prefixes and suffixes with INFIX_MARK (the + * stems are assumed to be already marked with one of the STEMSUBSCR + * possibilities. Set the Morpheme_type of the subwords. + * + * The label is used in the wordgraph display, to indicate which section of + * tokenizing code has inserted the token. If its first character is + * REPLACEMENT_MARK, the token is not necessarily a substring of the word. This + * may happen with spell corrections and with the experimental "dictcap" + * feature, and is used for setting the word position. + * + * Return a pointer to the first word of the added alternative. + * + * TODO Support also middle morphemes if needed. 
+ */ +#define D_IWA 6 +Gword *issue_word_alternative(Sentence sent, Gword *unsplit_word, + const char *label, + int prefnum, const char * const *prefix, + int stemnum, const char * const *stem, + int suffnum, const char * const *suffix) +{ + int ai = 0; /* affix index */ + const char * const *affix; /* affix list pointer */ + const char * const * const affixlist[] = { prefix, stem, suffix, NULL }; + const int numlist[] = { prefnum, stemnum, suffnum }; + enum affixtype { PREFIX, STEM, SUFFIX, END }; + enum affixtype at; + const char infix_mark = INFIX_MARK(sent->dict->affix_table); + Gword *subword; /* subword of the current token */ + Gword *psubword = NULL; /* subword of the previous token */ + const int token_tot = prefnum + stemnum + suffnum; /* number of tokens */ + Morpheme_type morpheme_type; + Gword *alternative_id = NULL; /* to be set to the start subword */ + bool subword_eq_unsplit_word; + bool last_split = false; /* this is a final token */ + int *strlen_cache = alloca(token_tot * sizeof(int)); /* token length cache */ +#ifdef DEBUG + Gword *sole_alternative_of_itself = NULL; +#endif + + if (unsplit_word->split_counter > MAX_SPLITS) + { + prt_error("Error: Word %s reached %d splits. " + "It will not get split further. The result is undefined.\n" + "Run with !verbosity="STRINGIFY(D_SW)" to debug\n", + unsplit_word->subword, MAX_SPLITS); + unsplit_word->tokenizing_step = TS_DONE; + return NULL; + } + /* The incremented split_counter will be assigned to the created subwords. */ + + lgdebug(+D_IWA, "(%s) Gword %zu:%s split (split_counter=%zu) into", label, + unsplit_word->node_num, unsplit_word->subword, + unsplit_word->split_counter); + + /* Allocate memory which is enough for the longest token. */ + int maxword = 0; + for (ai = 0, at = PREFIX; at < END; at++) + { + int affixnum = numlist[at]; + char morpheme_sym[] = "pts"; + + /* This loop computes too things: + * 1. strlen_cache - Token lengths - up to a SUBSCRIPT_MARK if exists. + * 2. 
maxword - Maximum such token length. */ + for (affix = affixlist[at]; affixnum-- > 0; affix++, ai++) + { + strlen_cache[ai] = (int)strcspn(*affix, subscript_mark_str()); + //printf("'%s' strlen_cache[%d]=%d\n",*affix,ai,strlen_cache[ai]); + maxword = MAX(maxword, strlen_cache[ai]); + lgdebug(D_IWA, " %c:%s", morpheme_sym[at], + ('\0' == (*affix)[0]) ? "[null]" : *affix); + } + } + + char * const buff = alloca(maxword + 2); /* strlen + INFIX_MARK + NUL */ + const char *token; + + for (ai = 0, at = PREFIX; at < END; at++) + { + int affixnum = numlist[at]; + + for (affix = affixlist[at]; affixnum-- > 0; affix++, ai++) + { + token = *affix; /* avoid copying if possible */ + switch (at) + { + /* Mark the token with INFIX_MARK if needed. */ + case PREFIX: /* set to word= */ + if ('\0' != infix_mark) + { + size_t sz = strlen_cache[ai]; + memcpy(buff, *affix, sz); + buff[sz] = infix_mark; + buff[sz+1] = '\0'; + last_split = true; + token = buff; + } + if (is_contraction_word(sent->dict, unsplit_word->subword)) + morpheme_type = MT_CONTR; + else + morpheme_type = MT_PREFIX; + break; + case STEM: /* already word, word.=, word.=x */ + /* Stems are already marked with a stem subscript, if needed. + * The possible marks are set in the affix class STEMSUBSCR. */ + if (is_stem(token)) + { + morpheme_type = MT_STEM; + last_split = true; + } + else if (is_afdict_punc(sent->dict->affix_table, token)) + { + morpheme_type = MT_PUNC; + } + else + { + morpheme_type = MT_WORD; + } + break; + case SUFFIX: /* set to =word */ + /* XXX If the suffix starts with an apostrophe, don't mark it. + * Actually - any non-alpha is checked. The random-splitting + * "languages" always need the suffix marking. 
*/ + if (((NULL == sent->dict->affix_table->anysplit) && + ('\0' != (*affix)[0]) && + !is_utf8_alpha(*affix, sent->dict->lctype)) || + '\0' == infix_mark) + { + if (is_contraction_word(sent->dict, unsplit_word->subword)) + morpheme_type = MT_CONTR; + else + morpheme_type = MT_WORD; + break; + } + last_split = true; + buff[0] = infix_mark; + strcpy(&buff[1], *affix); + morpheme_type = MT_SUFFIX; + token = buff; + break; + case END: + assert(true, "affixtype END reached"); + } + +#ifdef CHECK_DUPLICATE_ALTS + /* FIXME Use another method instead of checking the label. */ + if ((0 == ai) && (1 < token_tot) && (label[0] == 'r') && + word_start_another_alternative(sent->dict, unsplit_word, token)) + { + /* When called due to left/right strip, the code shouldn't use the + * returned value due to the possibility of this returned NULL. */ + return NULL; + } +#endif /* CHECK_DUPLICATE_ALTS */ + + subword_eq_unsplit_word= (0 == strcmp(unsplit_word->subword, token)); + + if ((1 == token_tot) && subword_eq_unsplit_word) + { + /* Prevent adding a subword as a sole alternative to itself. */ + Gword **q; + + unsplit_word->issued_unsplit = true; + + /* + * If WS_HASALT is unset, then this is the first alternative. + */ + if (!(unsplit_word->status & WS_HASALT)) + { + /* The unsplit_word itself got issued here as the first + * alternative of itself. In order that it will not become the + * sole alternative of itself, just return. In + * remqueue_gword(), issue_word_alternative() is invoked + * again if needed - see the next comment. */ + word_label(sent, unsplit_word, "+", label); + word_label(sent, unsplit_word, NULL, "IU"); + lgdebug(D_IWA, " (issued_unsplit)\n"); + /* Note: The original morpheme_type is preserved. + * The morpheme_type value set above is just ignored. 
*/ + return unsplit_word; + } + + if (unsplit_word->status & WS_UNSPLIT) + { + /* If we are here, there is tokenization logic error in the + * program, as the word has been issued as an alternative of + * itself an additional time. If we proceed it would mess the + * Wordgraph pointers. Just warn (if verbosity>0) and return. + * The return value is not likely to be used in such a case, + * since this is an issuing of a single word. + * + * Note: In case a new tokenization logic permits adding a + * word more than once, just remove this warning. */ + if (0 < verbosity) + { + prt_error("Warning: Internal error: " + "word \"%s\" got issued more than once\n", + unsplit_word->subword); + } + return NULL; + } + + /* We arrive when a word is issued as an alternative of itself and + * it already has at least one another alternative. This may happen + * when the word is issued as a second and on alternative, or when + * we are invoked from remqueue_gword() if it finds that + * unsplit_word->issued_unsplit is true and there are + * alternatives. Due to the alternatives, the unsplit_word is not + * connected to the word flow. We reconnect it here to its former + * prev/next words so it will serve as an alternative too. 
*/ + + /* Scan its "prev" words and add it as their "next" word */ + for (q = unsplit_word->prev; *q; q++) + gwordlist_append(&(*q)->next, unsplit_word); + /* Scan its "next" words and add it as their "prev" word */ + for (q = unsplit_word->next; *q; q++) + gwordlist_append(&(*q)->prev, unsplit_word); + word_label(sent, unsplit_word, "+", label); + word_label(sent, unsplit_word, NULL, "R"); + unsplit_word->status |= WS_UNSPLIT; + + alternative_id = unsplit_word->alternative_id; +#ifdef DEBUG + sole_alternative_of_itself = unsplit_word; +#endif + lgdebug(D_IWA, " (reconnected)"); + + } + else + { + /* Add the token as a subword of this alternative */ + subword = gword_new(sent, token); + subword->unsplit_word = unsplit_word; + subword->split_counter = unsplit_word->split_counter + 1; + subword->morpheme_type = morpheme_type; + if (MT_PUNC == morpheme_type) /* It's a terminal token */ + tokenization_done(sent, subword); + + if (last_split) + { +#if 0 + /* XXX the new Turkish experimental dictionary depend on + * specifying compound suffixes which are not in the dict file, + * in the SUF affix class. This allows them to split farther. + * However, there is a need to detail all the supported + * combinations of compound suffixes. + * FIXME: There is a need for a real multi affix splitter. + * (last_split will get optimized out by the compiler.) */ + + /* This is a stem, or an affix which is marked by INFIX_MARK. + * Hence it must be a dict word - regex/spell are not done + * for stems/affixes. Also, it cannot split further. + * Save resources by marking it accordingly. */ + subword->status |= WS_INDICT; + subword->tokenizing_step = TS_DONE; +#endif + } + word_label(sent, subword, "+", label); + + /* If the subword is equal to the unsplit_word (may happen when the + * word is issued together with "virtual" morphemes) we should not + * queue it for further processing, in order to prevent an infinite + * loop. 
*/ + if (!subword_eq_unsplit_word) + gwordqueue_add(sent, subword); + + /* The spelling properties are inherited over morpheme split */ + if (unsplit_word->status & (WS_SPELL|WS_RUNON)) + subword->status |= unsplit_word->status & (WS_SPELL|WS_RUNON); + + if (0 == ai) /* first subword of this alternative */ + { + subword->start = unsplit_word->start; + if (REPLACEMENT_MARK[0] == label[0]) + { + /* This is a replacement word (a spell correction or a + * "feature" word). Set its end position to the whole + * unsplit_word. For "feature" words this may not be accurate, + * but it doesn't matter for now ("dictcap" is experimental). */ + subword->end = unsplit_word->end; + } + else + { + subword->end = subword->start + strlen_cache[ai]; + /* Account for case conversion length difference. */ + if (subword->status & WS_FIRSTUPPER) + { + int uclen = utf8_charlen(unsplit_word->subword); + int lclen = utf8_charlen(token); + if ((uclen > 0) && (lclen > 0)) + subword->end += uclen - lclen; + } + //printf(">>>SUBWORD '%s' %ld:%ld\n", subword->subword, subword->start-sent->orig_sentence, subword->end-sent->orig_sentence); + } + + if (unsplit_word->status & WS_FIRSTUPPER) + subword->status |= WS_FIRSTUPPER; + + /* Arrange for subword to be the "next" word of the previous + * words of unsplit_word. There are 2 cases: + * - If this is not the first alternative - add the subword to + * their "next" links. + * - If this is the first alternative - replace the "next" link + * pointing to unsplit_word with a link to subword, + * disconnecting unsplit_word from its RHS. */ + Gword **p; + + alternative_id = subword; + + //previous_wordgraph_nextalts(sent, unsplit_word, subword); + /* Scan the said previous words. 
*/ + for (p = unsplit_word->prev; NULL != *p; p++) + { + Gword **n; + + /* Create the "prev" link for subword */ + gwordlist_append(&subword->prev, *p); + + if (unsplit_word->status & WS_HASALT) + { + gwordlist_append(&(*p)->next, subword); + } + else + { + /* Scan the said "next" links */ + for(n = (*p)->next; NULL != *n; n++) + { + if (*n == unsplit_word) + { + /* Now finally replace the "next" link */ + *n = subword; + break; + } + } + assert(NULL != *n, "Adding subword '%s': " + "No corresponding next link for a prev link: " + "prevword='%s' word='%s'", + subword->subword, (*p)->subword, unsplit_word->subword); + } + } + } + + if (token_tot-1 == ai) /* last subword of this alternative */ + { + + /* Arrange for subword to be the "prev" word of the next words of + * unsplit_word. There are 2 cases: + * - If this is not the first alternative - add the subword to + * their "prev" links. + * - If this is the first alternative - replace the "prev" link + * pointing to unsplit_word with a link to subword, + * disconnecting unsplit_word from its LHS. + */ + Gword **n; + + //next_wordgraph_prevalts(sent, unsplit_word, subword); + /* Scan the said next words. 
*/ + for (n = unsplit_word->next; NULL != *n; n++) + { + Gword **p; + + /* Create the "next" link for subword */ + gwordlist_append(&subword->next, *n); + + if (unsplit_word->status & WS_HASALT) + { + gwordlist_append(&(*n)->prev, subword); + } + else + { + /* Scan the said "prev" links */ + for(p = (*n)->prev; NULL != *p; p++) + { + if (*p == unsplit_word) + { + /* Now finally replace the "prev" link */ + *p = subword; + break; + } + } + assert(NULL!=*p, + "Adding subword '%s': " + "No corresponding prev link for a next link" + "nextword='%s' word='%s'", + subword->subword, (*n)->subword, unsplit_word->subword); + } + + } + } + + if (0 < ai) /* not the first subword */ + { + if (REPLACEMENT_MARK[0] == label[0]) + { + subword->start = unsplit_word->start; + subword->end = unsplit_word->end; + } + else + { + subword->start = psubword->end; + subword->end = subword->start + strlen_cache[ai]; + } + + gwordlist_append(&psubword->next, subword); + gwordlist_append(&subword->prev, psubword); + } + + subword->alternative_id = alternative_id; + psubword = subword; + } + } + } + + unsplit_word->status |= WS_HASALT; + lgdebug(D_IWA, "\n"); + +#ifdef DEBUG + /* Check if the alternative that has just been added already exists. + * If it exists - just warn. */ + { + Gword **prev = unsplit_word->prev; + Gword *curr_alt = sole_alternative_of_itself ? 
+ sole_alternative_of_itself : alternative_id; + Gword **alts; + + assert(curr_alt, "'%s': No alt mark", unsplit_word->subword); + assert(prev, "'%s': No prev", unsplit_word->subword); + assert(prev[0], "'%s': No prev[0]", unsplit_word->subword); + assert(prev[0]->next, "%s': No next",prev[0]->subword); + assert(prev[0]->next[0], "'%s': No next[0]",prev[0]->subword); + for (alts = prev[0]->next; *alts; alts++) + { + if ((*alts)->unsplit_word != unsplit_word) continue; + + Gword *calt = curr_alt; /* check alternative */ + Gword *oalt; /* old alternatives */ + size_t token_no = token_tot; + + if (*alts == curr_alt) break; + for (oalt = *alts; token_no > 0; oalt = oalt->next[0]) + { + if (0 != (strcmp(oalt->subword, calt->subword))) + break; + calt = calt->next[0]; + token_no--; + } + if (token_tot) continue; + prt_error("Error: >>>DEBUG>>>: '%s' " + "(alternative start '%s', len=%d): " + "Alternative already exists!\n", + curr_alt->subword, unsplit_word->subword, token_tot); + } + } +#endif + + return alternative_id; +} +#undef D_IWA + +#define D_RWW 6 +static void remqueue_gword(const Sentence sent) +{ + word_queue_t *const wq = sent->word_queue; + assert(NULL!=wq, "Trying to remove a word from an empty word queue"); + Gword *w = wq->word; + + lgdebug(+D_RWW, "Word '%s'%s%s\n", w->subword, + w->issued_unsplit ? " issued_unsplit" : "", + w->status & WS_HASALT ? " WS_HASALT" : ""); + + /* If the word should have an alternative which includes itself, add it as an + * additional alternative (unless it has already been added, as indicated by + * WS_UNSPLIT). + * See the comments in issue_word_alternative() where remqueue_gword is + * mentioned. */ + if (w->issued_unsplit && (w->status & WS_HASALT) && !(w->status & WS_UNSPLIT)) + { + issue_word_alternative(sent, w, "RQ" ,0,NULL, 1,&w->subword, 0,NULL); + } + +#if WORDGRAPH_PARSER /* not defined */ + /* If the parsers are modified to work directly on the Wordgraph. 
*/ + build_expressions(wq->word); +#endif + + /* Finally, remove the word from the queue. */ + sent->word_queue = wq->next; + free(wq); +} +#undef D_RWW + +static Gword *wordgraph_getqueue_word(Sentence sent) +{ + Gword *w; + + if (NULL == sent->word_queue) return NULL; + w = sent->word_queue->word;; + + return w; +} + +static const char ** resize_alts(const char **arr, size_t len) +{ + arr = realloc(arr, (len+2) * sizeof(char *)); + arr[len+1] = NULL; + return arr; +} + +void altappend(Sentence sent, const char ***altp, const char *w) +{ + size_t n = altlen(*altp); + + *altp = resize_alts(*altp, n); + (*altp)[n] = string_set_add(w, sent->string_set); +} + +/* + Here's a summary of how subscripts are handled: + + Reading the dictionary: + + If the last "." in a string is followed by a non-digit character, + then the "." and everything after it is considered to be the subscript + of the word. + + Quote marks are used to allow you to define words in the dictionary + which would otherwise be considered part of the dictionary, as in + + ";": {@Xca-} & Xx- & (W+ or Qd+) & {Xx+}; + "%" : (ND- & {DD-} & & + ( or B*x+)) or (ND- & (OD- or AN+)); + + Rules for chopping words from the input sentence: + + First the prefix chars are stripped off of the word. These + characters are "(" and "$" (and now "``") + + Now, repeat the following as long as necessary: + + Look up the word in the dictionary. + If it's there, the process terminates. + + If it's not there and it ends in one of the right strippable + strings (see "strip_right") then remove the strippable string + and make it into a separate word. + + If there is no strippable string, then the process terminates. + + When does a word in the sentence match a word in the dictionary? + + The matching is done disregarding the subscript of the dictionary + word. This means a sentence word can match at most one dictionary word + that doesn't have a subscript, and many words that have a subscript. 
+ + So, under this system, the dictionary could have the words "Ill" and + also the word "Ill." It could also have the word "i.e.", which could be + used in a sentence. +*/ + +#ifdef DEBUG +/** + * Split special synthetic words, for Wordgraph handling debug. + * Word syntax (recursively): LABEL(WORD+WORD+...|WORD+...) + * Notations in the word syntax: + * +: a separator between words of the same alternative. + * |: a separator between alternatives. + * LABEL: (optional) mark the graph node by a name (for convenience). + * (): refers to the unsplit word, in order to generate it as an + * alternative to itself. E.g. (A|()) generates A as one alternative and the + * whole unsplit word as the other one. + * Example sentence: K Ax(BC((mD2+e+F)+(G+h)|(v+w)) C(3|J)) L (()|X+Y) + * If no split is needed, word syntax errors are silently ignored. + * Null-string subwords are not allowed, e.g.: A(|B) C(+) D(E|) + */ +static bool synthetic_split(Sentence sent, Gword *unsplit_word) +{ + const char *const w = unsplit_word->subword; + const char *c = w; + const char *s = w; + int plevel = 0; + const char **alts = NULL; + bool can_split = false; + const size_t len = strlen(c); + char *alt = alloca(len+1); +#define SYNTHSPLIT_ERROR(e) ("Error: synthetic_split(): word '%s':" e "\n") + + /* Synthetic sentences are marked by a special initial character. */ + if (SYNTHETIC_SENTENCE_MARK != sent->orig_sentence[0]) return false; + + assert(0 != len, "synthetic_split(): empty-string word"); + if (')' != w[len-1]) return false; /* no split needed (syntax not checked) */ + + do + { + switch (*c) + { + case '(': + if (0 == plevel) s = c + 1; + plevel++; + break; + case ')': + case '+': + case '|': + if (1 == plevel) + { + if (c == s) + { + prt_error(SYNTHSPLIT_ERROR("(empty subword)."), w); + goto error; + } + strncpy(alt, s, c-s); + alt[c-s] = '\0'; + if (0 == strcmp(alt, "()")) + { + /* The word is an alternative to itself. 
It is not going to + * loop due to a special handling in issue_word_alternative(). + */ + strcpy(alt, w); + } + altappend(sent, &alts, alt); + s = c + 1; + + if ('|' == *c) + { + if (alts) + issue_word_alternative(sent, unsplit_word, "SS", 0,NULL, + altlen(alts),alts, 0,NULL); + can_split = true; + free(alts); + alts = NULL; + } + } + if (')' == *c) plevel--; + break; + default: + if (!(((*c >= 'a') && (*c <= 'z')) || + ((*c >= 'A') && (*c <= 'Z')) || + ((*c >= '0') && (*c <= '9')) || + ('_' == *c))) + { + prt_error(SYNTHSPLIT_ERROR("('%c' not alphanumeric)."), w, *c); + goto error; + } + } + if (0 > plevel) + { + prt_error(SYNTHSPLIT_ERROR("extra ')'"), w); + goto error; + } + + } while ('\0' != *++c); + + if (0 < plevel) + { + prt_error(SYNTHSPLIT_ERROR("missing '('."), w); + goto error; + } + + if (alts) + { + issue_word_alternative(sent, unsplit_word, "SS", 0,NULL, + altlen(alts),alts, 0,NULL); + can_split = true; + } + +error: + free(alts); + return can_split; +} +#endif + +/** + * Add the given prefix, word and suffix as an alternative. + * If STEMSUBSCR is define in the affix file, use its values as possible + * subscripts for the word. In that case, if the word cannot be found in + * the dict with any of the given stem suffixes, the alternative is not + * valid and thus not added. + * + * If unsplit_word is null, this function actually only checks whether + * the alternative is valid as described above. This is used for checking + * is a spell guess result if valid if the word itself is not in the dict. + * FIXME: If a word can split it doesn't follow it is a "real" dictionary + * word, as there can still be no links between some of its parts. + * + * Return true if the alternative is valid, else false. 
+ */ +static bool add_alternative_with_subscr(Sentence sent, + Gword * unsplit_word, + const char * prefix, + const char * word, + const char * suffix) +{ + Dictionary dict = sent->dict; + Afdict_class * stemsubscr_list = + AFCLASS(dict->affix_table, AFDICT_STEMSUBSCR); + const char ** stemsubscr = stemsubscr_list->string; + size_t stemsubscr_count = stemsubscr_list->length; + bool word_is_in_dict = false; + bool issue_alternatives = (NULL != unsplit_word); + + if (0 == stemsubscr_count) + { + if (issue_alternatives) + { + word_is_in_dict = true; + issue_word_alternative(sent, unsplit_word, "AWS", + (prefix ? 1 : 0),&prefix, 1,&word, + (suffix ? 1 : 0),&suffix); + } + else + { + /* This is a compound-word spell check. Reject unknown words. + * XXX: What if the word is capitalized? */ + word_is_in_dict = boolean_dictionary_lookup(dict, word); + } + } + else + { + size_t si; + size_t wlen = strlen(word); + size_t slen = 0; + char *w; + + for (si = 0; si < stemsubscr_count; si++) + { + slen = MAX(slen, strlen(stemsubscr[si])); + } + w = alloca(wlen + slen + 1); + strcpy(w, word); + + for (si = 0; si < stemsubscr_count; si++) + { + strcpy(&w[wlen], stemsubscr[si]); + + /* We should not match regexes to stems. */ + if (boolean_dictionary_lookup(dict, w)) + { + word_is_in_dict = true; + if (issue_alternatives) + { + issue_word_alternative(sent, unsplit_word, "AWS", + (prefix ? 1 : 0),&prefix, 1,(const char **)&w, 1,&suffix); + } + } + } + } + + lgdebug(+D_SW,"Stem subscript not found: p:%s t:%s s:%s\n", + prefix ? prefix : "(none)", word, suffix ? suffix : "(none)"); + return word_is_in_dict; +} + +/** + * Split word into prefix, stem and suffix. + * It can also split contracted words (like he's). + * Alternatives are generated if issue_alternatives=true. + * Return value: + * + * The prefix code is only lightly validated by actual use. + * + * If unsplit_word is null, this function actually only checks whether + * the word can split. 
This is used for checking if a spell guess result is + * valid if the word itself is not in the dict. See also + * add_alternative_with_subscr(). + */ +static bool suffix_split(Sentence sent, Gword *unsplit_word, const char *w) +{ + int i, j; + Afdict_class *prefix_list, *suffix_list; + int p_strippable, s_strippable; + const char **prefix, **suffix; + const char *no_suffix = NULL; + bool word_can_split = false; + const Dictionary dict = sent->dict; + const char *wend = w + strlen(w); + char *newword = alloca(wend-w+1); + + /* Set up affix tables. */ + if (NULL == dict->affix_table) return false; + prefix_list = AFCLASS(dict->affix_table, AFDICT_PRE); + p_strippable = prefix_list->length; + prefix = prefix_list->string; + suffix_list = AFCLASS(dict->affix_table, AFDICT_SUF); + s_strippable = suffix_list->length; + suffix = suffix_list->string; + + if (INT_MAX == s_strippable) return false; + + /* Go through once for each suffix; then go through one + * final time for the no-suffix case (i.e. to look for + * prefixes only, without suffixes). */ + for (i = 0; i <= s_strippable; i++, suffix++) + { + bool did_split = false; + size_t suflen = 0; + if (i < s_strippable) + { + suflen = strlen(*suffix); + /* The remaining w is too short for a possible match. + * In addition, don't allow empty stems. */ + if ((wend-suflen) < (w+1)) continue; + + /* A lang like Russian allows empty suffixes, which have a real + * morphological linkage. In the following check, the empty suffix + * always matches. */ + if (0 == strncmp(wend-suflen, *suffix, suflen)) + { + size_t sz = (wend-w)-suflen; + strncpy(newword, w, sz); + newword[sz] = '\0'; + + /* Check if the remainder is in the dictionary. + * In case we try to split a contracted word, the first word + * may match a regex. Hence find_word_in_dict() is used and + * not boolean_dictionary_lookup(). + * Note: Not like a previous version, stems cannot match a regex + * here, and stem capitalization need to be handled elsewhere. 
*/ + if ((is_contraction_word(dict, w) && + find_word_in_dict(dict, newword)) || + boolean_dictionary_lookup(dict, newword)) + { + did_split = true; + word_can_split |= + add_alternative_with_subscr(sent, unsplit_word, + NULL, newword, *suffix); + } + } + } + else + { + suflen = 0; + suffix = &no_suffix; + } + + /* + * Try stripping off prefixes. Avoid double-counting and + * other trouble by doing this only if we split off a suffix, + * or if there is no suffix. + */ + if (did_split || 0==suflen) + { + for (j = 0; j < p_strippable; j++) + { + size_t prelen = strlen(prefix[j]); + /* The remaining w is too short for a possible match. + * NOTE: A zero length "stem" is not allowed here. In any + * case, it cannot be handled (yet) by the rest of the code. */ + if ((wend-w) - suflen <= prelen) continue; + if (strncmp(w, prefix[j], prelen) == 0) + { + size_t sz = MIN((wend-w) - suflen - prelen, MAX_WORD); + + strncpy(newword, w+prelen, sz); + newword[sz] = '\0'; + /* ??? Do we need a regex match? */ + if (boolean_dictionary_lookup(dict, newword)) + { + word_can_split |= + add_alternative_with_subscr(sent, unsplit_word, prefix[j], + newword, *suffix); + } + } + } + } + } + + return word_can_split; +} + +#define HEB_PRENUM_MAX 5 /* no more than 5 prefix "subwords" */ +#define HEB_UTF8_BYTES 2 /* Hebrew UTF8 characters are always 2-byte */ +#define HEB_CHAREQ(s, c) (strncmp(s, c, HEB_UTF8_BYTES) == 0) +/** + * Handle "formative letters" ב, ה, ו, כ, ל, מ, ש. + * Split word into multiple prefix "subwords" (1-3 characters each) + * and an unprefixed word (which must be in the dictionary or be null) + * in all possible ways (even when the prefix combination is not valid, + * the LG rules will resolve that). + * If the whole word (i.e. including the prefixes) is in the dictionary, + * the word will be added in separate_word(). + * Add all the alternatives. + * The assumptions used prevent a large number of false splits. + * They may be relaxed later. 
+ * + * XXX Because the grammatical rules of which prefixes are valid for the + * remaining word are not checked, non-existing words may get split. In such a + * case there is no opportunity for a regex or spell check of this unknown word. + * FIXME Before issuing an alternative, validate that the combination is + * supported by the dict. + * + * Note: This function currently does more than absolutely needed for LG, + * in order to simplify the initial Hebrew dictionary. + * It may be latter replaced by a simpler version. + * + * These algorithm is most probably very Hebrew-specific. + * These assumptions are used: + * - the prefix consists of subwords + * - longer subwords have priority over shorter ones + * - subwords in a prefix are unique ('ככ' is considered here as one "subword") + * - input words with length <= 2 don't have a prefix + * - each character uses 2 bytes (can be fixed) + * - the input word contains only Hebrew characters + * - the letter "ו" (vav) can only be the first prefix subword + * - if the last prefix subword is not "ו" and the word (length>2) starts + * with 2 "ו", the actual word to be looked up starts with one "ו" + * (see also TBD there) + * - a prefix can be stand-alone (an input word that consists of prefixes) + * + * To implement this function in a way which is appropriate for more languages, + * Hunspell-like definitions (but more general) are needed. 
+ */ +static bool mprefix_split(Sentence sent, Gword *unsplit_word, const char *word) +{ + int i; + Afdict_class *mprefix_list; + int mp_strippable; + const char **mprefix; + const char *newword; + const char *w; + int sz = 0; + bool word_is_in_dict = false; + int split_prefix_i = 0; /* split prefix index */ + const char *split_prefix[HEB_PRENUM_MAX]; /* the whole prefix */ + bool *pseen; /* prefix "subword" seen (not allowed again) */ + int pfound; /* index of longer prefix found at a prefix level */ + Dictionary dict = sent->dict; + int wordlen; + int wlen; + int plen = 0; + Gword *altp; + bool split_check = (NULL == unsplit_word); + + /* Set up affix table */ + if (NULL == dict->affix_table) return false; + mprefix_list = AFCLASS(dict->affix_table, AFDICT_MPRE); + mp_strippable = mprefix_list->length; + if (0 == mp_strippable) return false; + /* The mprefix list is revered-sorted according to prefix length. + * The code here depends on that. */ + mprefix = mprefix_list->string; + + pseen = alloca(mp_strippable * sizeof(*pseen)); + /* Assuming zeroed-out bytes are interpreted as false. */ + memset(pseen, 0, mp_strippable * sizeof(*pseen)); + + w = word; + wordlen = strlen(word); /* guaranteed < MAX_WORD by separate_word() */ + do + { + pfound = -1; + + for (i=0; i 0) && + HEB_CHAREQ(mprefix[i], "ו") && (HEB_CHAREQ(w, "ו"))) + { + continue; + } + + plen = strlen(mprefix[i]); + wlen = strlen(w); + sz = wlen - plen; + if (strncmp(w, mprefix[i], plen) == 0) + { + if (-1 == pfound) pfound = i; + newword = w + plen; + /* Check for non-vav before vav */ + if (!HEB_CHAREQ(mprefix[i], "ו") && (HEB_CHAREQ(newword, "ו"))) + { + /* Non-vav before a single-vav - not in a prefix */ + if (!HEB_CHAREQ(newword+HEB_UTF8_BYTES, "ו")) + continue; + + /* Non-vav before 2-vav */ + if (newword[HEB_UTF8_BYTES+1]) + newword += HEB_UTF8_BYTES; /* strip one 'ו' */ + /* TBD: check word also without stripping. 
*/ + } + pseen[i] = true; + split_prefix[split_prefix_i] = mprefix[i]; + if (0 == sz) /* stand-alone prefix */ + { + word_is_in_dict = true; + /* Add the prefix alone */ + lgdebug(+D_UN, "Whole-word prefix: %s\n", word); + if (split_check) return true; + altp = issue_word_alternative(sent, unsplit_word, "MPW", + split_prefix_i+1,split_prefix, 0,NULL, 0,NULL); + tokenization_done(sent, altp); + /* If the prefix is a valid word, + * It has been added in separate_word() as a word */ + break; + } + if (find_word_in_dict(dict, newword)) + { + word_is_in_dict = true; + lgdebug(+D_UN, "Splitting off a prefix: %.*s-%s\n", + wordlen-sz, word, newword); + if (split_check) return true; + altp = issue_word_alternative(sent, unsplit_word, "MPS", + split_prefix_i+1,split_prefix, 1,&newword, 0,NULL); + tokenization_done(sent, altp); + } + } + } + if ((-1 != pfound) && (i != pfound)) + { + /* A previous prefix is the longer one - use it */ + split_prefix[split_prefix_i] = mprefix[pfound]; + plen = strlen(mprefix[pfound]); + w += plen; + } +#if 0 + else + { + /* Uneeded? */ + w = newword; + } +#endif + split_prefix_i++; + /* "wlen + sz < wordlen" is true if a vav has been stripped */ + } while ((sz > 0) && (-1 != pfound) && (split_prefix_i < HEB_PRENUM_MAX)); + + return word_is_in_dict; +} + +/* Return true if the word might be capitalized by convention: + * -- if its the first word of a sentence + * -- if its the first word following a colon, a period, a question mark, + * or any bullet (For example: VII. Ancient Rome) + * -- if its the first word following an elipsis + * -- if its the first word of a quote + * + * XXX FIXME: These rules are rather English-centric. Someone should + * do something about this someday. 
+ */ +static bool is_capitalizable(const Dictionary dict, const Gword *word) +{ + /* Words at the start of sentences are capitalizable */ + if (MT_WALL == word->prev[0]->morpheme_type) return true; + if (MT_INFRASTRUCTURE == word->prev[0]->morpheme_type) return true; + + /* Words following colons are capitalizable. */ + /* Mid-text periods and question marks are sentence-splitters. */ + if (strcmp(":", word->prev[0]->subword) == 0 || + strcmp(".", word->prev[0]->subword) == 0 || + strcmp("...", word->prev[0]->subword) == 0 || + strcmp("…", word->prev[0]->subword) == 0 || + strcmp("?", word->prev[0]->subword) == 0 || + strcmp("!", word->prev[0]->subword) == 0 || + strcmp("?", word->prev[0]->subword) == 0 || + strcmp("!", word->prev[0]->subword) == 0 ) + return true; + if (in_afdict_class(dict, AFDICT_BULLETS, word->prev[0]->subword)) + return true; + if (in_afdict_class(dict, AFDICT_QUOTES, word->prev[0]->subword)) + return true; + + return false; +} + +#define D_MS 6 +/* + * Split the given word "word" to morphemes. + * If unsplit_word is not NULL then issue alternatives. + * Else only check the word can split (to validate a spell guess). + */ +static bool morpheme_split(Sentence sent, Gword *unsplit_word, const char *word) +{ + bool word_can_split; + + if (0 < AFCLASS(sent->dict->affix_table, AFDICT_MPRE)->length) + { + word_can_split = mprefix_split(sent, unsplit_word, word); + lgdebug(+D_MS, "Tried mprefix_split word=%s can_split=%d\n", + word, word_can_split); + } + else + { + word_can_split = suffix_split(sent, unsplit_word, word); + lgdebug(+D_MS, "Tried to split word=%s can_split=%d\n", + word, word_can_split); + + /* XXX WS_FIRSTUPPER marking is missing here! 
*/ + if ((NULL != unsplit_word) && is_utf8_upper(word, sent->dict->lctype) && + is_capitalizable(sent->dict, unsplit_word) && + !(unsplit_word->status & (WS_SPELL|WS_RUNON))) + { + int downcase_size = strlen(word)+MB_LEN_MAX+1; + char *const downcase = alloca(downcase_size); + + downcase_utf8_str(downcase, word, downcase_size, sent->dict->lctype); + word_can_split |= + suffix_split(sent, unsplit_word, downcase); + lgdebug(+D_MS, "Tried to split lc=%s now can_split=%d\n", + downcase, word_can_split); + } + } + + return word_can_split; +} + +#if defined HAVE_HUNSPELL || defined HAVE_ASPELL +static bool is_known_word(Sentence sent, const char *word) +{ + return (boolean_dictionary_lookup(sent->dict, word) || + morpheme_split(sent, NULL, word)); +} + +/** + * Try to spell guess an unknown word, and issue the results as alternatives. + * There are two kind of guesses: + * - Separating run-on words into an exact combination of words, usually 2. + * - Find similar words. These are limited to use_spell_guess alternatives. + * + * Return true if corrections have been issued, else false. + * + * Note: spellcheck_suggest(), which is invoked by this function, returns + * guesses for words containing numbers (including words consisting of digits + * only). Hence this function should not be called for such words. + * + * Note that a lowercase word can be spell-corrected to an uppercase word. + * FIXME? Should we allow that only if the lc version of the corrected word + * is the same? + */ +static bool guess_misspelled_word(Sentence sent, Gword *unsplit_word, + Parse_Options opts) +{ + Dictionary dict = sent->dict; + int runon_word_corrections = 0; + int num_guesses = 0; + int j, n; + char *sp = NULL; + const char *wp; + char **alternates = NULL; + const char *word = unsplit_word->subword; + + /* If the spell-checker knows about this word, and we don't ... + * Dang. We should fix it someday. Accept it as such. 
*/ + if (spellcheck_test(dict->spell_checker, word)) return false; + + /* Else, ask the spell-checker for alternate spellings + * and see if these are in the dict. */ + n = spellcheck_suggest(dict->spell_checker, &alternates, word); + if (verbosity_level(+D_SW)) + { + lgdebug(0, "spellcheck_suggest for %s:\\", word); + if (0 == n) + lgdebug(0, " (nothing)\n"); + else + lgdebug(0, "\n\\"); + + for (j=0; j= opts->use_spell_guess) break; + } + if (alternates) spellcheck_free_suggest(dict->spell_checker, alternates, n); + + return ((num_guesses > 0) || (runon_word_corrections > 0)); +} +#endif /* HAVE_HUNSPELL */ + +static int split_mpunc(Sentence sent, const char *word, char *w, + stripped_t stripped) +{ + const Dictionary afdict = sent->dict->affix_table; + const Afdict_class * mpunc_list; + const char * const * mpunc; + size_t l_strippable; + int n_stripped = 0; + + if (NULL == afdict) return 0; + mpunc_list = AFCLASS(afdict, AFDICT_MPUNC); + l_strippable = mpunc_list->length; + mpunc = mpunc_list->string; + + strcpy(w, word); + + // +1: mpunc in start position is not allowed + for (char *sep = w+1; '\0' != *sep; sep++) + { + for (size_t i = 0; i < l_strippable; i++) + { + size_t sz = strlen(mpunc[i]); + if (0 == strncmp(sep, mpunc[i], sz)) + { + if ('\0' == sep[sz]) continue; // mpunc in end position + + lgdebug(D_UN, "w='%s' found mpunc '%s'\n", w, mpunc[i]); + + if (sep != w) + { + *sep = '\0'; + if (n_stripped >= MAX_STRIP-1) goto max_strip_ovfl; + stripped[n_stripped++] = w; + } + if (n_stripped >= MAX_STRIP-1) goto max_strip_ovfl; + stripped[n_stripped++] = mpunc[i]; + + w = sep + sz; + sep += sz - 1; + break; + } + } + } + + if (n_stripped > 0) stripped[n_stripped++] = w; + + return n_stripped; + +max_strip_ovfl: + lgdebug(+D_SW, "Too many tokens (>%d)\n", MAX_STRIP); + return 0; +} + +/** + * Strip off punctuation, etc. on the left-hand side. 
+ */ +static const char *strip_left(Sentence sent, const char * w, + stripped_t stripped, + size_t *n_stripped) +{ + const Dictionary afdict = sent->dict->affix_table; + const Afdict_class * lpunc_list; + const char * const * lpunc; + size_t l_strippable; + size_t i; + + if (NULL == afdict) return (w); + lpunc_list = AFCLASS(afdict, AFDICT_LPUNC); + l_strippable = lpunc_list->length; + lpunc = lpunc_list->string; + + *n_stripped = 0; + + do + { + for (i=0; idict; + Dictionary afdict = dict->affix_table; + const char * temp_wend = *wend; + char *word = alloca(temp_wend-w+1); + size_t sz; + size_t i; + size_t nrs = 0; + size_t len = 0; + + Afdict_class *rword_list; + size_t rword_num; + const char * const * rword; + + if (*n_stripped >= MAX_STRIP-1) + return false; + + assert(temp_wend>w, "strip_right: unexpected empty-string word"); + if (NULL == afdict) return false; + + rword_list = AFCLASS(afdict, classnum); + rword_num = rword_list->length; + rword = rword_list->string; + + do + { + size_t altn = 0; + + for (i = 0; i < rword_num; i++) + { + const char *t = rword[i]; + + /* Units contain a subscript mark. Punctuation do not contain it. + * Find the token length, but stop at the subscript mark if exists. */ + len = strcspn(t, subscript_mark_str()); + + /* The remaining word is too short for a possible match */ + if ((temp_wend-w) < (int)len) continue; + + if (strncmp(temp_wend-len, t, len) == 0) + { + if (0 == altn) + { + lgdebug(+D_UN, "%d: %s: w='%s' rword '%.*s' at stripped[0,%zu]\n", + p, afdict_classname[classnum], temp_wend-len, (int)len, t, nrs); + stripped[1][*n_stripped+nrs] = NULL; + if (SUBSCRIPT_MARK == t[len]) + { + /* stripped[0][] are the unsubscripted word parts. */ + stripped[0][*n_stripped+nrs] = + string_set_add(strndupa(t, len), sent->string_set); + } + else + { + /* This is an unsubscripted token. 
We are not going to + * have alternatives to it.*/ + stripped[0][*n_stripped+nrs] = t; + nrs++; + temp_wend -= len; + break; + } + altn = 1; + } + /* The stripped[1..MAX_STRIP_ALT-1][] elements are subscripted. */ + lgdebug(+D_UN, "%d: %s: w='%s' rword '%s' at stripped[%zu,%zu]\n", + p, afdict_classname[classnum], temp_wend-len, t, altn, nrs); + stripped[altn][*n_stripped+nrs] = t; + if (altn < MAX_STRIP_ALT-1) + stripped[altn+1][*n_stripped+nrs] = NULL; + + /* Note: rword_list is reverse-sorted by len. */ + if ((i+1 < rword_num) && (0 == strncmp(rword[i+1], rword[i], len))) + { + /* Next rword has same base word, different subscript. + * Assign it in the next loop round as an alternative. + * To that end altn is incremented but not nrs. */ + altn++; + if (altn >= MAX_STRIP_ALT) + { + /* It is not supposed to happen... */ + lgdebug(+1, "Warning: Ignoring %s: Too many %.*s units (>%d)\n", + rword[i], (int)len, rword[i], MAX_STRIP_ALT); + break; + } + } + else + { + nrs++; + temp_wend -= len; + break; + } + } + } + } while ((i < rword_num) && (temp_wend > w) && rootdigit && + (*n_stripped+nrs < MAX_STRIP)); + assert(w <= temp_wend, "A word should never start after its end..."); + + sz = temp_wend-w; + if ((0 == sz) && (1 == nrs)) + return false; /* No need to strip off anything. */ + + strncpy(word, w, sz); + word[sz] = '\0'; + + /* If there is a non-null root, we require that it ends with a number, + * to ensure we stripped off all units. This prevents striping + * off "h." from "20th.". + * FIXME: is_utf8_digit(temp_wend-1, dict) here can only check ASCII digits, + * since it is invoked with the last byte... 
*/ + if (rootdigit && (sz > 0) && !is_utf8_digit(temp_wend-1, dict->lctype)) + { + lgdebug(+D_UN, "%d: %s: return FALSE; root='%s' (%c is not a digit)\n", + p, afdict_classname[classnum], word, temp_wend[-1]); + return false; + } + + lgdebug(+D_UN, "%d: %s: return %s; n_stripped=%zu+%zu, " + "wend='%s' temp_wend='%s'\n", + p, afdict_classname[classnum], (nrs>0)?"TRUE":"FALSE", + *n_stripped, nrs, *wend, temp_wend); + + *n_stripped += nrs; + *wend = temp_wend; + return nrs > 0; +} + +/** + * Issue an alternative that starts with w and continue with stripped[]. + * If wend is NULL, w is Null-terminated. + */ +static void issue_r_stripped(Sentence sent, + Gword *unsplit_word, + const char *w, + const char *wend, + const stripped_t r_stripped[], + size_t n_stripped, + const char *label) +{ + const size_t sz = (NULL==wend) ? strlen(w) : (size_t)(wend-w); + char *word; + const char **rtokens = NULL; + size_t ntokens = 0; + size_t i; + size_t altn = 0; + Gword *rstrip_alt; + + if (0 != sz) + { + /* Issue the root word as the first token. */ + word = strndupa(w, sz); + altappend(sent, &rtokens, word); + lgdebug(+D_SW, "Issue root word w='%s' (alt %s)\n", word, label); + ntokens++; + } + + /* Reverse r_stripped, because the tokens got stripped from right to left. */ + for (i = n_stripped-1; (ssize_t)i >= 0; i--) + { + lgdebug(+D_SW, "Issue r_stripped w='%s' at [0,%zu] (%s)\n", + r_stripped[altn][i], i, label); + altappend(sent, &rtokens, r_stripped[altn][i]); + ntokens++; + } + rstrip_alt = issue_word_alternative(sent, unsplit_word, label, + 0,NULL, ntokens,rtokens, 0,NULL); + + for_word_alt(sent, rstrip_alt, set_word_status, + (unsigned int []){WS_INDICT|WS_REGEX}); + + /* Issue additional alternatives if exist. + * The tokens are scanned from last to first because the original + * alternative gets shortened by inserting the additional alternatives + * (if scanned from first to last - for_word_alt() would not be able to + * reach beyond the last insertion point). 
*/ + for (i = 0; i < n_stripped; i++) + { + unsigned int position = (int)(n_stripped - i - 1); /* Value destroyed. */ + + if (ntokens > n_stripped) position++; /* Account for a root word. */ + Gword *add_alt = for_word_alt(sent, rstrip_alt, gword_by_ordinal_position, + &position); + if (NULL == add_alt) + { + lgdebug(+1, "Warning: Internal error - r_striped alt too short.\n"); + return; /* Avoid a crash if this ever happens. */ + } + add_alt->tokenizing_step = TS_DONE; /* Last tokenization step follows. */ + + char *replabel = NULL; /* GCC: may be used uninitialized. */ + if (NULL != r_stripped[1][i]) + { + /* We are going to issue a subscripted word which is not a + * substring of it's unsplit_word. For now, the token position + * computation code needs an indication for that. */ + replabel = strdupa(label); + replabel[0] = REPLACEMENT_MARK[0]; + } + for (size_t n = 1; n < MAX_STRIP_ALT; n++) + { + if (NULL == r_stripped[n][i]) break; + + lgdebug(+D_SW, "Issue r_stripped w='%s' at [%zu,%zu] (%s)\n", + r_stripped[n][i], n, i, replabel); + Gword *altp = issue_word_alternative(sent, add_alt, replabel, + 0,NULL, 1,&r_stripped[n][i], 0,NULL); + tokenization_done(sent, altp); /* ... since it is subscripted */ + } + } + + free(rtokens); +} + +static void issue_dictcap(Sentence sent, bool is_cap, + Gword *unsplit_word, const char *word) +{ + const char *dictcap[2]; + Gword *altp; + + dictcap[0] = is_cap ? CAP1st : CAPnon; + dictcap[1] = word; + lgdebug(+D_SW, "Adding %s word=%s RE=%s\n", dictcap[0], word, + NULL == unsplit_word->regex_name ? 
"" : unsplit_word->regex_name); + altp = issue_word_alternative(sent, unsplit_word, REPLACEMENT_MARK "dictcap", + 0,NULL, 2,dictcap, 0,NULL); + + if (NULL == altp) + { + prt_error("Warning: Word %s: Internal error: Issuing %s failed\n", + dictcap[1], dictcap[0]); + return; + } + /* Set the dictcap[0] word fields */ + altp->status |= WS_INDICT; /* already checked to be in the dict */ + altp->morpheme_type = MT_FEATURE; + altp->tokenizing_step = TS_DONE; /* no further tokenization */ + + /* Set the alternative word fields. */ + if(is_cap && (NULL != unsplit_word->regex_name)) + { + /* This is the uc word. */ + altp->next[0]->status |= WS_REGEX; + altp->next[0]->regex_name = unsplit_word->regex_name; + /* issue_word_alternative() will mark it as TS_DONE because it appears in + * an alternative of itself. */ + } + else + { + /* This is the lc version. The original word can be restored later, if + * needed, through the unsplit word. */ + altp->status |= WS_FIRSTUPPER; + } +} + +/* r_stripped debug printout */ +static const char *print_rev_word_array(Sentence sent, const char **w, + size_t size) +{ + dyn_str *s = dyn_str_new(); + int i; + const char *r; + + for (i = size - 1; i >= 0; i--) + append_string(s, "[%d]='%s'%s", i, w[i], i>0 ? "," : ""); + + r = string_set_add(s->str, sent->string_set); + dyn_str_delete(s); + return r; +} + +/** + * Check if the word is capitalized according to the regex definitions. + * XXX Not nice - try to avoid the need of using it. + */ +static bool is_re_capitalized(const char *regex_name) +{ + return ((NULL != regex_name) && (NULL != strstr(regex_name, "CAPITALIZED"))); +} + +/** + * Separate a word to subwords in all the possible ways. + * unsplit_word is the current Wordgraph word to be separated to subwords. + * This function splits up the word if necessary, and calls + * "issue_word_alternatives()" on each of the resulting parts ("subwords"), + * creating an "alternative" to the original unsplit_word. 
+ * + * This is used to, e.g, split Russian words into stem+suffix, issuing a + * separate subword for each. In addition, there are many English + * constructions that need splitting: + * + * 86mm -> 86 + mm (millimeters, measurement) + * $10 -> $ + 10 (dollar sign plus a number) + * Surprise! -> surprise + ! (pry the punctuation off the end of the word) + * you've -> you + 've (undo contraction, treat 've as synonym for 'have') + * + * The original separate_word() function directly created the 2D-word-array used + * by the parser. This version of separate_word() is a rewrite that creates a + * word graph, referred in the comments as Wordgraph. It is later converted to + * the said 2D-word-array by flatten_wordgraph(). + * + * The current separate_word() code is still too similar to the old one, even + * though some principles of operation are radically different: the separated + * subwords are now put in a central word queue, from which they are pulled out + * one by one. If a word is marked by TS_DONE, it will be removed from + * the word queue without further processing. + * + * The function gets each word in the queue, separates it to subwords and create + * alternatives from each such separation, until all the separating + * possibilities are exhausted. + * + * FIXME: The old code, although working, is convoluted and contains redundant + * parts. It needs much cleanup efforts, also to make it more flexible and + * efficient, and at the same time prevent extra splitting (i.e. prevent issuing + * alternatives which create graph paths with the same sequence of subwords as + * existing parallel graph paths). + * A test case: By the '50s, he was very prosperous. + * + * XXX This function is being rewritten (work in progress). 
+ */ +static void separate_word(Sentence sent, Gword *unsplit_word, Parse_Options opts) +{ + Dictionary dict = sent->dict; + bool word_is_known = false; + bool word_can_split; + bool word_can_lrmsplit = false; /* This is needed to prevent spelling on + * compound subwords, like "Word." while + * still allowing capitalization handling + * and regex match. */ + bool lc_word_is_in_dict = false; + bool stripped; + const char *wp; + const char *temp_wend; + + size_t n_stripped = 0; + stripped_t x_stripped; /* these were stripped from the left/middle */ + + /* For units alternative */ + const char *units_wend = NULL; /* end of string consisting of units */ + size_t units_n_stripped = 0; + + size_t sz = strlen(unsplit_word->subword); + const char *word = unsplit_word->subword; + const char *wend = &unsplit_word->subword[sz]; + + /* Dynamic allocation of working buffers. */ + int downcase_size = sz+MB_LEN_MAX+1; /* pessimistic max. size of dc buffer */ + char *const downcase = alloca(downcase_size); /* downcasing buffer */ + char *const temp_word = alloca(downcase_size); /* tmp word buffer */ + char *const seen_word = alloca(downcase_size); /* loop-prevention buffer */ + + downcase[0] = '\0'; + + lgdebug(+D_SW, "Processing word: '%s'\n", word); + + if (boolean_dictionary_lookup(dict, word)) + { + lgdebug(+D_SW, "0: Adding '%s' as is, before split tries, status=%s\n", + word, gword_status(sent, unsplit_word)); + issue_word_alternative(sent, unsplit_word, "W", 0,NULL, 1,&word, 0,NULL); + unsplit_word->status |= WS_INDICT; + word_is_known = true; + } + + if (unsplit_word->status & (WS_SPELL|WS_RUNON)) + { + /* The word is a result of spelling, so it doesn't need right/left + * stripping. Skip it. */ + } + else + { + if ((MT_CONTR == unsplit_word->morpheme_type)) + { + /* The word is the contracted part of a contraction. 
It was most + * probably been marked as dict word by the check above (unless there + * is a definition error and it is only PRE or SUF without being in the + * dict). + * It should also not pass any more handling, so return here. + * Especially it should not pass right-strip. Else y' gets split to + * y ' and 'll gets split as units to ' l l + * FIXME This prevents separating double contraction (that still may + * not be done even otherwise). + * http://en.wiktionary.org/wiki/Category:English_double_contractions*/ + if (!word_is_known) + { + /* Note: If we are here it means dict->affix_table is not NULL. */ + prt_error("Warning: Contracted word part %s is in '%s/%s' " + "but not in '%s/%s'\n", word, + dict->lang, dict->affix_table->name, + dict->lang, dict->name); + } + return; + } + + /* + * This is essentially the old LR stripping code, from the pre-Wordgraph + * version. It still seems to work fine. Work should be done here in + * order to simplify it. + */ + + wp = strip_left(sent, word, x_stripped, &n_stripped); + if (wp != word) + { + /* If n_stripped exceed max, the "word" is most likely includes a long + * sequence of periods. Just accept it as an unknown "word", + * and move on. + * FIXME: Word separation may still be needed, e.g. for a table of + * contents: + * ............................something + * FIXME: "return" here prevents matching a regex. + */ + if (n_stripped >= MAX_STRIP-1) + { + lgdebug(+D_SW, "Left-strip of >= %d tokens\n", MAX_STRIP-1); + return; /* XXX */ + } + + if ('\0' != *wp) + x_stripped[n_stripped++] = wp; + + issue_word_alternative(sent, unsplit_word, "rL", + 0,NULL, n_stripped,x_stripped, 0,NULL); + + /* Its possible that the token consisted entirely of + * left-punctuation, in which case, wp is an empty-string. + * In case this is a single token (n_stripped == 1), we have + * to continue processing, because it may match a regex. */ + if ('\0' == *wp && n_stripped != 1) + { + /* Suppose no more alternatives in such a case. 
*/ + lgdebug(+D_SW, "1: Word '%s' all left-puncts - done\n", + unsplit_word->subword); + return; + } + + n_stripped = 0; + word_can_lrmsplit = true; + } + + lgdebug(+D_SW, "1: Continue with word %s status=%s\n", + word, gword_status(sent, unsplit_word)); + + /* Strip off punctuation and units, etc. on the right-hand side. Try + * rpunc, then units, then rpunc, then units again, in a loop. We do this + * to handle expressions such as 12sqft. or 12lbs. (notice the period at + * end). That is, we want to strip off the "lbs." with the dot, first, + * rather than stripping the dot as punctuation, and then coming up + * empty-handed for "sq.ft" (without the dot) in the dict. But if we are + * NOT able to strip off any units, then we try punctuation, and then + * units. This allows commas to be removed (e.g. 7grams,). */ + + stripped_t r_stripped[MAX_STRIP_ALT]; + + seen_word[0] = '\0'; + do + { + int temp_n_stripped; + /* First, try to strip off a single punctuation, typically a comma or + * period, and see if the resulting word is in the dict (but not the + * regex). This allows "sin." and "call." to be recognized. If we don't + * do this now, then the next stage will split "sin." into + * seconds-inches, and "call." into calories-liters. */ + temp_n_stripped = n_stripped; + temp_wend = wend; + stripped = strip_right(sent, word, &wend, r_stripped, &n_stripped, + AFDICT_RPUNC, /*rootdigit*/false, 2); + if (stripped) + { + /* "wend" points to the end of the remaining word. */ + sz = wend-word; + strncpy(temp_word, word, sz); + temp_word[sz] = '\0'; + + /* If the resulting word is in the dict, we are done. */ + if (boolean_dictionary_lookup(dict, temp_word)) break; + /* Undo the check. */ + wend = temp_wend; + n_stripped = temp_n_stripped; + } + + /* Remember the results, for a potential alternative. */ + units_wend = wend; + units_n_stripped = n_stripped; + + /* Strip off all units, if possible. It is not likely that we strip + * here a string like "in." 
which is not a unit since we require a + * number before it when only a single component is stripped off. */ + stripped = strip_right(sent, word, &wend, r_stripped, &n_stripped, + AFDICT_UNITS, /*rootdigit*/true, 3); + if (!stripped) + { + units_wend = NULL; + /* Try to strip off punctuation, typically a comma or period. */ + stripped = strip_right(sent, word, &wend, r_stripped, &n_stripped, + AFDICT_RPUNC, /*rootdigit*/false, 4); + } + + /* w points to the remaining word, + * "wend" to the end of the word. */ + sz = wend-word; + strncpy(temp_word, word, sz); + temp_word[sz] = '\0'; + + /* Avoid an infinite loop in case of a repeating unknown remaining word */ + if (0 == strcmp(temp_word, seen_word)) break; + strcpy(seen_word, temp_word); + + /* Any remaining dict word stops the right-punctuation stripping. */ + } while (NULL == units_wend && stripped && (sz != 0) && + !boolean_dictionary_lookup(dict, temp_word)); + + lgdebug(+D_SW, "After strip_right: n_stripped=(%s) " + "word='%s' wend='%s' units_wend='%s' temp_word='%s'\n", + print_rev_word_array(sent, r_stripped[0], n_stripped), + word, wend, units_wend, temp_word); + + /* If n_stripped exceed max, the "word" most likely includes a long + * sequence of periods. Just accept it as an unknown "word", + * and move on. + * FIXME: Word separation may still be needed, e.g. for a table of + * contents: + * 10............................ + */ + if (n_stripped >= MAX_STRIP-1) + { + lgdebug(+D_SW, "Right-strip of >= %d tokens\n", MAX_STRIP-1); + return; /* XXX */ + } + + /* Check whether the "word" is in the dict (including + * regex). In such a case we need to generate an alternative. This happens + * if it is a part number, like "1234-567A". 
+ */ + + if (units_n_stripped && (NULL != units_wend) && (0 != units_wend-word)) + { + /* units found */ + sz = units_wend-word; + strncpy(temp_word, word, sz); + temp_word[sz] = '\0'; + + if (find_word_in_dict(dict, temp_word)) + { + issue_r_stripped(sent, unsplit_word, temp_word, NULL, + r_stripped, units_n_stripped, "rR2"); + word_can_lrmsplit = true; + } + } + + + /* Add the strip result as an alternative if one of these conditions is + * true: + * - If the root word (temp_word) is known. + * - If the unsplit_word is unknown. This happens with an unknown word + * that has punctuation after it). */ + if (n_stripped > 0) + { + sz = wend-word; + strncpy(temp_word, word, sz); + temp_word[sz] = '\0'; + + if (!find_word_in_dict(dict, unsplit_word->subword) || + (0 == sz) || find_word_in_dict(dict, temp_word)) + { + issue_r_stripped(sent, unsplit_word, temp_word, NULL, + r_stripped, n_stripped, "rR3"); + word_can_lrmsplit = true; + } + } + } + + n_stripped = split_mpunc(sent, word, temp_word, x_stripped); + if (n_stripped > 0) + { + issue_word_alternative(sent, unsplit_word, "M", 0,NULL, + n_stripped,x_stripped, 0,NULL); + word_can_lrmsplit = true; + } + + lgdebug(+D_SW, "2: Continue with word=%s can_lrmsplit=%d status=%s\n", + word, word_can_lrmsplit, gword_status(sent, unsplit_word)); + + /* Generate random morphology */ + if ((dict->affix_table && dict->affix_table->anysplit) && !word_can_lrmsplit) + anysplit(sent, unsplit_word); + + /* OK, now try to strip affixes. */ + word_can_split = morpheme_split(sent, unsplit_word, word); + + /* If the word is unknown, then try to guess its category by regexes. + * A word that cannot split is considered known, unless it is a contraction, + * in which case we need a regex for things like 1960's. + * The first regex which matches (if any) is used. + * An alternative consisting of the word has already been generated. 
*/ + if (!word_is_known && (!word_can_split || is_contraction_word(dict, word))) + { + regex_guess(dict, word, unsplit_word); + /* Even if a regex matches, don't set word_is_known=true yet. */ + } + + lgdebug(+D_SW, "After split step, word=%s can_split=%d is_known=%d RE=%s\n", + word, word_can_split, word_is_known, + (NULL == unsplit_word->regex_name) ? "" : unsplit_word->regex_name); + + /* FIXME: Handling of capitalized words that are a result of spelling. */ + if (is_utf8_upper(word, dict->lctype)) + { + if (!test_enabled("dictcap")) + { + /** Hard-coded English-centric capitalization handling. + * + * FIXME: Capitalization handling should be done using the dict. + * + * If the word is capitalized, then issue as alternatives: + * - Issue its lowercase version if it is in a capitalizable + * position and also it is in the dict. + * - Issue it (capitalized) too as a word to regex (so the + * capitalized-words regex disjuncts will be used), in these + * conditions (cumulative): + * -- It could not be split (else capitalization has been + * handled XXX). + * -- It is not in the dict (it has already been issued in + * that case). + * -- It is not in a capitalizable position in the sentence. + * -- Its lowercase version is in the dict file (not regex) and + * it is an entity (checked capitalized) or a common entity + * (checked as lowercase). + * + * Comments from a previous release: + * + * * Common entity (checked as lowercase): This allows common + * nouns and adjectives to be used for entity names: e.g. "Great + * Southern Union declares bankruptcy", allowing Great to be + * capitalized, while preventing an upper-case "She" being used + * as a proper name in "She declared bankruptcy". + * + * * Entity (checked capitalized): We need to *add* Sue.f (female + * name Sue) even though sue.v (the verb "to sue") is in the + * dict. So test for capitalized entity names. 
FIXME: [ap] + * Since capitalized words which are in the dict file are now + * issued anyway as uppercase, and the capitalized-words regexes + * are not marked in the dict as entities, this may have effect + * only for capitalized words that match non-capitalized-words + * regexes that are marked as entities. I don't know about such, + * and if there are indeed no such regexes, it looks like the + * is_entity() check is redundant. A test "is_entity" added + * below to check if there is any sentence in the batches that + * contradicts that. + */ + bool word_is_capitalizable = is_capitalizable(dict, unsplit_word); + + if ('\0' == downcase[0]) + downcase_utf8_str(downcase, word, downcase_size, dict->lctype); + lc_word_is_in_dict = boolean_dictionary_lookup(dict, downcase); + + if (word_is_capitalizable) + { + if (lc_word_is_in_dict) + { + /* Issue the lowercase version of the word. */ + Gword *lc; + + wp = downcase; + lgdebug(+D_SW, "Adding lc=%s is_capitalizable=1\n", wp); + lc = issue_word_alternative(sent, unsplit_word, "LC", + 0,NULL, 1,&wp, 0,NULL); + if (NULL == lc) + { + prt_error("Warning: Word %s: Internal error: Issuing lc failed\n", + wp); + return; + } + /* This is the lc version. The original word can be restored + * later, if needed, through the unsplit word. */ + lc->status |= WS_FIRSTUPPER; + } + else /* for a comment */ + { + /* If we are here, it is a capitalized word in a capitalized + * position which its lowercase version is not in the dict file. + * Should we try a regex match if the word is unknown? 
*/ + } + } + + lgdebug(+D_SW, "Word=%s lc=%s in_dict=%d is_known=%d can_split=%d " + "is_capitalizable=%d lc_is_in_dict=%d " + "is_entity=%d is_common_entity=%d\n", + word, downcase, !!(unsplit_word->status & WS_INDICT), + word_is_known, word_can_split, + word_is_capitalizable, lc_word_is_in_dict, + is_entity(dict, word), is_common_entity(dict, downcase)); + + if (!word_can_split && !word_is_known && + (!word_is_capitalizable || (lc_word_is_in_dict && + (is_common_entity(dict, downcase) || is_entity(dict, word))))) + { + /* Issue it (capitalized) too */ + if ((NULL != unsplit_word->regex_name)) + { + lgdebug(+D_SW, "Adding uc word=%s RE=%s\n", word, + unsplit_word->regex_name); + issue_word_alternative(sent, unsplit_word, "REuc", + 0,NULL, 1,&word, 0,NULL); + word_is_known = true; + + if (test_enabled("is_entity") && is_entity(dict, word)) + prt_error("is_entity(%s): %s\n", word, sent->orig_sentence); + } + } + word_is_known |= lc_word_is_in_dict; + } + else + { + /* + * Experimental dictionary handling for capitalized words. + */ + + if (!boolean_dictionary_lookup(dict, CAP1st) || + !boolean_dictionary_lookup(dict, CAPnon)) + { + /* FIXME Move this check. Make it once. */ + prt_error("Error: Missing " CAP1st "/" CAPnon "in the dict\n"); + return; + } + + /* - If the (uc) word is in the dict, it has already been issued. + * - If the word is not a capitalized word according to the regex file, + * it also should not be issued, even if is_utf8_upper(word, dict), + * e.g Y'gonna or Let's. */ + if (!(unsplit_word->status & WS_INDICT) && + is_re_capitalized(unsplit_word->regex_name)) + { + issue_dictcap(sent, /*is_cap*/true, unsplit_word, word); + } + + downcase_utf8_str(downcase, word, downcase_size, dict->lctype); + /* Issue the lc version if it is known. + * FIXME? Issuing only known lc words prevents using the unknown-word + * device for words in capitalizable position (when the word is a uc + * version of an unknown word). 
*/ + if (find_word_in_dict(sent->dict, downcase)) + issue_dictcap(sent, /*is_cap*/false, unsplit_word, downcase); + + word_is_known = true; /* We could just return */ + } + } + + /* Handle regex match. This is done for words which are not in the dict + * and cannot morpheme split. + * + * Contracted words, like 1960's should be tried - words that contain + * punctuation are not going to match). + * + * However, capital LR-split words which their lc version is in the dict, + * such as "As" (gets split to A s) shouldn't be tried here, as their + * capitalization handling has already been handled before we arrived here, + * and if a capital-word regex has not been issued there, we should prevent + * issuing it here. */ + if (!(word_is_known || lc_word_is_in_dict || + (word_can_split && !is_contraction_word(dict, word)))) + { + if ((NULL != unsplit_word->regex_name)) + { + lgdebug(+D_SW, "Adding word '%s' for regex, match=%s\n", + word, unsplit_word->regex_name); + issue_word_alternative(sent, unsplit_word, "RE", + 0,NULL, 1,&word, 0,NULL); + + word_is_known = true; /* make sure we skip spell guess */ + } + } + + word_is_known |= word_can_split; + +#if defined HAVE_HUNSPELL || defined HAVE_ASPELL + /* If the word is unknown, then it might be a run-on of two words or a + * misspelled word. Ask the spell-checker to split the word, if possible, + * and/or offer guesses. + * + * Do all of this only if the word is not a proper name, and if + * spell-checking is enabled and spell-checker is specified. A word which + * contains digits is considered a proper name (maybe of a part number). + * + * ??? Should we add spell guesses as alternatives in case: + * 1. The word if not in the main dict but matches a regex. + * 2. The word an unknown capitalized word. 
+ */ + if (!word_can_lrmsplit && !word_is_known && + !contains_digits(word, dict->lctype) && + !is_proper_name(word, dict->lctype) && + opts->use_spell_guess && dict->spell_checker) + { + bool spell_suggest = guess_misspelled_word(sent, unsplit_word, opts); + lgdebug(+D_SW, "Spell suggest=%d\n", spell_suggest); + } +#endif /* defined HAVE_HUNSPELL || defined HAVE_ASPELL */ + + lgdebug(+D_SW, "END: Word '%s' in_dict=%d is_known=%d status=%s\n", + unsplit_word->subword, !!(unsplit_word->status & WS_INDICT), + word_is_known, gword_status(sent, unsplit_word)); +#if 0 + if (!word_is_known && + !(unsplit_word->status & (WS_INDICT|WS_REGEX))) + unsplit_word->status |= WS_UNKNOWN; +#endif +} + +/** + * Make the string 's' be the next word of the sentence. + * + * Do not issue the empty string. + * + * FIXME: We need it for now in order to initially populate the wordgraph. + */ +static Gword *issue_sentence_word(const Sentence sent, const char *const s) +{ + Gword *new_word; + Gword *last_word = sent->last_word; + + assert(NULL!=last_word); + assert(NULL!=s, "subword must not be NULL"); + assert('\0'!=s[0], "subword must not be an empty-string: " + "Last subword issued: '%s'", last_word->subword); + + new_word = gword_new(sent, s); + new_word->unsplit_word = sent->wordgraph; + new_word->label = "S"; /* a sentence word */ + + gwordlist_append(&last_word->next, new_word); + gwordlist_append(&new_word->prev, last_word); + + gwordqueue_add(sent, new_word); + + return new_word; +} + +static void add_gword(Sentence sent, const char *w, const char *wend, + Morpheme_type morpheme_type) +{ + const size_t sz = (NULL == wend) ? 
strlen(w) : (size_t)(wend - w); + char *const word = alloca(sz+1); + Gword *new_word; + + strncpy(word, w, sz); + word[sz] = '\0'; + + new_word = issue_sentence_word(sent, word); + new_word->morpheme_type = morpheme_type; + new_word->alternative_id = sent->wordgraph; + if (NULL != wend) + { + new_word->start = w; + new_word->end = wend; + } + if (MT_WORD != morpheme_type) + { + /* Skip tokenizing this word */ + new_word->tokenizing_step = TS_DONE; + if (MT_WALL == morpheme_type) + { + new_word->status |= WS_INDICT; + if (MT_INFRASTRUCTURE == new_word->prev[0]->morpheme_type) + new_word->start = sent->orig_sentence; + else + new_word->start = sent->orig_sentence + strlen(sent->orig_sentence); + new_word->end = new_word->start; + } + } +} + +/** + * Create the first node of the wordgraph. + * Its subword is the whole original sentence. + * It also serves as a leading dummy "word" for the sentence, like a "margin + * mark", in order to simplify the wordgraph pointer manipulation (no need to + * check for NULL "prev" pointer). The unsplit_word field is NULL only for the + * leading and trailing margin words. + */ +static void wordgraph_create(Sentence const sent) +{ + Gword *new_word; + + assert(NULL==sent->last_word, "wordgraph_create(): wordgraph exists"); + new_word = gword_new(sent, sent->orig_sentence); + + assert(NULL!=sent->orig_sentence, "wordgraph_create()"); + assert(NULL==sent->wordgraph, "wordgraph_create(): wordgraph exists"); + + sent->wordgraph = sent->last_word = new_word; + new_word->label = "D"; /* dummy word */ + new_word->morpheme_type = MT_INFRASTRUCTURE; +} + +/** + * Create a trailing dummy "word" for the sentence. + * It is a sentence "margin" trailing mark, in order to simplify the wordgraph + * pointer manipulation (no need to check for NULL "next" pointer). + * The unsplit_word field is NULL only for the margin words. 
*/ +static void wordgraph_terminator(Sentence const sent) +{ + assert(NULL != sent->last_word, "wordgraph_terminator(): no wordgraph"); + add_gword(sent, "(T)", NULL, MT_INFRASTRUCTURE); /* cannot use "" */ + sent->last_word->unsplit_word = NULL; /* no unsplit word */ + sent->last_word->label = "D"; /* dummy word */ + sent->last_word->tokenizing_step = TS_DONE; /* not to be tokenized */ +} + +// If TOLERATE_BAD_UTF is defined, then we will attempt to parse +// sentences containing UTF-8 garbage, treating the garbage bytes +// as individual words. Else, just print an error, and punt. +#define TOLERATE_BAD_UTF +#ifdef TOLERATE_BAD_UTF + #define BAD_UTF { nb = 0; word_start ++; continue; } +#else + #define BAD_UTF goto failure; +#endif + +/** + * The string s has just been read in from standard input. + * This function breaks it up into words and stores these words in + * the sent->word[] array. Returns true if all is well, false otherwise. + */ +bool separate_sentence(Sentence sent, Parse_Options opts) +{ + const char * word_end; + //bool quote_found; + Dictionary dict = sent->dict; + mbstate_t mbs; + const char * word_start = sent->orig_sentence; + Gword *word; + + sent->length = 0; + + wordgraph_create(sent); + + if (dict->left_wall_defined) + add_gword(sent, LEFT_WALL_WORD, NULL, MT_WALL); + + /* Reset the multibyte shift state to the initial state */ + memset(&mbs, 0, sizeof(mbs)); + +#ifdef DEBUG + /* Skip a synthetic sentence mark, if any. See synthetic_split(). */ + if (SYNTHETIC_SENTENCE_MARK == sent->orig_sentence[0]) word_start++; +#endif + + for(;;) + { + wchar_t c; + int nb = mbrtowc(&c, word_start, MB_CUR_MAX, &mbs); + if (0 > nb) BAD_UTF; + + while (is_space(c, dict->lctype)) + { + word_start += nb; + nb = mbrtowc(&c, word_start, MB_CUR_MAX, &mbs); + if (0 == nb) break; + if (0 > nb) BAD_UTF; + } + + if ('\0' == *word_start) break; + + /* Loop over non-blank characters until word-end is found. 
*/ + word_end = word_start; + nb = mbrtowc(&c, word_end, MB_CUR_MAX, &mbs); + if (0 > nb) BAD_UTF; + while (!is_space(c, dict->lctype) && (c != 0) && (0 < nb)) + { + word_end += nb; + nb = mbrtowc(&c, word_end, MB_CUR_MAX, &mbs); + if (0 > nb) break; + } + if (0 > nb) BAD_UTF; + + /* FIXME: Morpheme type of initial bad-sentence word may be wrong. + * E.g: He 's here. (Space before ' so 's is classified as MT_WORD). */ + add_gword(sent, word_start, word_end, MT_WORD); + word_start = word_end; + if ('\0' == *word_start) break; + } + + if (dict->right_wall_defined) + add_gword(sent, RIGHT_WALL_WORD, NULL, MT_WALL); + + wordgraph_terminator(sent); + + while ((word = wordgraph_getqueue_word(sent))) + { + if (TS_DONE == word->tokenizing_step) + { + remqueue_gword(sent); + continue; + } + + /* Perform prefix, suffix splitting, if needed */ +#ifdef DEBUG + if (SYNTHETIC_SENTENCE_MARK == sent->orig_sentence[0]) + synthetic_split(sent, word); +#else + if (0) + ; +#endif + else + separate_word(sent, word, opts); + + word->tokenizing_step = TS_DONE; + } + + /* Return true if at least one sentence word has been issued */ + for (word = sent->wordgraph; NULL != word->next; word = word->next[0]) + { + if ((word->morpheme_type != MT_INFRASTRUCTURE) && + (word->morpheme_type != MT_WALL)) + { + return true; + } + } + + /* Something is wrong */ + wordgraph_delete(sent); + return false; + +#ifndef TOLERATE_BAD_UTF +failure: +#ifdef _WIN32 + prt_error("Unable to process UTF8 input string.\n"); +#else + prt_error("Unable to process UTF8 input string in current locale %s\n", + nl_langinfo(CODESET)); +#endif + wordgraph_delete(sent); + return false; +#endif // TOLERATE_BAD_UTF +} + +static Word *word_new(Sentence sent) +{ + const size_t len = sent->length; + + sent->word = realloc(sent->word, (len+1)*sizeof(*sent->word)); + sent->word[len].d= NULL; + sent->word[len].x= NULL; + sent->word[len].unsplit_word = NULL; + sent->word[len].alternatives = NULL; + sent->word[len].optional = false; + 
sent->length++; + + return &sent->word[len]; +} + +/** + * build_word_expressions() -- build list of expressions for a word. + * + * Looks up a word in the dictionary, fetching from it matching words and their + * expressions. Returns NULL if it's not there. If there, it builds the list + * of expressions for the word, and returns a pointer to it. + * The subword of Gword w is used for this lookup, unless the subword is + * explicitly given as parameter s. The subword of Gword w is always used as + * the base word for each expression, and its subscript is the one from the + * dictionary word of the expression. + */ +static X_node * build_word_expressions(Sentence sent, const Gword *w, const char *s) +{ + Dict_node * dn, *dn_head; + X_node * x, * y; + Exp_list eli; + const Dictionary dict = sent->dict; + + eli.exp_list = NULL; + dn_head = dictionary_lookup_list(dict, NULL == s ? w->subword : s); + x = NULL; + dn = dn_head; + while (dn != NULL) + { + y = (X_node *) xalloc(sizeof(X_node)); + y->next = x; + x = y; + x->exp = copy_Exp(dn->exp); + if (NULL == s) + { + x->string = dn->string; + } + else + { + dyn_str *xs = dyn_str_new(); + const char *sm = strrchr(dn->string, SUBSCRIPT_MARK); + + dyn_strcat(xs, w->subword); + if (NULL != sm) dyn_strcat(xs, sm); + x->string = string_set_add(xs->str, sent->string_set); + dyn_str_delete(xs); + } + x->word = w; + dn = dn->right; + } + free_lookup_list (dict, dn_head); + free_Exp_list(&eli); + return x; +} + +/** + * Build the expression lists for a given word at the current word-array word. + * + * The resulted word-array is later used as an input to the parser. + * + * Algorithm: + * Apply the following step to all words w: + * - If w is in the dictionary, use it. + * - Else if w is identified by regex matching, use the appropriately + * matched disjunct collection. + * - Otherwise w is unknown - use the disjunct collection of UNKNOWN_WORD. 
+ * + * FIXME For now, also add an element to the alternatives array, so the rest of + * program will work fine (print_sentence_word_alternatives(), + * sentence_in_dictionary(), verr_msg()). + */ +#define D_X_NODE 9 +#define D_DWE 8 +static bool determine_word_expressions(Sentence sent, Gword *w, + unsigned int *ZZZ_added) +{ + Dictionary dict = sent->dict; + const size_t wordpos = sent->length - 1; + + const char *s = w->subword; + X_node * we = NULL; + + lgdebug(+D_DWE, "Word %zu subword %zu:'%s' status %s", + wordpos, w->node_num, s, gword_status(sent, w)); + if (NULL != sent->word[wordpos].unsplit_word) + lgdebug(D_DWE, " (unsplit '%s')", sent->word[wordpos].unsplit_word); + + /* Generate an "alternatives" component. */ + altappend(sent, &sent->word[wordpos].alternatives, s); + w->sent_wordidx = wordpos; + + if (w->status & WS_INDICT) + { + we = build_word_expressions(sent, w, NULL); + } + else if (w->status & WS_REGEX) + { + we = build_word_expressions(sent, w, w->regex_name); + } + else + { +#ifdef DEBUG + if (boolean_dictionary_lookup(dict, w->subword)) + { + prt_error("Error: Word '%s': Internal error: Known word is unknown\n", + w->subword); + } +#endif /* DEBUG */ + if (dict->unknown_word_defined && dict->use_unknown_word) + { + we = build_word_expressions(sent, w, UNKNOWN_WORD); + assert(we, UNKNOWN_WORD " supposed to be defined in the dictionary!"); + w->status |= WS_UNKNOWN; + } + else + { + /* The word is unknown, but UNKNOWN_WORD cannot be used. + * An error message will eventually be printed. */ + prt_error("Error: Word '%s': word is unknown\n", w->subword); + return false; + } + } + +#ifdef DEBUG + assert(NULL != we, "Word '%s': NULL X-node", w->subword); +#else + if (NULL == we) + { + /* FIXME Change it to assert() when the Wordgraph version is mature. 
*/ + prt_error("Error: Word '%s': Internal error: NULL X_node", w->subword); + return false; + } +#endif + + /* If the current word is an empty-word (or like it), add a + * connector for an empty-word (EMPTY_CONNECTOR - ZZZ+) to the + * previous word. See the comments at add_empty_word(). + * As a shortcut, only the first x-node is checked here for ZZZ-, + * supposing that the word has it in all of its dict entries + * (in any case, currently there is only 1 entry for each such word). + * Note that ZZZ_added starts by 0 and so also wordpos, and that the + * first sentence word (usually LEFT-WALL) doesn't need a check. */ + if ((wordpos != *ZZZ_added) && is_exp_like_empty_word(dict, we->exp)) + { + lgdebug(D_DWE, " (has ZZZ-)"); + add_empty_word(dict, sent->word[wordpos-1].x); + *ZZZ_added = wordpos; /* Remember it for not doing it again */ + } + lgdebug(D_DWE, "\n"); + + /* At last .. concatenate the word expressions we build for + * this alternative. */ + sent->word[wordpos].x = catenate_X_nodes(sent->word[wordpos].x, we); + if (verbosity_level(D_X_NODE)) + { + /* Print the X_node details for the word. */ + prt_error("Debug: Tokenize word/alt=%zu/%zu '%s' re=%s\n\\", + wordpos, altlen(sent->word[wordpos].alternatives), s, + w->regex_name ? w->regex_name : ""); + while (we) + { + prt_error("Debug: string='%s' expr=", we->string); + print_expression(we->exp); + we = we->next; + } + } + + return true; +} +#undef D_DWE + +#if 0 /* unused */ +/** + * Find whether w1 and w2 have been generated together in the same alternative. 
+ */ +static bool is_alternative_next_word(const Gword *w1, const Gword *w2) +{ + assert(NULL != w1->alternative_id, "Word '%s' NULL alternative_id", + w1->subword); + lgdebug(+6, "w1='%s' (%p=%s) w2='%s' (%p=%s) \n", + w1->subword, w1->alternative_id, w1->alternative_id->subword, + w2->subword, w2->alternative_id, w2->alternative_id->subword); + return (w1->alternative_id == w2->alternative_id); +} +#endif + +#ifdef FIXIT /* unused */ +/* XXX WS_UNSPLIT */ +static bool same_unsplit_word(Sentence sent, const Gword *w1, const Gword *w2) +{ + return ((w1->unsplit_word == w2->unsplit_word) && + (w1->unsplit_word != sent->wordgraph)); +} +#endif + +#define D_WPP 8 +static void print_wordgraph_pathpos(const Wordgraph_pathpos *wp) +{ + size_t i = 0; + + if (NULL == wp) + { + lgdebug(+D_WPP, "Empty\n"); + return; + } + lgdebug(+D_WPP, "\n"); + for (; NULL != wp->word; wp++) + { + lgdebug(D_WPP, "%zu: %zu:word '%s', same=%d used=%d level=%zu\n", + i++, wp->word->node_num, wp->word->subword, wp->same_word, + wp->used, wp->word->hier_depth); + } +} +#undef D_WPP + +/** + * "Flatten" the wordgraph into a word array. + * Return false if an error was encountered. + */ +#define D_FW 8 +bool flatten_wordgraph(Sentence sent, Parse_Options opts) +{ + Wordgraph_pathpos *wp_new = NULL; + Wordgraph_pathpos *wp_old = NULL; + Wordgraph_pathpos *wpp_new, *wpp_old; + Gword *wg_word; /* A wordgraph word */ + Gword **next; /* The next words */ + const Gword *last_unsplit_word = NULL; + size_t max_words = 0; + bool error_encountered = false; + bool right_wall_encountered = false; + unsigned int ZZZ_added = 0; /* ZZZ+ has been added to previous word */ + + assert(0 == sent->length, "flatten_wordgraph(): Word array already exists."); + + /* Establish an upper bound on the total number of words, to prevent an + * infinite loop in case of a bug. At the same time, calculate the + * hierarchy position of the word. 
*/ + for (wg_word = sent->wordgraph->chain_next; wg_word; + wg_word = wg_word->chain_next) + { + wordgraph_hier_position(wg_word); + max_words++; + } + + /* Populate the pathpos word queue */ + for (next = sent->wordgraph->next; *next; next++) + { + wordgraph_pathpos_add(&wp_new, *next, + false/* used */, false/* same_word */, + true/* diff_alternative */); + } + + /* Scan the wordgraph and flatten it. */ + do + { + Word *wa_word; /* A word-array word (for the parsing stage) */ + const Gword *unsplit_word; + + assert(NULL != wp_new, "pathpos word queue is empty"); + wp_old = wp_new; + wp_new = NULL; + print_wordgraph_pathpos(wp_old); + + /* Add a new word to the sentence word array. */ + assert(0 < max_words, "Too many words (it may be an infinite loop)"); + max_words--; /* For the assert() above */ + wa_word = word_new(sent); + + /* Find the sentence word. */ + unsplit_word = wp_old->word; + if (MT_INFRASTRUCTURE != unsplit_word->morpheme_type) + { + unsplit_word = wg_get_sentence_word(sent, (Gword *)unsplit_word); + if (unsplit_word != last_unsplit_word) + { + /* This is a new sentence word - use it as the unsplit word. */ + wa_word->unsplit_word = unsplit_word->subword; + last_unsplit_word = unsplit_word; + } + } + + /* Generate the X-nodes. */ + for (wpp_old = wp_old; NULL != wpp_old->word; wpp_old++) + { + wg_word = wpp_old->word; + if (NULL == wg_word->next) continue; /* XXX avoid termination */ + + if (wpp_old->same_word) + { + /* We haven't advanced to the next wordgraph word, so its X-node + * has already been generated in a previous word of the word + * array. This means we are in a longer alternative which has + * "extra" words that may not have links, and this is one of + * them. Mark it as "optional", so we consider that while + * parsing, and then remove it in case it doesn't have links. */ + sent->word[sent->length - 1].optional = true; + } + else + { + /* Words are not supposed to get issued more than once. 
*/ + assert(!wpp_old->used, "Word %zu:%s has been used", + wg_word->node_num, wpp_old->word->subword); + + /* This is a new wordgraph word. + */ + assert(!right_wall_encountered, "Extra word"); + if (!determine_word_expressions(sent, wg_word, &ZZZ_added)) + error_encountered = true; + if ((MT_WALL == wg_word->morpheme_type) && + 0== strcmp(wg_word->subword, RIGHT_WALL_WORD)) + right_wall_encountered = true; + wpp_old->used = true; + } + } + + /* Scan the old pathpos queue, and check for which words we can advance + * in the wordgraph. Do it in two passes: + * 1. Advance to next words that are next in the alternative of old + * words. + * 2. Advance to next words that are in a different alternative than the + * words that are already in the new pathpos queue. + */ + + for (wpp_old = wp_old; NULL != wpp_old->word; wpp_old++) + { + wg_word = wpp_old->word; + if (NULL == wg_word->next) continue; /* XXX avoid termination word */ + + /* Here wg_word->next cannot be NULL. */ + assert(NULL != wg_word->next[0], "Bad wordgraph: " + "'%s'->next[0]==NULL", wg_word->subword); + assert((NULL != wg_word->next[0]->prev) && + (NULL != wg_word->next[0]->prev[0]), "Bad wordgraph: " + "'%s'->next[0]: No prev", wg_word->subword); + + for (next = wg_word->next; NULL != *next; next++) + { + if (wg_word->hier_depth <= (*next)->hier_depth && + (NULL == (*next)->prev[1])) + { + lgdebug(+D_FW, "Word %zu:%s(%zu) next %zu:%s(%zu) next_ok\n", + wg_word->node_num, wg_word->subword, wg_word->hier_depth, + (*next)->node_num, (*next)->subword, (*next)->hier_depth); + wpp_old->next_ok = true; + break; + } + } + + if (wpp_old->next_ok) + { + lgdebug(+D_FW, "Advancing %zu:%s next_ok\n", wg_word->node_num, + wg_word->subword); + for (next = wg_word->next; NULL != *next; next++) + { + wordgraph_pathpos_add(&wp_new, *next, + false/* used */, false/* same_word */, + true/* diff_alternative */); + } + } + } + + for (wpp_old = wp_old; NULL != wpp_old->word; wpp_old++) + { + wg_word = wpp_old->word; + + if 
(!wpp_old->next_ok) /* next_ok words have got handled above */ + { + bool same_alternative = false; + + if (NULL == wg_word->next) continue; /* termination word */ + + if (NULL != wp_new) + { + for (next = wg_word->next; NULL != *next; next++) + { + for (wpp_new = wp_new; NULL != wpp_new->word; wpp_new++) + { + if ((wpp_new->word != *next) && + in_same_alternative(wpp_new->word, *next)) + { + lgdebug(+D_FW, "same_alternative: %zu:%s and %zu:%s\n", + wpp_new->word->node_num, wpp_new->word->subword, + (*next)->node_num, (*next)->subword); + same_alternative = true; + break; + } + } + if (same_alternative) break; /* shortcut */ + } + } + + /* If there are already words in the pathpos queue from the same + * alternative of the common ancestor of the next word, we cannot + * put it yet in the queue, because we should not put in the same + * slot of the word-array, words from the same alternative since + * else only one of them can be chosen by the linkage. Hence put + * again in the pathpos queue the current word, marking it was + * "same_word". This will cause generation of an empty word in the + * next round. 
*/ + lgdebug(+D_FW, "Advancing %zu:%s: ", wg_word->node_num, + wg_word->subword); + + if (same_alternative) + { + lgdebug(D_FW, "No (same alt) used=%d\n", wpp_old->used); + wordgraph_pathpos_add(&wp_new, wg_word, + wpp_old->used, true/* same_word */, + true/* diff_alternative */); + } + else + { + bool added = false; + + for (next = wg_word->next; NULL != *next; next++) + added |= wordgraph_pathpos_add(&wp_new, *next, + false/* used */, + false/* same_word */, + true/* diff_alternative */); + if (added) + { + lgdebug(D_FW, "Yes\n"); + } + else + { + lgdebug(D_FW, "No (existing)\n"); + } + } + } + } + + free(wp_old); + } while ((NULL != wp_new[1].word) || + (wp_new[0].word->morpheme_type != MT_INFRASTRUCTURE)); + + wp_new[0].word->sent_wordidx = sent->length; + free(wp_new); + lgdebug(+D_FW, "sent->length %zu\n", sent->length); + if (verbosity_level(D_SW)) + { + dyn_str *s = dyn_str_new(); + print_sentence_word_alternatives(s, sent, true, NULL, NULL); + char *out = dyn_str_take(s); + prt_error("Debug: Sentence words and alternatives:\n%s", out); + free(out); + } + + return !error_encountered; +} +#undef D_FW + +/** + * This just looks up all the words in the sentence, and builds + * up an appropriate error message in case some are not there. + * It has no side effect on the sentence. Returns true if all + * went well. + * + * This code is called only if the 'unknown-words' flag is set. 
+ */ +bool sentence_in_dictionary(Sentence sent) +{ + bool ok_so_far; + size_t w; + const char * s; + Dictionary dict = sent->dict; + char temp[1024]; + + ok_so_far = true; + for (w=0; w<sent->length; w++) + { + size_t ialt; + for (ialt=0; NULL != sent->word[w].alternatives[ialt]; ialt++) + { + s = sent->word[w].alternatives[ialt]; + if (!find_word_in_dict(dict, s)) + { + if (ok_so_far) + { + safe_strcpy(temp, "The following words are not in the dictionary:", sizeof(temp)); + ok_so_far = false; + } + safe_strcat(temp, " \"", sizeof(temp)); + safe_strcat(temp, s, sizeof(temp)); + safe_strcat(temp, "\"", sizeof(temp)); + } + } + } + if (!ok_so_far) + { + err_ctxt ec = { sent }; + err_msgc(&ec, lg_Error, "Sentence not in dictionary\n%s\n", temp); + } + return ok_so_far; +} diff -Nru link-grammar-5.3.16/link-grammar/tokenize/tokenize.h link-grammar-5.5.0/link-grammar/tokenize/tokenize.h --- link-grammar-5.3.16/link-grammar/tokenize/tokenize.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/tokenize.h 2018-01-24 01:19:58.000000000 +0000 @@ -0,0 +1,32 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#ifndef _TOKENIZE_H +#define _TOKENIZE_H + +#include "api-types.h" +#include "link-includes.h" + +bool separate_sentence(Sentence, Parse_Options); +bool sentence_in_dictionary(Sentence); +bool flatten_wordgraph(Sentence, Parse_Options); +void wordgraph_delete(Sentence); +void tokenization_done(Sentence, Gword *); + +void altappend(Sentence, const char ***, const char *); + +Gword *issue_word_alternative(Sentence sent, Gword *unsplit_word, + const char *label, + int prefnum, const char * const *prefix, + int stemnum, const char * const *stem, + int suffnum, const char * const *suffix); +#endif /* _TOKENIZE_H */ diff -Nru link-grammar-5.3.16/link-grammar/tokenize/tok-structures.h link-grammar-5.5.0/link-grammar/tokenize/tok-structures.h --- link-grammar-5.3.16/link-grammar/tokenize/tok-structures.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/tok-structures.h 2017-12-02 07:30:28.000000000 +0000 @@ -0,0 +1,189 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2013 Linas Vepstas */ +/* Copyright (c) 2014 Amir Plivatsky */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#ifndef _TOK_STRUCTURES_H_ +#define _TOK_STRUCTURES_H_ + +#include +#include "api-types.h" +#include "link-includes.h" + +// TODO provide gword access methods! 
+ +/* conditional compiling flags */ +#define INFIX_NOTATION + /* If defined, then we're using infix notation for the dictionary */ + /* otherwise we're using prefix notation */ + +/* An ordered set of gword pointers, used to indicate the source gword + * (Wordgraph word) of disjuncts and connectors. Usually it contains only + * one element. However, when a duplicate disjunct is eliminated (see + * eliminate_duplicate_disjuncts()) and it originated from a different + * gword (a relatively rare event) its gword is added to the gword_set of + * the remaining disjunct. A set of 3 elements is extremely rare. The + * original order is preserved, in a hope for better caching on + * alternatives match checks in fast-match.c. + * + * Memory management: + * A copy-on-write semantics is used when constructing a new gword_set. It + * means that all the gword sets with one element are shared. These gword + * sets are part of the Gword structure. Copied and added element are + * alloc'ed and chained. The result is that the chain_next of the gword + * sets that are part of each gword contains the list of alloc'ed elements - + * to be used in gword_set_delete() called *only* in sentence_delete(). + * This ensures that the gword_set of connectors doesn't get stale when + * their disjuncts are deleted and later restored in one-step parse when + * min_null_count=0 and max_null count>0 (see classic_parse()). + */ +struct gword_set +{ + Gword *o_gword; + struct gword_set *next; + struct gword_set *chain_next; +}; + + +typedef enum +{ + MT_INVALID, /* Zero, to be changed to the correct type */ + MT_WORD, /* Regular word */ + MT_FEATURE, /* Pseudo morpheme, currently capitalization marks */ + MT_INFRASTRUCTURE, /* Start and end Wordgraph pseudo-words */ + MT_WALL, /* The LEFT-WALL and RIGHT-WALL pseudo-words */ + MT_EMPTY, /* Empty word FIXME: Remove it. */ + MT_UNKNOWN, /* Unknown word (FIXME? 
Unused) */ + /* Experimental for Semitic languages (yet unused) */ + MT_TEMPLATE, + MT_ROOT, + /* Experimental - for display purposes. + * MT_CONTR is now used in the tokenization step, see the comments there. */ + MT_CONTR, /* Contracted part of a contraction (e.g. y', 's) */ + MT_PUNC, /* Punctuation (yet unused) */ + /* We are not going to have >63 types up to here. */ + MT_STEM = 1<<6, /* Stem */ + MT_PREFIX = 1<<7, /* Prefix */ + MT_MIDDLE = 1<<8, /* Middle morpheme (yet unused) */ + MT_SUFFIX = 1<<9 /* Suffix */ +} Morpheme_type; +#define IS_REG_MORPHEME (MT_STEM|MT_PREFIX|MT_MIDDLE|MT_SUFFIX) + +/* Word status */ +/* - Tokenization */ +#define WS_UNKNOWN (1<<0) /* Unknown word */ +#define WS_REGEX (1<<1) /* Matches a regex */ +#define WS_SPELL (1<<2) /* Result of a spell guess */ +#define WS_RUNON (1<<3) /* Separated from words run-on */ +#define WS_HASALT (1<<4) /* Has alternatives (one or more)*/ +#define WS_UNSPLIT (1<<5) /* It's an alternative to itself as an unsplit word */ +#define WS_INDICT (1<<6) /* boolean_dictionary_lookup() is true */ +#define WS_FIRSTUPPER (1<<7) /* Subword is the lc version of its unsplit_word. + The original word can be restored if needed + through this unsplit_word. */ +/* - Post linkage stage. */ +#define WS_PL (1<<14) /* Post-Linkage, not belonging to tokenization */ + +#define WS_GUESS (WS_SPELL|WS_RUNON|WS_REGEX) + +/*Only TS_DONE is actually used. */ +typedef enum +{ + TS_INITIAL, + TS_LR_STRIP, + TS_AFFIX_SPLIT, + TS_REGEX, + TS_RUNON, + TS_SPELL, + TS_DONE /* Tokenization done */ +} Tokenizing_step; + +/* For the "guess" field of Gword_struct. */ +typedef enum +{ + GM_REGEX = '!', + GM_SPELL = '~', + GM_RUNON = '&', + GM_UNKNOWN = '?' +} Guess_mark; + +#define MAX_SPLITS 10 /* See split_counter below */ + +struct Gword_struct +{ + const char *subword; + const char *start; /* subword start position. */ + const char *end; /* subword end position. 
*/ + + Gword *unsplit_word; /* Upward-going co-tree */ + Gword **next; /* Right-going tree */ + Gword **prev; /* Left-going tree */ + Gword *chain_next; /* Next word in the chain of all words */ + + /* Disjuncts and connectors point back to their originating Gword(s). */ + gword_set gword_set_head; + + /* Used by sane_linkage_morphism() and remove_empty_words() for + * locating optional words that should be removed. */ + WordIdx sent_wordidx; /* Index in the 2D sentence word array. */ + + /* For debug and inspiration. */ + const char *label; /* Debug label - code locations of tokenization */ + size_t node_num; /* For differentiating words with identical subwords, + and for indicating the order in which word splits + have been done. Shown in the Wordgraph display and in + debug messages. Not used otherwise. Could have been + used for hier_position instead of pointers in order + to optimize its generation and comparison. */ + + /* Tokenizer state */ + Tokenizing_step tokenizing_step; + bool issued_unsplit; /* The word has been issued as an alternative to itself. + It will become an actual alternative to itself only + if it's not the sole alternative, in which case it + will be marked with WS_UNSPLIT. */ + size_t split_counter; /* Incremented on splits. A word cannot split more than + MAX_SPLITS times and a warning is issued then. */ + + unsigned int status; /* See WS_* */ + Morpheme_type morpheme_type; /* See MT_* */ + Gword *alternative_id; /* Alternative start - a unique identifier of + the alternative to which the word belongs. */ + const char *regex_name; /* Subword matches this regex. + FIXME? Extend for multiple regexes. */ + + /* Only used by wordgraph_flatten() */ + const Gword **hier_position; /* Unsplit_word/alternative_id pointer list, up + to the original sentence word. */ + size_t hier_depth; /* Number of pointer pairs in hier_position */ + + /* XXX Experimental. 
Only used after the linkage (by compute_chosen_words()) + * for an element in the linkage display wordgraph path that represents + * a block of null words that are morphemes of the same word. */ + Gword **null_subwords; /* Null subwords represented by this word */ +}; + +/* Wordgraph path word-positions, + * used in wordgraph_flatten() and sane_linkage_morphism(). + * FIXME Separate to two different structures. */ +struct Wordgraph_pathpos_s +{ + Gword *word; /* Position in the Wordgraph */ + /* Only for wordgraph_flatten(). */ + bool same_word; /* Still the same word - issue an empty word */ + bool next_ok; /* OK to proceed to the next Wordgraph word */ + bool used; /* Debug - the word has been issued */ + /* Only for sane_morphism(). */ + const Gword **path; /* Linkage candidate wordgraph path */ +}; + +#endif diff -Nru link-grammar-5.3.16/link-grammar/tokenize/wg-display.c link-grammar-5.5.0/link-grammar/tokenize/wg-display.c --- link-grammar-5.3.16/link-grammar/tokenize/wg-display.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/wg-display.c 2018-04-25 17:04:50.000000000 +0000 @@ -0,0 +1,648 @@ +/*************************************************************************/ +/* Copyright (c) 2014 Amir Plivatsky */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#include + +#ifdef USE_WORDGRAPH_DISPLAY +#include +#include +#include +#ifdef HAVE_FORK +#include /* fork() and execl() */ +#include /* waitpid() */ +#endif +#ifdef HAVE_PRCTL +#include /* prctl() */ +#endif +#include /* SIG* */ + +#include /* for append_string */ +#include /* for dyn_str functions and UNREACHABLE */ +#endif /* USE_WORDGRAPH_DISPLAY */ + +#include "api-structures.h" +#include "error.h" +#include "string-set.h" +#include "tok-structures.h" +#include "wordgraph.h" + +#ifdef __APPLE__ +#define POPEN_DOT +#endif /* __APPLE__ */ + +#if USE_WORDGRAPH_DISPLAY || defined(DEBUG) +GNUC_UNUSED const char *gword_morpheme(Sentence sent, const Gword *w) +{ + const char *mt; + char buff[64]; + + switch (w->morpheme_type) + { + case MT_INVALID: + mt = "MT_INVALID"; + break; + case MT_WORD: + mt = "MT_WORD"; + break; + case MT_FEATURE: + mt = "MT_FEATURE"; + break; + case MT_INFRASTRUCTURE: + mt = "MT_I-S"; + break; + case MT_WALL: + mt = "MT_WALL"; + break; + case MT_EMPTY: + mt = "MT_EMPTY"; + break; + case MT_UNKNOWN: + mt = "MT_UNKNOWN"; + break; + case MT_TEMPLATE: + mt = "MT_TEMPLATE"; + break; + case MT_ROOT: + mt = "MT_ROOT"; + break; + case MT_CONTR: + mt = "MT_CONTR"; + break; + case MT_PUNC: + mt = "MT_PUNC"; + break; + case MT_STEM: + mt = "MT_STEM"; + break; + case MT_PREFIX: + mt = "MT_PREFIX"; + break; + case MT_MIDDLE: + mt = "MT_MIDDLE"; + break; + case MT_SUFFIX: + mt = "MT_SUFFIX"; + break; + default: + /* No truncation is expected. 
*/ + snprintf(buff, sizeof(buff), "MT_%d", w->morpheme_type); + mt = string_set_add(buff, sent->string_set); + } + + return mt; +} +#endif /* USE_WORDGRAPH_DISPLAY || defined(DEBUG) */ + +#if USE_WORDGRAPH_DISPLAY +/* === Wordgraph graphical representation === */ + +static void wordgraph_legend(dyn_str *wgd, unsigned int mode) +{ + size_t i; + static char const *wst[] = { + "RE", "Matched a regex", + "SP", "Result of spell guess", + "RU", "Separated run-on word", + "HA", "Has an alternative", + "UNS", "Also unsplit_word", + "IN", "In the dict file", + "FI", "First char is uppercase" + }; + + append_string(wgd, + "subgraph cluster_legend {\n" + "label=Legend;\n" + "%s" + "legend [label=\"subword\\n(status-flags)\\nmorpheme-type\"];\n" + "legend [xlabel=\"ordinal-number\\ndebug-label\"];\n" + "%s" + "legend_width [width=4.5 height=0 shape=none label=<\n" + "\n" + "\n", + (mode & WGR_SUB) ? "subgraph cluster_unsplit_word {\n" + "label=\"ordinal-number unsplit-word\";\n" : "", + (mode & WGR_SUB) ? "}\n" : "" + + ); + for (i = 0; i < sizeof(wst)/sizeof(wst[0]); i += 2) + { + append_string(wgd, + "\n", + wst[i], wst[i+1]); + } + + append_string(wgd, + "
status-flags
%s%s
>];" + "}\n" + "subgraph cluster_legend_top_space {\n" + "style=invis legend_dummy [style=invis height=0 shape=box]\n" + "};\n" + ); +} + +/** + * Graph node name: Add "Sentence:" for the main node. + * Also escape " and \ with a \. + */ +static const char *wlabel(Sentence sent, const Gword *w) +{ + const char *s; + const char sentence_label[] = "Sentence:\\n"; + dyn_str *l = dyn_str_new(); + char c0[] = "\0\0"; + + assert((NULL != w) && (NULL != w->subword), "Word must exist"); + if ('\0' == *w->subword) + return string_set_add("(nothing)", sent->string_set); + + if (w == sent->wordgraph) dyn_strcat(l, sentence_label); + + for (s = w->subword; *s; s++) + { + switch (*s) + { + case '\"': + dyn_strcat(l, "\\\""); + break; + case '\\': + dyn_strcat(l, "\\"); + break; + default: + *c0 = *s; + dyn_strcat(l, c0); + } + } + + char *label_str = dyn_str_take(l); + s = string_set_add(label_str, sent->string_set); + free(label_str); + return s; +} + +/** + * Generate the wordgraph in dot(1) format, for debug. + */ +static dyn_str *wordgraph2dot(Sentence sent, unsigned int mode, const char *modestr) +{ + const Gword *w; + Gword **wp; + dyn_str *wgd = dyn_str_new(); /* the wordgraph in dot representation */ + char nn[2*sizeof(char *) + 2 + 2 + 1]; /* \"%p\" node name: "0x..."+NUL*/ + + /* This function is called only if we have a wordgraph, in which case + * chain_next is non-NULL. So stop static analyzers to complain that + * it can be possibly NULL. 
*/ + UNREACHABLE(NULL == sent->wordgraph->chain_next); + + append_string(wgd, "# Mode: %s\n", modestr); + dyn_strcat(wgd, "digraph G {\nsize =\"30,20\";\nrankdir=LR;\n"); + if ((mode & (WGR_SUB)) && !(mode & WGR_COMPACT)) + dyn_strcat(wgd, "newrank=true;\n"); + if (mode & WGR_LEGEND) wordgraph_legend(wgd, mode); + append_string(wgd, "\"%p\" [shape=box,style=filled,color=\".7 .3 1.0\"];\n", + sent->wordgraph); + + for (w = sent->wordgraph; w; w = w->chain_next) + { + bool show_node; + + if (!(mode & WGR_UNSPLIT) && (MT_INFRASTRUCTURE != w->morpheme_type)) + { + Gword *wu; + + show_node = false; + /* In this mode nodes that are only unsplit_word are not shown. */ + for (wu = sent->wordgraph; wu; wu = wu->chain_next) + { + if (NULL != wu->next) + { + for (wp = wu->next; *wp; wp++) + { + if (w == *wp) + { + show_node = true; + break; + } + } + } + } + + if (!show_node) continue; + } + + snprintf(nn, sizeof(nn), "\"%p\"", w); + + /* Subword node format: + * +------------------+ + * + + + * + w->subword + + * + (w->flags) + + * + w->morpheme_type + + * + + + * +------------------+ + * w->node_num } <- external node label + * w->label } + * + * The flags and morpheme type are printed symbolically. + * The node_num field is the ordinal number of word creation. + * The label shows the code positions that created the subword. + * The external node label may appear at other positions near the node. + * + * FIXME: Use HTML labels. + */ + + append_string(wgd, "%s [label=\"%s\\n(%s)\\n%s\"];\n", nn, + wlabel(sent, w), gword_status(sent, w), gword_morpheme(sent, w)); + + if (!(mode & WGR_DBGLABEL)) + { + append_string(wgd, "%s [xlabel=\"%zu", + nn, w->node_num); + } + else + { + append_string(wgd, "%s [xlabel=\"%zu\\n%s", + nn, w->node_num, w->label); + } + + /* For debugging this function: display also hex node names. 
*/ + if (mode & WGR_DOTDEBUG) + append_string(wgd, "\\n%p-%s", w, wlabel(sent, w)); + + dyn_strcat(wgd, "\"];\n"); + + if (NULL != w->next) + { + for (wp = w->next; *wp; wp++) + { + append_string(wgd, "%s->\"%p\" [label=next color=red];\n", + nn, *wp); + } + } + if (mode & WGR_PREV) + { + if (NULL != w->prev) + { + for (wp = w->prev; *wp; wp++) + { + append_string(wgd, "%s->\"%p\" [label=prev color=blue];\n", + nn, *wp); + } + } + } + if (mode & WGR_UNSPLIT) + { + if (!(mode & WGR_SUB) && (NULL != w->unsplit_word)) + { + append_string(wgd, "%s->\"%p\" [label=unsplit];\n", + nn, w->unsplit_word); + } + } + } + + if (mode & WGR_SUB) + { + const Gword *old_unsplit = NULL; + + for (w = sent->wordgraph; w; w = w->chain_next) + { + if (NULL != w->unsplit_word) + { + if (w->unsplit_word != old_unsplit) + { + if (NULL != old_unsplit) dyn_strcat(wgd, "}\n"); + append_string(wgd, "subgraph \"cluster-%p\" {", w->unsplit_word); + append_string(wgd, "label=\"%zu %s\"; \n", + w->unsplit_word->node_num, wlabel(sent, w->unsplit_word)); + + old_unsplit = w->unsplit_word; + } + snprintf(nn, sizeof(nn), "\"%p\"", w); + if (strstr(dyn_str_value(wgd), nn)) + append_string(wgd, "\"%p\"; ", w); + } + } + dyn_strcat(wgd, "}\n"); + } + else + { +#ifdef WGR_SHOW_TERMINATOR_AT_LHS /* not defined - not useful */ + const Gword *terminating_node = NULL; +#endif + + dyn_strcat(wgd, "{rank=same; "); + for (w = sent->wordgraph->chain_next; w; w = w->chain_next) + { + snprintf(nn, sizeof(nn), "\"%p\"", w); + if (IS_SENTENCE_WORD(sent, w) && + ((mode & WGR_UNSPLIT) || strstr(dyn_str_value(wgd), nn))) + { + append_string(wgd, "%s; ", nn); + } + +#ifdef WGR_SHOW_TERMINATOR_AT_LHS + if (NULL == w->next) terminating_node = w; +#endif + } + dyn_strcat(wgd, "}\n"); + +#ifdef WGR_SHOW_TERMINATOR_AT_LHS + if (terminating_node) + append_string(wgd, "{rank=sink; \"%p\"}\n", terminating_node); +#endif + } + + dyn_strcat(wgd, "\n}\n"); + + return wgd; +} + +#if defined(HAVE_FORK) && !defined(POPEN_DOT) +static 
pid_t pid; /* XXX not reentrant */ + +#ifndef HAVE_PRCTL +/** + * Cancel the wordgraph viewers, to be used if there is fork() but no prctl(). + */ +static void wordgraph_show_cancel(void) +{ + kill(pid, SIGTERM); +} +#endif /* HAVE_PRCTL */ +#endif /* HAVE_FORK */ + +#ifndef DOT_COMMAND +#define DOT_COMMAND "dot" +#endif + +#ifndef DOT_DRIVER +#define DOT_DRIVER "-Txlib" +#endif + +/* In case files are used, their names are fixed. So more than one thread + * (or program) cannot use the word-graph display at the same time. This + * can be corrected, even though there is not much point to do that + * (displaying the word-graph is for debug). */ +#define DOT_FILENAME "lg-wg.vg" + +#define POPEN_DOT_CMD DOT_COMMAND" "DOT_DRIVER +#ifndef POPEN_DOT_CMD_NATIVE +# ifdef _WIN32 +# ifndef IMAGE_VIEWER +# define IMAGE_VIEWER "rundll32 PhotoViewer,ImageView_Fullscreen" +# endif +# define WGJPG "%TEMP%\\lg-wg.jpg" +# define POPEN_DOT_CMD_NATIVE \ + DOT_COMMAND" -Tjpg>"WGJPG"&"IMAGE_VIEWER" "WGJPG"&del "WGJPG +# elif __APPLE__ +# ifndef IMAGE_VIEWER +# define IMAGE_VIEWER "open -W" +# endif +# define WGJPG "$TMPDIR/lg-wg.jpg" +# define POPEN_DOT_CMD_NATIVE \ + DOT_COMMAND" -Tjpg>"WGJPG";"IMAGE_VIEWER" "WGJPG";rm "WGJPG +# else +# define POPEN_DOT_CMD_NATIVE POPEN_DOT_CMD +# endif +#endif + +#if !defined HAVE_FORK || defined POPEN_DOT +#ifdef _MSC_VER +#define popen _popen +#define pclose _pclose +#endif +/** + * popen a command with the given input. + * If the system doesn't have fork(), popen() is used to launch "dot". + * This is an inferior implementation than the one below that uses + * fork(), in which the window remains open and is updated automatically + * when new sentences are entered. With popen(), the program blocks at + * pclose() and the user needs to close the window after each sentence. 
+ */ +static bool x_popen(const char *cmd, const char *wgds) +{ + FILE *const cmdf = popen(cmd, "w"); + bool rc = true; + + if (NULL == cmdf) + { + prt_error("Error: popen of '%s' failed: %s\n", cmd, strerror(errno)); + rc = false; + } + else + { + if (fprintf(cmdf, "%s", wgds) == -1) + { + prt_error("Error: print to display command: %s\n", strerror(errno)); + rc = false; + } + if (pclose(cmdf) == -1) + { + prt_error("Error: pclose of display command: %s\n", strerror(errno)); + rc = false; + } + } + + return rc; +} +#else +static bool x_forkexec(const char *const argv[], pid_t *vpid) +{ + /* Fork/exec a graph viewer, and leave it in the background until we exit. + * On exit, send SIGHUP. If prctl() is not available and the program + * crashes, then it is left to the user to exit the viewer. */ + if (0 < *vpid) + { + pid_t rpid = waitpid(*vpid, NULL, WNOHANG); + + if (0 == rpid) return true; /* viewer still active */ + if (-1 == rpid) + { + prt_error("Error: waitpid(%d): %s\n", *vpid, strerror(errno)); + *vpid = 0; + return false; + } + } + + *vpid = fork(); + switch (*vpid) + { + case -1: + prt_error("Error: fork(): %s\n", strerror(errno)); + return false; + case 0: +#ifdef HAVE_PRCTL + if (-1 == prctl(PR_SET_PDEATHSIG, SIGHUP)) + { + prt_error("Error: prctl: %s\n", strerror(errno)); + /* Non-fatal error - continue. */ + } +#endif + /* Not closing fd 0/1/2, to allow interaction with the program */ + execvp(argv[0], (char **)argv); + prt_error("Error: execvp of %s: %s\n", argv[0], strerror(errno)); + _exit(1); + default: +#ifndef HAVE_PRCTL + if (0 != atexit(wordgraph_show_cancel)) + { + prt_error("Warning: atexit(wordgraph_show_cancel) failed.\n"); + /* Non-fatal error - continue. */ + } +#endif + break; + } + + return true; +} +#endif /* !defined HAVE_FORK || defined POPEN_DOT */ + +#ifdef _WIN32 +#define TMPDIR (getenv("TEMP") ? getenv("TEMP") : ".") +#else +#define TMPDIR (getenv("TMPDIR") ? 
getenv("TMPDIR") : "/tmp") +#endif + +#define concatfn(fn, fn1, fn2) \ + (fn=alloca(strlen(fn1)+strlen(fn2)+2),\ + strcpy(fn, fn1), strcat(fn, "/"), strcat(fn, fn2)) + +static void wordgraph_unlink_xtmpfile(void) +{ + char *fn; + + if (!test_enabled("gvfile")) + { + concatfn(fn, TMPDIR, DOT_FILENAME); + if (unlink(fn) == -1) + prt_error("Warning: Cannot unlink %s: %s\n", fn, strerror(errno)); + } +} + +/** + * Display the word-graph in the indicated mode. + * This is for debug. It is not reentrant due to the static pid and the + * possibly created fixed filenames. + * When Using X11, a "dot -Txlib" program is launched on the graph + * description file. The xlib driver refreshes the graph when the file is + * changed, displaying additional sentences in the same window. The viewer + * program exits on program end (see the comments in the code). When + * compiled with MSVC or MINGW, the system PhotoViewer is used by default, + * unless !wg:x is used (for using X11 when available). + * + * The "dot" and the "PhotoViewer" programs must be in the PATH. + * + * FIXME? "dot" may get a SEGV due to memory corruptions in it (a known + * problem - exists even in 2.38). This can be worked-around by trying it + * again until it succeeds (but the window size, if changed by the user, + * will not be preserved). + * + * modestr: a graph display mode as defined in wordgraph.h (default "ldu"). 
+ */ +bool sentence_display_wordgraph(Sentence sent, const char *modestr) +{ + dyn_str *wgd; + char *gvf_name = NULL; + bool generate_gvfile = test_enabled("gvfile"); /* keep it for debug */ + char *wgds; + bool gvfile = false; + uint32_t mode = 0; + const char *mp; + bool rc = true; + + for (mp = modestr; '\0' != *mp && ',' != *mp; mp++) + { + if ((*mp >= 'a') && (*mp <= 'z')) mode |= 1<<(*mp-'a'); + } + if ((0 == mode) || (WGR_X11 == mode)) + mode |= WGR_LEGEND|WGR_DBGLABEL|WGR_UNSPLIT; + + wgd = wordgraph2dot(sent, mode, modestr); + wgds = dyn_str_take(wgd); + +#if defined(HAVE_FORK) && !defined(POPEN_DOT) + gvfile = true; +#endif + + if (gvfile || generate_gvfile) + { + FILE *gvf; + bool gvf_error = false; + static bool wordgraph_unlink_xtmpfile_needed = true; + + concatfn(gvf_name, TMPDIR, DOT_FILENAME); + gvf = fopen(gvf_name, "w"); + if (NULL == gvf) + { + prt_error("Error: %s(): open %s failed: %s\n", + __func__, gvf_name, strerror(errno)); + gvf_error = true; + } + else + { + if (fprintf(gvf, "%s", wgds) == -1) + { + gvf_error = true; + prt_error("Error: %s(): print to %s failed: %s\n", + __func__, gvf_name, strerror(errno)); + } + if (fclose(gvf) == EOF) + { + gvf_error = true; + prt_error("Error: %s(): close %s failed: %s\n", + __func__, gvf_name, strerror(errno)); + } + } + if (gvf_error && gvfile) /* we need it - cannot continue */ + { + rc = false; + goto finish; + } + + if (wordgraph_unlink_xtmpfile_needed) + { + /* The filename is fixed - removal needed only once. */ + wordgraph_unlink_xtmpfile_needed = false; + atexit(wordgraph_unlink_xtmpfile); + } + } + +#ifdef _WIN32 +#define EXITKEY "ALT-F4" +#elif __APPLE__ +#define EXITKEY "⌘-Q" +#endif + +#ifdef EXITKEY + prt_error("Press "EXITKEY" in the graphical display window to continue\n"); +#endif + +#if !defined HAVE_FORK || defined POPEN_DOT + rc = x_popen((mode & WGR_X11)? 
POPEN_DOT_CMD : POPEN_DOT_CMD_NATIVE, wgds); +#else + { + assert(NULL != gvf_name, "DOT filename not initialized (#define mess?)"); + const char *const args[] = { DOT_COMMAND, DOT_DRIVER, gvf_name, NULL }; + rc = x_forkexec(args, &pid); + } +#endif + +finish: + free(wgds); + return rc; +} +#else +bool sentence_display_wordgraph(Sentence sent, const char *modestr) +{ + prt_error("Error: Library not configured with wordgraph-display\n"); + return false; +} +#endif /* USE_WORDGRAPH_DISPLAY */ diff -Nru link-grammar-5.3.16/link-grammar/tokenize/wordgraph.c link-grammar-5.5.0/link-grammar/tokenize/wordgraph.c --- link-grammar-5.3.16/link-grammar/tokenize/wordgraph.c 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/wordgraph.c 2018-03-06 18:44:38.000000000 +0000 @@ -0,0 +1,522 @@ +/*************************************************************************/ +/* Copyright (c) 2014 Amir Plivatsky */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +#include + +#include "api-structures.h" +#include "error.h" +#include "string-set.h" +#include "tok-structures.h" +#include "tokenize.h" +#include "wordgraph.h" + +/* === Gword utilities === */ +/* Many more Gword utilities, that are used only in particular files, + * are defined in these files statically. 
*/ + +Gword *gword_new(Sentence sent, const char *s) +{ + Gword *gword = malloc(sizeof(*gword)); + + memset(gword, 0, sizeof(*gword)); + assert(NULL != s, "Null-string subword"); + gword->subword = string_set_add(s, sent->string_set); + + if (NULL != sent->last_word) sent->last_word->chain_next = gword; + sent->last_word = gword; + gword->node_num = sent->gword_node_num++; + + gword->gword_set_head = (gword_set){0}; + gword->gword_set_head.o_gword = gword; + + return gword; +} + +/* FIXME: Remove it. */ +Gword *empty_word(void) +{ + /* + static Gword e = { + .subword = EMPTY_WORD_MARK, + .unsplit_word = &e, + .morpheme_type = MT_EMPTY, + .alternative_id = &e, + .status = WS_INDICT, + }; + */ + return NULL; +} + +static Gword **gwordlist_resize(Gword **arr, size_t len) +{ + arr = realloc(arr, (len+2) * sizeof(Gword *)); + arr[len+1] = NULL; + return arr; +} + +size_t gwordlist_len(const Gword **arr) +{ + size_t len = 0; + if (arr) + while (arr[len] != NULL) len++; + return len; +} + +void gwordlist_append(Gword ***arrp, Gword *p) +{ + size_t n = gwordlist_len((const Gword **)*arrp); + + *arrp = gwordlist_resize(*arrp, n); + (*arrp)[n] = p; +} + +#if 0 +/** + * Append a Gword list to a given Gword list (w/o duplicates). + */ +void gwordlist_append_list(const Gword ***to_word, const Gword **from_word) +{ + size_t to_word_arr_len = gwordlist_len(*to_word); + + for (const Gword **f = from_word; NULL != *f; f++) + { + size_t l; + + /* Note: Must use indexing because to_word may get realloc'ed. */ + for (l = 0; l < to_word_arr_len; l++) + if (*f == (*to_word)[l]) break; /* Filter duplicates. */ + + if (l == to_word_arr_len) + gwordlist_append((Gword ***)to_word, (Gword *)*f); + } +} + +/** + * Replace "count" words from the position "start" by word "wnew". 
+ */ +static void wordlist_replace(Gword ***arrp, size_t start, size_t count, + const Gword *wnew) +{ + size_t n = gwordlist_len((const Gword **)(*arrp+start+count)); + + memmove(*arrp+start+1, *arrp+start+count, (n+1) * sizeof(Gword *)); + (*arrp)[start] = (Gword *)wnew; +} +#endif + +size_t wordgraph_pathpos_len(Wordgraph_pathpos *wp) +{ + size_t len = 0; + if (wp) + while (wp[len].word != NULL) len++; + return len; +} + +/** + * `len` is the new length, not counting the terminating null entry. + */ +/* FIXME (efficiency): Initially allocate more than 2 elements */ +Wordgraph_pathpos *wordgraph_pathpos_resize(Wordgraph_pathpos *wp, + size_t len) +{ + wp = realloc(wp, (len+1) * sizeof(*wp)); + wp[len].word = NULL; + return wp; +} + +/** + * Insert the gword into the path queue in reverse order of its hier_depth. + * + * The deepest wordgraph alternatives must be scanned first. + * Otherwise, this sentence causes a flattening mess: + * "T" this is a flattening test + * (The mess depends on both "T" and "T matching EMOTICON, and any + * 5 words after "T".) + * + * Parameters: + * same_word: mark that the same word is queued again. + * For validation code only (until the wordgraph version is mature): + * used: mark that the word has already been issued into the 2D-array. + * diff_alternative: validate we don't queue words from the same alternative. + */ +bool wordgraph_pathpos_add(Wordgraph_pathpos **wp, Gword *p, bool used, + bool same_word, bool diff_alternative) +{ + size_t n = wordgraph_pathpos_len(*wp); + Wordgraph_pathpos *wpt; + size_t insert_here = n; + + assert(NULL != p); + +#ifdef DEBUG + if (verbosity_level(+9)) print_hier_position(p); +#endif + + if (NULL != *wp) + { + for (wpt = *wp; NULL != wpt->word; wpt++) + { + if (p == wpt->word) + return false; /* already in the pathpos queue - nothing to do */ + + /* Insert in reverse order of hier_depth. 
*/ + if ((n == insert_here) && (p->hier_depth >= wpt->word->hier_depth)) + insert_here = wpt - *wp; + + /* Validate that there are no words in the pathpos queue from the same + * alternative. This can be commented out when the wordgraph code is + * mature. FIXME */ + if (diff_alternative) + { + assert(same_word||wpt->same_word||!in_same_alternative(p,wpt->word), + "wordgraph_pathpos_add(): " + "Word%zu '%s' is from same alternative of word%zu '%s'", + p->node_num, p->subword, + wpt->word->node_num, wpt->word->subword); + } + } + } + + *wp = wordgraph_pathpos_resize(*wp, n+1); + + if (insert_here < n) + { + /* n+1 because n is the length of the array, not including the + * terminating null entry. We need to protect the terminating null. + */ + memmove(&(*wp)[insert_here+1], &(*wp)[insert_here], + (n+1 - insert_here) * sizeof (*wpt)); + } + + (*wp)[insert_here].word = p; + (*wp)[insert_here].same_word = same_word; + (*wp)[insert_here].used = used; + (*wp)[insert_here].next_ok = false; + + return true; +} + +/** + * Print linkage wordgraph path. + */ +void print_lwg_path(Gword **w, const char *title) +{ + lgdebug(+0, "%s: ", title); + for (; *w; w++) lgdebug(0, "%s ", (*w)->subword); + lgdebug(0, "\n"); +} + +#ifdef DEBUG +GNUC_UNUSED static const char *debug_show_subword(const Gword *w) +{ + return w->unsplit_word ? w->subword : "S"; +} + +GNUC_UNUSED void print_hier_position(const Gword *word) +{ + const Gword **p; + + err_msg(lg_Debug, "[Word %zu:%s hier_position(hier_depth=%zu): ", + word->node_num, word->subword, word->hier_depth); + assert(2*word->hier_depth==gwordlist_len(word->hier_position), "word '%s'", + word->subword); + + for (p = word->hier_position; NULL != *p; p += 2) + { + err_msg(lg_Debug, "(%zu:%s/%zu:%s)", + p[0]->node_num, debug_show_subword(p[0]), + p[1]->node_num, debug_show_subword(p[1])); + } + err_msg(lg_Debug, "]\n"); +} + +/* Debug printout of a wordgraph Gword list. 
*/ +GNUC_UNUSED void gword_set_print(const gword_set *gs) +{ + printf("Gword list: "); + + if (NULL == gs) + { + printf("(null)\n"); + return; + } + + for (; NULL != gs; gs = gs->next) + { + printf("word %p '%s' unsplit '%s'%s", gs->o_gword, (gs->o_gword)->subword, + (gs->o_gword)->unsplit_word->subword, NULL==gs->next ? "" : ", "); + } + printf("\n"); + +} +#endif + +/** + * Given a word, find its alternative ID. + * An alternative is identified by a pointer to its first word, which is + * getting set at the time the alternative is created at + * issue_word_alternative(). (It could be any unique identifier - for coding + * convenience it is a pointer.) + * + * Return the alternative_id of this alternative. + */ +static Gword *find_alternative(Gword *word) +{ + assert(NULL != word, "find_alternative(NULL)"); + assert(NULL != word->alternative_id, "find_alternative(%s): NULL id", + word->subword); + +#if 0 + lgdebug(+0, "find_alternative(%s): '%s'\n", + word->subword, debug_show_subword(word->alternative_id)); +#endif + + return word->alternative_id; +} + +/** + * Generate an hierarchy-position vector for the given word. + * It consists of list of (unsplit_word, alternative_id) pairs, leading + * to the word, starting from a sentence word. It is NULL terminated. + * Original sentence words don't have any such pair. + */ +const Gword **wordgraph_hier_position(Gword *word) +{ + const Gword **hier_position; /* NULL terminated */ + size_t i = 0; + Gword *w; + bool is_leaf = true; /* the word is in the bottom of the hierarchy */ + + if (NULL != word->hier_position) return word->hier_position; /* from cache */ + + /* + * Compute the length of the hier_position vector. + */ + for (w = find_real_unsplit_word(word, true); NULL != w; w = w->unsplit_word) + i++; + if (0 == i) i = 1; /* Handle the dummy start/end words, just in case. */ + /* Original sentence words (i==1) have zero (i-1) elements. Each deeper + * unsplit word has an additional element. 
Each element takes 2 word pointers + * (first one the unsplit word, second one indicating the alternative in + * which it is found). The last +1 is for a terminating NULL. */ + word->hier_depth = i - 1; + i = (2 * word->hier_depth)+1; + hier_position = malloc(i * sizeof(*hier_position)); + + /* Stuff the hierarchical position in a reverse order. */ + hier_position[--i] = NULL; + w = word; + while (0 != i) + { + hier_position[--i] = find_alternative(w); + w = find_real_unsplit_word(w, is_leaf); + hier_position[--i] = w; + is_leaf = false; + } + + word->hier_position = hier_position; /* cache it */ + return hier_position; +} + +/** + * Find if 2 words are in the same alternative of their common ancestor + * unsplit_word. + * "Same alternative" means at the direct alternative or any level below it. + * A + * | + * +-B C D + * | + * +-E F + * | + * +-G H + * | + * +-I J + * J and E (but not J and B) are in the same alternative of their common + * ancestor unsplit_word A. + * J and G are not in the same alternative (common ancestor unsplit_word F). + * + * Return true if they are, false otherwise. + */ +bool in_same_alternative(Gword *w1, Gword *w2) +{ + const Gword **hp1 = wordgraph_hier_position(w1); + const Gword **hp2 = wordgraph_hier_position(w2); + size_t i; + +#if 0 /* DEBUG */ + print_hier_position(w1); print_hier_position(w2); +#endif + +#if 0 /* BUG */ + /* The following is wrong! Comparison to the hier_position of the + * termination word is actually needed when there are alternatives of + * different lengths at the end of a sentence. This check then prevents + * the generation of empty words on the shorter alternative. */ + if ((NULL == w1->next) || (NULL == w2->next)) return false;/* termination */ +#endif + + for (i = 0; (NULL != hp1[i]) && (NULL != hp2[i]); i++) + { + if (hp1[i] != hp2[i]) break; + } + + /* In the even positions we have an unsplit_word. + * In the odd positions we have an alternative_id. 
+ * + * If we are here when i is even, it means the preceding alternative_id was + * the same in the two words - so they belong to the same alternative. If + * i is 0, it means these are sentence words, and sentence words are all in + * the same alternative (including the dummy termination word). + * If the hierarchy-position vectors are equal, i is also even, and words + * with equal hierarchy-position vectors are in the same alternative. + * + * If we are here when i is odd, it means the alternative_id at i is not + * the same in the given words, but their preceding unsplit_words are the + * same - so they clearly not in the same alternative. + */ + if (0 == i%2) return true; + + return false; +} + +/** + * Get the real unsplit word of the given word. + * While the Wordgraph is getting constructed, when a subword has itself as one + * of its own alternatives, it appears in the wordgraph only once, still + * pointing to its original unsplit_word. It appears once in order not to + * complicate the graph, and the unsplit_word is not changed in order not loss + * information (all of these are implementation decisions). However, for the + * hierarchy position of the word (when it is a word to be issued, i.e. a leaf + * node) the real unsplit word is needed, which is the word itself. It is fine + * since such a word cannot get split further. + */ +Gword *find_real_unsplit_word(Gword *word, bool is_leaf) +{ + /* For the terminating word, return something unique. */ + if (NULL == word->unsplit_word) + return word; + + if (is_leaf && (word->status & WS_UNSPLIT)) + return word; + + return word->unsplit_word; +} + +/** + * Find the sentence word of which the given word has part. + * This is done by going upward in the wordgraph along the unsplit_word + * path until finding a word that is an original sentence word. 
+ */ +Gword *wg_get_sentence_word(const Sentence sent, Gword *word) +{ + if (MT_INFRASTRUCTURE == word->morpheme_type) return NULL; + + while (!IS_SENTENCE_WORD(sent, word)) + { + word = word->unsplit_word; + assert(NULL != word, "wg_get_sentence_word(): NULL unsplit word"); + } + + assert(NULL != word->subword, "wg_get_sentence_word(): NULL subword"); + return word; +} + +/* FIXME The following debug functions can be generated by a script running + * from a Makefile and taking the values from structures.h, instead of hard + * coding the strings as done here. */ + +/** + * Create a short form of flags summary for displaying in a word node. + */ +const char *gword_status(Sentence sent, const Gword *w) +{ + dyn_str *s = dyn_str_new(); + const char *r; + size_t len; + + if (w->status & WS_UNKNOWN) + dyn_strcat(s, "UNK|"); + if (w->status & WS_INDICT) + dyn_strcat(s, "IN|"); + if (w->status & WS_REGEX) + dyn_strcat(s, "RE|"); + if (w->status & WS_SPELL) + dyn_strcat(s, "SP|"); + if (w->status & WS_RUNON) + dyn_strcat(s, "RU|"); + if (w->status & WS_HASALT) + dyn_strcat(s, "HA|"); + if (w->status & WS_UNSPLIT) + dyn_strcat(s, "UNS|"); + if (w->status & WS_PL) + dyn_strcat(s, "PL|"); + + + char *status_str = dyn_str_take(s); + len = strlen(status_str); + if (len > 0) status_str[len-1] = '\0'; /* ditch the last '|' */ + r = string_set_add(status_str, sent->string_set); + free(status_str); + return r; +} + +static void word_queue_delete(Sentence sent) +{ + word_queue_t *wq = sent->word_queue; + while (NULL != wq) + { + word_queue_t *wq_tofree = wq; + wq = wq->next; + free(wq_tofree); + } + sent->word_queue = NULL; + sent->word_queue_last = NULL; +} + +/** + * Delete the gword_set associated with the Wordgraph. + * @w First Wordgraph word. 
+ */ +static void gword_set_delete(Gword *w) +{ + if (NULL == w) return; + for (w = w->chain_next; NULL != w; w = w->chain_next) + { + gword_set *n; + for (gword_set *f = w->gword_set_head.chain_next; NULL != f; f = n) + { + n = f->chain_next; + free(f); + } + } +} + +void wordgraph_delete(Sentence sent) +{ + word_queue_delete(sent); + + Gword *w = sent->wordgraph; + gword_set_delete(w); + + while (NULL != w) + { + Gword *w_tofree = w; + + free(w->prev); + free(w->next); + free(w->hier_position); + free(w->null_subwords); + w = w->chain_next; + free(w_tofree); + } + sent->last_word = NULL; + sent->wordgraph = NULL; +} diff -Nru link-grammar-5.3.16/link-grammar/tokenize/wordgraph.h link-grammar-5.5.0/link-grammar/tokenize/wordgraph.h --- link-grammar-5.3.16/link-grammar/tokenize/wordgraph.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/wordgraph.h 2018-04-25 17:04:50.000000000 +0000 @@ -0,0 +1,45 @@ +#ifndef _WORDGRAPH_H +#define _WORDGRAPH_H + +#include "api-structures.h" + +#ifdef USE_WORDGRAPH_DISPLAY +/* Wordgraph display representation modes. */ +#define lo(l) (l-'a') +#define WGR_SUB (1<unsplit_word == sent->wordgraph) + +Gword *gword_new(Sentence, const char *); +Gword *empty_word(void); /* FIXME: Remove it. 
*/ +size_t gwordlist_len(const Gword **); +void gwordlist_append(Gword ***, Gword *); +void gword_set_print(const gword_set *); +void print_lwg_path(Gword **, const char *); +Gword *wg_get_sentence_word(const Sentence, Gword *); +#if 0 +void gwordlist_append_list(const Gword ***, const Gword **); +#endif + +const Gword **wordgraph_hier_position(Gword *); +void print_hier_position(const Gword *); +bool in_same_alternative(Gword *, Gword *); +Gword *find_real_unsplit_word(Gword *, bool); + +size_t wordgraph_pathpos_len(Wordgraph_pathpos *); +Wordgraph_pathpos *wordgraph_pathpos_resize(Wordgraph_pathpos *, size_t); +bool wordgraph_pathpos_add(Wordgraph_pathpos **, Gword *, bool, bool, bool); + +const char *gword_status(Sentence, const Gword *); +const char *gword_morpheme(Sentence sent, const Gword *w); +#endif /* _WORDGRAPH_H */ diff -Nru link-grammar-5.3.16/link-grammar/tokenize/word-structures.h link-grammar-5.5.0/link-grammar/tokenize/word-structures.h --- link-grammar-5.3.16/link-grammar/tokenize/word-structures.h 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize/word-structures.h 2017-05-31 07:51:33.000000000 +0000 @@ -0,0 +1,43 @@ +/*************************************************************************/ +/* Copyright (c) 2004 */ +/* Daniel Sleator, David Temperley, and John Lafferty */ +/* Copyright (c) 2013 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. 
*/ +/* */ +/*************************************************************************/ + +#ifndef _WORD_STRUCTURE_H_ +#define _WORD_STRUCTURE_H_ + +#include "api-types.h" + +typedef struct X_node_struct X_node; +/** + * Word, as represented shortly after tokenization, but before parsing. + * + * X_node* x: + * Contains a pointer to a list of expressions from the dictionary, + * Computed by build_sentence_expressions(). + * + * Disjunct* d: + * Contains a pointer to a list of disjuncts for this word. + * Computed by: prepare_to_parse(), but modified by pruning and power + * pruning. + */ +struct Word_struct +{ + const char *unsplit_word; + + X_node * x; /* Sentence starts out with these, */ + Disjunct * d; /* eventually these get generated. */ + bool optional; /* Linkage is optional. */ + + const char **alternatives; +}; + +#endif diff -Nru link-grammar-5.3.16/link-grammar/tokenize.c link-grammar-5.5.0/link-grammar/tokenize.c --- link-grammar-5.3.16/link-grammar/tokenize.c 2017-02-17 21:27:03.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,3058 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2009, 2012-2014 Linas Vepstas */ -/* Copyright (c) 2014 Amir Plivatsky */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _WIN32 -#include -#endif -#include - -#include "anysplit.h" -#include "build-disjuncts.h" -#include "dict-api.h" -#include "dict-common.h" -#include "error.h" -#include "externs.h" -#include "print.h" -#include "print-util.h" -#include "regex-morph.h" -#include "spellcheck.h" -#include "string-set.h" -#include "structures.h" -#include "tokenize.h" -#include "utilities.h" -#include "wordgraph.h" -#include "word-utils.h" - -#define MAX_STRIP 10 -#define SYNTHETIC_SENTENCE_MARK '>' /* A marking of a synthetic sentence. */ -#define D_SW 6 /* debug level for word splits */ -#define D_UN 6 /* debug level for units/punct */ - -/* These are no longer in use, but are read from the 4.0.affix file */ -/* I've left these here, as an example of what to expect. */ -/*static char * strip_left[] = {"(", "$", "``", NULL}; */ -/*static char * strip_right[] = {")", "%", ",", ".", ":", ";", "?", "!", "''", "'", "'s", NULL};*/ -/* Single-quotes are used for abbreviations, don't mess with them */ -/*//const char * qs = "\"\'«»《》【】『』‘’`„“"; */ -/*const char* qs = "\"«»《》【】『』`„“"; */ - -#define ENTITY_MARKER "" -#define COMMON_ENTITY_MARKER "" - -/** - * is_common_entity - Return true if word is a common noun or adjective - * Common nouns and adjectives are typically used in corporate entity - * names -- e.g. "Sun State Bank" -- "sun", "state" and "bank" are all - * common nouns. 
- */ -static bool is_common_entity(Dictionary dict, const char * str) -{ - if (word_contains(dict, str, COMMON_ENTITY_MARKER) == 1) - return true; - return false; -} - -static bool is_entity(Dictionary dict, const char * str) -{ - const char * regex_name; - if (word_contains(dict, str, ENTITY_MARKER) == 1) - return true; - regex_name = match_regex(dict->regex_root, str); - if (NULL == regex_name) return false; - return word_contains(dict, regex_name, ENTITY_MARKER); -} - -#if defined HAVE_HUNSPELL || defined HAVE_ASPELL -/** - * Return true if word is a proper name. - * XXX This is a cheap hack that works only in English, and is - * broken for German! We need to replace this with something - * language-specific. - * - * Basically, if word starts with upper-case latter, we assume - * its a proper name, and that's that. - */ -static bool is_proper_name(const char * word, locale_t dict_locale) -{ - return is_utf8_upper(word, dict_locale); -} - -/** - * Returns true if the word contains digits. - */ -static bool contains_digits(const char * s, locale_t dict_locale) -{ - mbstate_t mbs; - int nb = 1; - wchar_t c; - - memset(&mbs, 0, sizeof(mbs)); - while ((*s != 0) && (0 < nb)) - { - nb = mbrtowc(&c, s, MB_CUR_MAX, &mbs); - if (nb < 0) return false; - if (iswdigit_l(c, dict_locale)) return true; - s += nb; - } - return false; -} - -#if 0 -/** - * Return true if an alternative has been issued for the given word. - * If there is an alternative, the previous word points to it. - * Its unsplit_word is the given word. - * - * Return true if an alternative is found, else false. - * XXX need to check correctness. - * XXX It seems this function is not needed any more. Remove if so. 
- */ -static bool word_has_alternative(const Gword *word) -{ - const Gword **n; - - for (n = word->prev[0]->next; *n; n++) - { - if ((*n)->unsplit_word == word) return true; - } - return false; -} -#endif -#endif /* defined HAVE_HUNSPELL || defined HAVE_ASPELL */ - -/** - * Find if a string is equal to a character in afdict class afdict_classnum. - * The parameter afdict_classnum is one of the following affix classes: - * AFDICT_BULLETS - anything that can be construed to be a bullet. - * AFDICT_QUOTES - anything that can be construed to be a quotation mark. - * - * Return TRUE if the character is in afdict_classnum. - */ - -/** - * Search in s the first character from utf-8 string xc. - */ -static char *utf8_str1chr(const char *s, const char *xc) -{ - /* FIXME use strndupa() */ - int len = utf8_charlen(xc); - char *xc1 = alloca(len+1); - strncpy(xc1, xc, len); - xc1[len] = '\0'; - - return strstr(s, xc1); -} - -static bool in_afdict_class(Dictionary dict, afdict_classnum cn, const char *s) -{ - if (0 == AFCLASS(dict->affix_table, cn)->length) return false; - const char *classchars = AFCLASS(dict->affix_table, cn)->string[0]; - - return NULL != utf8_str1chr(classchars, s); -} - -/** - * Return TRUE if the character is white-space - */ -static bool is_space(wchar_t wc, locale_t dict_locale) -{ - if (iswspace_l(wc, dict_locale)) return true; - - /* 0xc2 0xa0 is U+00A0, c2 a0, NO-BREAK SPACE */ - /* For some reason, iswspace doesn't get this */ - if (0xa0 == wc) return true; - - /* iswspace seems to use somewhat different rules than what we want, - * so over-ride special cases in the U+2000 to U+206F range. - * Caution: this potentially screws with arabic, and right-to-left - * languages. - */ -/*** later, not now .. - if (0x2000 <= wc && wc <= 0x200f) return true; - if (0x2028 <= wc && wc <= 0x202f) return true; - if (0x205f <= wc && wc <= 0x206f) return true; -***/ - - return false; -} - -#if 0 -/** - * Returns true if the word can be interpreted as a number. 
- * The ":" is included here so we allow "10:30" to be a number. - * The "." and "," allow numbers in both US and European notation: - * e.g. American million: 1,000,000.00 Euro million: 1.000.000,00 - * We also allow U+00A0 "no-break space" - */ -static bool is_number(Dictionary dict, const char * s) -{ - mbstate_t mbs; - int nb = 1; - wchar_t c; - if (!is_utf8_digit(s, Dictionary dict)) return false; - - memset(&mbs, 0, sizeof(mbs)); - while ((*s != 0) && (0 < nb)) - { - nb = mbrtowc(&c, s, MB_CUR_MAX, &mbs); - if (iswdigit_l(dict, c)) { s += nb; } - - /* U+00A0 no break space */ - else if (0xa0 == c) { s += nb; } - - else if ((*s == '.') || (*s == ',') || (*s == ':')) { s++; } - else return false; - } - return true; -} -#endif - -static void gwordqueue_add(const Sentence sent, Gword *const word) -{ - struct word_queue *wq_element = malloc(sizeof(*wq_element)); - - assert((NULL == sent->word_queue) == (NULL == sent->word_queue_last)); - - if (NULL == sent->word_queue) - sent->word_queue = wq_element; - else - sent->word_queue_last->next = wq_element; - - wq_element->word = word; - wq_element->next = NULL; - sent->word_queue_last = wq_element; - -} - -/** - * Update the label of the word (for debug). - * The word label marks which part(s) of the program issued the word. - * This label accumulated if several parts touch the same word. - */ -static void word_label(Sentence sent, Gword *w, const char *op, - const char *label) -{ - const size_t s = (NULL == w->label) ? 
0 : strlen(w->label); - char *new_label = alloca(s + strlen(label) + 1 + 2 + 1); /* len+op+()+NUL */ - - if (0 != s) - strcpy(new_label, w->label); - else - new_label[0] = '\0'; - - if (NULL == op) - strcat(new_label, "("); - else if ('\0' != new_label[0]) - strcat(new_label, op); - strcat(new_label, label); - if (NULL == op) strcat(new_label, ")"); - w->label = string_set_add(new_label, sent->string_set); -} - -#define D_WSAA 9 -/** - * Disallow unsplit_word alternatives with the same subword string path. - * (I.e. we are talking about preventing Wordgraph paths consisting of the same - * word strings in the same order, not consisting of same graph nodes.) - * - * The first subword of the candidate alternative is checked against the first - * subword of all the existing alternatives of the unsplit_word (this allows the - * first alternative unconditionally). - * - * Parameters: - * - unsplit_word: the unsplit_word that shouldn't have redundant paths. - * - altword0: the first word of the candidate alternative. - * Return true if the alternative is redundant, false if it is fine. - * - * Assumptions and heuristics: - * - * - This function is invoked only for subwords that we try to split in every - * possible combination, i.e. LR-split. It is not invoked for morpheme - * splits, because then an identical first subword mat be valid due to - * a different way of performing the splits. - * - * - If the first subword of the checked candidate alternative is the same as an - * already existing alternative, it means the candidate alternative is - * redundant. This is because we are going to generate all the subword - * combinations for the other subwords, a thing that would generate an - * identical Wordgraph sub-path otherwise. So the alternative is disallowed. - * - * - The previous step may allow an unknown first subword to not split further - * to a known word plus punctuation, a thing that will leave an unwanted - * unknown word in the Wordgraph. 
To prevent this we also check if an existing - * first subword is a prefix of the first (unknown) subword of the candidate - * alternative. If it is, it means that a future split would be prevented by - * the previous step. - * - * Examples (the words and alternatives are shown in () for clarity): - * - * 1. An input word ('50s,) splits to (' 50s,). An additional split of this - * input word to ('50s ,) would be prevented if '50s was not a known word, as - * (') is a prefix of ('50s). The split of ('50s) to (' 50s) is prevented by - * this function because its first subword matches the first subword of an - * existing alternative (' 50s,). - * - * 2. The input word ("who") gets split to (" who"). Its split to ("who ") is - * then prevented, as (") is a prefix of ("who) which is unknown. - * - * XXX I don't have a mathematical proof of correctness, it just happens to work - * on the example sentences. [ap] - * - * FIXME XXX What if a non-first subword is unknown and cannot split further? - * For example, for ('50s,) we get an alternative (' 50s ,) in which (50s) - * cannot split further because another alternative also starts with (50), but - * (50s) is an unknown word (that doesn't usually cause a trouble because - * tokens in that alternative don't have a linkage). It will be a good idea to - * find out exactly how it happens and a way to avoid that. A fix "by force" - * may be to explicitly mark unknown-words in separate_word() (as originally - * designed) and in flatten_wordgraph() ignore alternatives that have - * so-unmarked unknown words. - */ -static bool word_start_another_alternative(Dictionary dict, - Gword *unsplit_word, - const char *altword0) -{ - Gword **n; - - lgdebug(+D_WSAA, "\n"); /* Terminate a previous partial trace message. 
*/ - lgdebug(+D_WSAA, "Checking %s in alternatives of %zu:%s (prev %zu:%s)\n", - altword0, unsplit_word->node_num, unsplit_word->subword, - unsplit_word->prev[0]->node_num, unsplit_word->prev[0]->subword); - - for (n = unsplit_word->prev[0]->next; NULL != *n; n++) - { - lgdebug(D_WSAA, "Comparing alt %s\n\\", (*n)->subword); - if ((0 == strcmp((*n)->subword, altword0) || - ((0 == strncmp((*n)->subword, altword0, strlen((*n)->subword))) && - !find_word_in_dict(dict, altword0)))) - { - lgdebug(+D_UN, "Preventing alt starts with %s due to existing %s\n", - altword0, (*n)->subword); - return true; - } - } - return false; -} -#undef D_WSAA - -/** - * Find if a suffix is of a contraction. - * XXX This is appropriate for English and maybe for some other languages, and - * may need a generalization. - * FIXME? Try to work-around the current need of this functions. - */ -static char const *contraction_char[] = { "'", "’" }; - -#if 0 -static bool is_contraction_suffix(const char *s) -{ - size_t len = strlen(s); - - for (size_t i = 0; i < ARRAY_SIZE(contraction_char); i++) - { - size_t cclen = strlen(contraction_char[i]); - if (len < cclen) continue; - if (0 == strncmp(s+len-cclen, contraction_char[i], cclen)) return true; - } - - return false; -} - -static bool is_contraction_prefix(const char *s) -{ - for (size_t i = 0; i < ARRAY_SIZE(contraction_char); i++) - { - size_t cclen = strlen(contraction_char[i]); - if (0 == strncmp(s, contraction_char[i], cclen)) return true; - } - return false; -} -#endif - -static bool is_contraction_word(Dictionary dict, const char *s) -{ - if (dict->affix_table && dict->affix_table->anysplit) - return false; - - for (size_t i = 0; i < ARRAY_SIZE(contraction_char); i++) - { - if (NULL != strstr(s, contraction_char[i])) return true; - } - return false; -} - -/** - * Issue candidate subwords for unsplit_word (an "alternative"). - * Issue prefnum elements from prefix, stemnum elements from stem, and suffnum - * elements from suffix. 
Mark the prefixes and sufixes with INFIX_MARK (the - * stems are assumed to be already marked with one of the STEMSUBSCR - * possibilities. Set the Morpeheme_type of the subwords. - * - * Return a pointer to the first word of the added alternative. - * - * TODO Support also middle morphemes if needed. - */ -#define D_IWA 6 -Gword *issue_word_alternative(Sentence sent, Gword *unsplit_word, - const char *label, - int prefnum, const char * const *prefix, - int stemnum, const char * const *stem, - int suffnum, const char * const *suffix) -{ - int ai = 0; /* affix index */ - const char * const *affix; /* affix list pointer */ - const char * const * const affixlist[] = { prefix, stem, suffix, NULL }; - const int numlist[] = { prefnum, stemnum, suffnum }; - enum affixtype { PREFIX, STEM, SUFFIX, END }; - enum affixtype at; - const char infix_mark = INFIX_MARK(sent->dict->affix_table); - Gword *subword; /* subword of the current token */ - Gword *psubword = NULL; /* subword of the previous token */ - const size_t token_tot = prefnum + stemnum + suffnum; /* number of tokens */ - size_t token_ord = 0; /* ordinal number of the current token */ - Morpheme_type morpheme_type; - Gword *alternative_id = NULL; /* to be set to the start subword */ - bool subword_eq_unsplit_word; - size_t maxword = 0; - bool last_split = false; /* this is a final token */ -#ifdef DEBUG - Gword *sole_alternative_of_itself = NULL; -#endif - - if (unsplit_word->split_counter > MAX_SPLITS) - { - prt_error("Error: Word %s reached %d splits. " - "It will not get split further. The result is undefined.\n" - "Run with !verbosity="STRINGIFY(D_SW)" to debug\n", - unsplit_word->subword, MAX_SPLITS); - unsplit_word->tokenizing_step = TS_DONE; - /* We cannot return NULL here, because it is unexpected by the caller, - * which expects the alternative_id of the split. Hence a dummy word is - * returned. Since there was no actual split, the results for this - * sentence are undefined. 
*/ - return gword_new(sent, "[MAX_SPLITS]"); - } - /* The incremented split_counter will be assigned to the created subwords. */ - - lgdebug(+D_IWA, "(%s) Gword %zu:%s split (split_counter=%zu) into", label, - unsplit_word->node_num, unsplit_word->subword, - unsplit_word->split_counter); - - /* Allocate memory which is enough for the longest token. */ - for (at = PREFIX; at < END; at++) - { - int affixnum = numlist[at]; - char morpheme_sym[] = "pts"; - - for (affix = affixlist[at]; affixnum-- > 0; affix++) - { - maxword = MAX(maxword, strlen(*affix)); - lgdebug(D_IWA, " %c:%s", morpheme_sym[at], - ('\0' == (*affix)[0]) ? "[null]" : *affix); - } - } - - char * const buff = alloca(maxword + 2); /* strlen + INFIX_MARK + NUL */ - const char *token; - - for (at = PREFIX; at < END; at++) - { - int affixnum = numlist[at]; - - for (affix = affixlist[at]; affixnum-- > 0; affix++, ai++) - { - token_ord++; - token = *affix; /* avoid copying if possible */ - switch (at) - { - /* Mark the token with INFIX_MARK if needed. */ - case PREFIX: /* set to word= */ - if ('\0' != infix_mark) - { - size_t sz = strlen(*affix); - memcpy(buff, *affix, sz); - buff[sz] = infix_mark; - buff[sz+1] = '\0'; - last_split = true; - token = buff; - } - if (is_contraction_word(sent->dict, unsplit_word->subword)) - morpheme_type = MT_CONTR; - else - morpheme_type = MT_PREFIX; - break; - case STEM: /* already word, word.=, word.=x */ - /* Stems are already marked with a stem subscript, if needed. - * The possible marks are set in the affix class STEMSUBSCR. */ - if (is_stem(token)) - { - morpheme_type = MT_STEM; - last_split = true; - } - else - { - morpheme_type = MT_WORD; - } - break; - case SUFFIX: /* set to =word */ - /* XXX If the suffix starts with an apostrophe, don't mark it. - * Actually - any non-alpha is checked. The random-splitting - * "languages" always need the suffix marking. 
*/ - if (((NULL == sent->dict->affix_table->anysplit) && - ('\0' != (*affix)[0]) && - !is_utf8_alpha(*affix, sent->dict->lctype)) || - '\0' == infix_mark) - { - if (is_contraction_word(sent->dict, unsplit_word->subword)) - morpheme_type = MT_CONTR; - else - morpheme_type = MT_WORD; - break; - } - last_split = true; - buff[0] = infix_mark; - strcpy(&buff[1], *affix); - morpheme_type = MT_SUFFIX; - token = buff; - break; - case END: - assert(true, "affixtype END reached"); - } - - /* FIXME Use another method instead of checking the label. */ - if (1 == token_ord && 1 < token_tot && label[0] == 'r' && - word_start_another_alternative(sent->dict, unsplit_word, token)) - { - /* When called due to left/right strip, the code shouldn't use the - * returned value due to the possibility of this returned NULL. */ - return NULL; - } - - subword_eq_unsplit_word= (0 == strcmp(unsplit_word->subword, token)); - - if ((1 == token_tot) && subword_eq_unsplit_word) - { - /* Prevent adding a subword as a sole alternative to itself. */ - Gword **q; - - unsplit_word->issued_unsplit = true; - - /* - * If WS_HASALT is unset, then this is the first alternative. - */ - if (!(unsplit_word->status & WS_HASALT)) - { - /* The unsplit_word itself got issued here as the first - * alternative of itself. In order that it will not become the - * sole alternative of itself, just return. In - * remqueue_gword(), issue_word_alternative() is invoked - * again if needed - see the next comment. */ - word_label(sent, unsplit_word, "+", label); - word_label(sent, unsplit_word, NULL, "IU"); - lgdebug(D_IWA, " (issued_unsplit)\n"); - /* Note: The original morpheme_type is preserved. - * The morpheme_type value set above is just ignored. */ - return unsplit_word; - } - - if (unsplit_word->status & WS_UNSPLIT) - { - /* If we are here, there is tokenization logic error in the - * program, as the word has been issued as an alternative of - * itself an additional time. 
If we proceed it would mess the - * Wordgraph pointers. Just warn (if verbosity>0) and return. - * The return value is not likely to be used in such a case, - * since this is an issuing of a single word. - * - * Note: In case a new tokenization logic permits adding a - * word more than once, just remove this warning. */ - if (0 < verbosity) - { - prt_error("Warning: Internal error: " - "word \"%s\" got issued more than once\n", - unsplit_word->subword); - } - return NULL; - } - - /* We arrive when a word is issued as an alternative of itself and - * it already has at least one another alternative. This may happen - * when the word is issued as a second and on alternative, or when - * we are invoked from remqueue_gword() if it finds that - * unsplit_word->issued_unsplit is true and there are - * alternatives. Due to the alternatives, the unsplit_word is not - * connected to the word flow. We reconnect it here to its former - * prev/next words so it will serve as an alternative too. */ - - /* Scan its "prev" words and add it as their "next" word */ - for (q = unsplit_word->prev; *q; q++) - gwordlist_append(&(*q)->next, unsplit_word); - /* Scan its "next" words and add it as their "prev" word */ - for (q = unsplit_word->next; *q; q++) - gwordlist_append(&(*q)->prev, unsplit_word); - word_label(sent, unsplit_word, "+", label); - word_label(sent, unsplit_word, NULL, "R"); - unsplit_word->status |= WS_UNSPLIT; -#ifdef DEBUG - sole_alternative_of_itself = unsplit_word; -#endif - lgdebug(D_IWA, " (reconnected)"); - } - else - { - /* Add the token as a subword of this alternative */ - subword = gword_new(sent, token); - subword->unsplit_word = unsplit_word; - subword->split_counter = unsplit_word->split_counter + 1; - subword->morpheme_type = morpheme_type; - - if (last_split) - { -#if 0 - /* XXX the new Turkish experimental dictionary depend on - * specifying compound suffixes which are not in the dict file, - * in the SUF affix class. This allows them to split farther. 
- * However, here is a need to detail all the supported - * combinations of compound suffixes. - * FIXME: There is a need for a real multi affix splitter. - * (last_split will get optimized out by the compiler.) */ - - /* This is a stem, or an affix which is marked by INFIX_MARK. - * Hence it must be a dict word - regex/spell are not done - * for stems/affixes. Also, it cannot split further. - * Save resources by marking it accordingly. */ - subword->status |= WS_INDICT; - subword->tokenizing_step = TS_DONE; -#endif - } - word_label(sent, subword, "+", label); - - /* If the subword is equal to the unsplit_word (may happen when the - * word is issued together with "virtual" morphemes) we should not - * queue it for further processing, in order to prevent an infinite - * loop. */ - if (!subword_eq_unsplit_word) - gwordqueue_add(sent, subword); - - /* The spelling properties are inherited over morpheme split */ - if (unsplit_word->status & (WS_SPELL|WS_RUNON)) - subword->status |= unsplit_word->status & (WS_SPELL|WS_RUNON); - - if (1 == token_ord) /* first subword of this alternative */ - { - /* Arrange for subword to be the "next" word of the previous - * words of unsplit_word. There are 2 cases: - * - If this is not the first alternative - add the subword to - * their "next" links. - * - If this is the first alternative - replace the "next" link - * pointing to unsplit_word with a link to subword, - * disconnecting unsplit_word from its RHS. */ - Gword **p; - - alternative_id = subword; - - //previous_wordgraph_nextalts(sent, unsplit_word, subword); - /* Scan the said previous words. 
*/ - for (p = unsplit_word->prev; NULL != *p; p++) - { - Gword **n; - - /* Create the "prev" link for subword */ - gwordlist_append(&subword->prev, *p); - - if (unsplit_word->status & WS_HASALT) - { - gwordlist_append(&(*p)->next, subword); - } - else - { - /* Scan the said "next" links */ - for(n = (*p)->next; NULL != *n; n++) - { - if (*n == unsplit_word) - { - /* Now finally replace the "next" link */ - *n = subword; - break; - } - } - assert(NULL != *n, "Adding subword '%s': " - "No corresponding next link for a prev link: " - "prevword='%s' word='%s'", - subword->subword, (*p)->subword, unsplit_word->subword); - } - } - } - - if (token_tot == token_ord) /* last subword of this alternative */ - { - - /* Arrange for subword to be the "prev" word of the next words of - * unsplit_word. There are 2 cases: - * - If this is not the first alternative - add the subword to - * their "prev" links. - * - If this is the first alternative - replace the "prev" link - * pointing to unsplit_word with a link to subword, - * disconnecting unsplit_word from its LHS. - */ - Gword **n; - - //next_wordgraph_prevalts(sent, unsplit_word, subword); - /* Scan the said next words. 
*/ - for (n = unsplit_word->next; NULL != *n; n++) - { - Gword **p; - - /* Create the "next" link for subword */ - gwordlist_append(&subword->next, *n); - - if (unsplit_word->status & WS_HASALT) - { - gwordlist_append(&(*n)->prev, subword); - } - else - { - /* Scan the said "prev" links */ - for(p = (*n)->prev; NULL != *p; p++) - { - if (*p == unsplit_word) - { - /* Now finally replace the "prev" link */ - *p = subword; - break; - } - } - assert(NULL!=*p, - "Adding subword '%s': " - "No corresponding prev link for a next link" - "nextword='%s' word='%s'", - subword->subword, (*n)->subword, unsplit_word->subword); - } - - } - } - - if (1 < token_ord) /* not the first subword */ - { - gwordlist_append(&psubword->next, subword); - gwordlist_append(&subword->prev, psubword); - } - - subword->alternative_id = alternative_id; - psubword = subword; - } - } - } - - unsplit_word->status |= WS_HASALT; - lgdebug(D_IWA, "\n"); - -#ifdef DEBUG - /* Check if the alternative that has just been added already exists. - * If it exists - just warn. */ - { - Gword **prev = unsplit_word->prev; - Gword *curr_alt = sole_alternative_of_itself ? 
- sole_alternative_of_itself : alternative_id; - Gword **alts; - - assert(curr_alt, "'%s': No alt mark", unsplit_word->subword); - assert(prev, "'%s': No prev", unsplit_word->subword); - assert(prev[0], "'%s': No prev[0]", unsplit_word->subword); - assert(prev[0]->next, "%s': No next",prev[0]->subword); - assert(prev[0]->next[0], "'%s': No next[0]",prev[0]->subword); - for (alts = prev[0]->next; *alts; alts++) - { - Gword *calt = curr_alt; /* check alternative */ - Gword *oalt; /* old alternatives */ - size_t token_no = token_tot; - - if (*alts == curr_alt) break; - for (oalt = *alts; token_no > 0; oalt = oalt->next[0]) - { - if (0 != (strcmp(oalt->subword, calt->subword))) - break; - calt = calt->next[0]; - token_no--; - } - if (token_tot) continue; - prt_error("Error: >>>DEBUG>>>: '%s' " - "(alternative start '%s', len=%zu): " - "Alternative already exists!\n", - curr_alt->subword, unsplit_word->subword, token_tot); - } - } -#endif - return alternative_id; -} -#undef D_IWA - -#define D_RWW 6 -static void remqueue_gword(const Sentence sent) -{ - struct word_queue *const wq = sent->word_queue; - Gword *w = wq->word; - - assert(NULL!=wq, "Trying to remove a word from an empty word queue"); - - lgdebug(+D_RWW, "Word '%s'%s%s\n", w->subword, - w->issued_unsplit ? " issued_unsplit" : "", - w->status & WS_HASALT ? " WS_HASALT" : ""); - - /* If the word should have an alternative which includes itself, add it as an - * additional alternative (unless it has already been added, as indicated by - * WS_UNSPLIT). - * See the comments in issue_word_alternative() where remqueue_gword is - * mentioned. */ - if (w->issued_unsplit && (w->status & WS_HASALT) && !(w->status & WS_UNSPLIT)) - { - issue_word_alternative(sent, w, "RQ" ,0,NULL, 1,&w->subword, 0,NULL); - } - -#if WORDGRAPH_PARSER /* not defined */ - /* If the parsers are modified to work directly on the Wordgraph. */ - build_expressions(wq->word); -#endif - - /* Finally, remove the word from the queue. 
*/ - sent->word_queue = wq->next; - free(wq); -} -#undef D_RWW - -static Gword *wordgraph_getqueue_word(Sentence sent) -{ - Gword *w; - - if (NULL == sent->word_queue) return NULL; - w = sent->word_queue->word;; - - return w; -} - -/** - * Prevent a further tokenization of all the subwords in the given alternative. - * To be used if the alternative represents a final tokenization. - */ -static void tokenization_done(Dictionary dict, Gword *altp) -{ - - Gword *alternative_id = altp->alternative_id; - - for (; altp->alternative_id == alternative_id; altp = altp->next[0]) - { - if (NULL == altp) break; /* just in case this is a dummy word */ - - /* Mark only words that are in the dict file. - * Other words need further processing. */ - if (boolean_dictionary_lookup(dict, altp->subword)) - { - altp->status |= WS_INDICT; - altp->tokenizing_step = TS_DONE; - } - } -} - -/* - Here's a summary of how subscripts are handled: - - Reading the dictionary: - - If the last "." in a string is followed by a non-digit character, - then the "." and everything after it is considered to be the subscript - of the word. - - The dictionary reader does not allow you to have two words that - match according to the criterion below. (so you can't have - "dog.n" and "dog") - - Quote marks are used to allow you to define words in the dictionary - which would otherwise be considered part of the dictionary, as in - - ";": {@Xca-} & Xx- & (W+ or Qd+) & {Xx+}; - "%" : (ND- & {DD-} & & - ( or B*x+)) or (ND- & (OD- or AN+)); - - Rules for chopping words from the input sentence: - - First the prefix chars are stripped off of the word. These - characters are "(" and "$" (and now "``") - - Now, repeat the following as long as necessary: - - Look up the word in the dictionary. - If it's there, the process terminates. - - If it's not there and it ends in one of the right strippable - strings (see "strip_right") then remove the strippable string - and make it into a separate word. 
- - If there is no strippable string, then the process terminates. - - Rule for defining subscripts in input words: - - The subscript rule is followed just as when reading the dictionary. - - When does a word in the sentence match a word in the dictionary? - - Matching is done as follows: Two words with subscripts must match - exactly. If neither has a subscript they must match exactly. If one - does and one doesn't then they must match when the subscript is - removed. Notice that this is symmetric. - - So, under this system, the dictionary could have the words "Ill" and - also the word "Ill." It could also have the word "i.e.", which could be - used in a sentence. -*/ - -#ifdef DEBUG -/** - * Split special synthetic words, for Wordgraph handling debug. - * Word syntax (recursively): LABEL(WORD+WORD+...|WORD+...) - * Notations in the word syntax: - * +: a separator between words of the same alternative. - * |: a separator between alternatives. - * LABEL: (optional) mark the graph node by a name (for convenience). - * (): refers to the unsplit word, in order to generate it as an - * alternative to itself. E.g. (A|()) generates A as one alternative and the - * whole unsplit word as the other one. - * Example sentence: K Ax(BC((mD2+e+F)+(G+h)|(v+w)) C(3|J)) L (()|X+Y) - * If no split is needed, word syntax errors are silently ignored. - * Null-string subwords are not allowed, e.g.: A(|B) C(+) D(E|) - */ -static bool synthetic_split(Sentence sent, Gword *unsplit_word) -{ - const char *const w = unsplit_word->subword; - const char *c = w; - const char *s = w; - int plevel = 0; - const char **alts = NULL; - bool can_split = false; - const size_t len = strlen(c); - char *alt = alloca(len+1); -#define SYNTHSPLIT_ERROR(e) ("Error: synthetic_split(): word '%s':" e "\n") - - /* Synthetic sentences are marked by a special initial character. 
*/ - if (SYNTHETIC_SENTENCE_MARK != sent->orig_sentence[0]) return false; - - assert(0 != len, "synthetic_split(): empty-string word"); - if (')' != w[len-1]) return false; /* no split needed (syntax not checked) */ - - do - { - switch (*c) - { - case '(': - if (0 == plevel) s = c + 1; - plevel++; - break; - case ')': - case '+': - case '|': - if (1 == plevel) - { - if (c == s) - { - prt_error(SYNTHSPLIT_ERROR("(empty subword)."), w); - goto error; - } - strncpy(alt, s, c-s); - alt[c-s] = '\0'; - if (0 == strcmp(alt, "()")) - { - /* The word is an alternative to itself. It is not going to - * loop due to a special handling in issue_word_alternative(). - */ - strcpy(alt, w); - } - altappend(sent, &alts, alt); - s = c + 1; - - if ('|' == *c) - { - if (alts) - issue_word_alternative(sent, unsplit_word, "SS", 0,NULL, - altlen(alts),alts, 0,NULL); - can_split = true; - free(alts); - alts = NULL; - } - } - if (')' == *c) plevel--; - break; - default: - if (!(((*c >= 'a') && (*c <= 'z')) || - ((*c >= 'A') && (*c <= 'Z')) || - ((*c >= '0') && (*c <= '9')) || - ('_' == *c))) - { - prt_error(SYNTHSPLIT_ERROR("('%c' not alphanumeric)."), w, *c); - goto error; - } - } - if (0 > plevel) - { - prt_error(SYNTHSPLIT_ERROR("extra ')'"), w); - goto error; - } - - } while ('\0' != *++c); - - if (0 < plevel) - { - prt_error(SYNTHSPLIT_ERROR("missing '('."), w); - goto error; - } - - if (alts) - { - issue_word_alternative(sent, unsplit_word, "SS", 0,NULL, - altlen(alts),alts, 0,NULL); - can_split = true; - } - -error: - free(alts); - return can_split; -} -#endif - -/** - * Add the given prefix, word and suffix as an alternative. - * If STEMSUBSCR is define in the affix file, use its values as possible - * subscripts for the word. In that case, if the word cannot be found in - * the dict with any of the given stem suffixes, the alternative is not - * valid and thus not added. 
- * - * If unsplit_word is null, this function actually only checks whether - * the alternative is valid as described above. This is used for checking - * is a spell guess result if valid if the word itself is not in the dict. - * FIXME: If a word can split it doesn't follow it is a "real" dictionary - * word, as there can still be no links between some of its parts. - * - * Return true if the alternative is valid, else false. - */ -static bool add_alternative_with_subscr(Sentence sent, - Gword * unsplit_word, - const char * prefix, - const char * word, - const char * suffix) -{ - Dictionary dict = sent->dict; - Afdict_class * stemsubscr_list = - AFCLASS(dict->affix_table, AFDICT_STEMSUBSCR); - const char ** stemsubscr = stemsubscr_list->string; - size_t stemsubscr_count = stemsubscr_list->length; - bool word_is_in_dict = false; - bool issue_alternatives = (NULL != unsplit_word); - - if (0 == stemsubscr_count) - { - if (issue_alternatives) - { - word_is_in_dict = true; - issue_word_alternative(sent, unsplit_word, "AWS", - (prefix ? 1 : 0),&prefix, 1,&word, - (suffix ? 1 : 0),&suffix); - } - else - { - /* This is a compound-word spell check. Reject unknown words. - * XXX: What if the word is capitalized? */ - word_is_in_dict = boolean_dictionary_lookup(dict, word); - } - } - else - { - size_t si; - size_t wlen = strlen(word); - size_t slen = 0; - char *w; - - for (si = 0; si < stemsubscr_count; si++) - { - slen = MAX(slen, strlen(stemsubscr[si])); - } - w = alloca(wlen + slen + 1); - strcpy(w, word); - - for (si = 0; si < stemsubscr_count; si++) - { - strcpy(&w[wlen], stemsubscr[si]); - - /* We should not match regexes to stems. */ - if (boolean_dictionary_lookup(dict, w)) - { - word_is_in_dict = true; - if (issue_alternatives) - { - issue_word_alternative(sent, unsplit_word, "AWS", - (prefix ? 1 : 0),&prefix, 1,(const char **)&w, 1,&suffix); - } - } - } - } - - lgdebug(+D_SW,"Stem subscript not found: p:%s t:%s s:%s\n", - prefix ? prefix : "(none)", word, suffix ? 
suffix : "(none)"); - return word_is_in_dict; -} - -/** - * Split word into prefix, stem and suffix. - * It can also split contracted words (like he's). - * Alternatives are generated if issue_alternatives=true. - * Return value: - * - * The prefix code is only lightly validated by actual use. - * - * If unsplit_word is null, this function actually only checks whether - * the word can split. This is used for checking if a spell guess result is - * valid if the word itself is not in the dict. See also - * add_alternative_with_subscr(). - */ -static bool suffix_split(Sentence sent, Gword *unsplit_word, const char *w) -{ - int i, j; - Afdict_class *prefix_list, *suffix_list; - int p_strippable, s_strippable; - const char **prefix, **suffix; - const char *no_suffix = NULL; - bool word_can_split = false; - const Dictionary dict = sent->dict; - const char *wend = w + strlen(w); - char *newword = alloca(wend-w+1); - - /* Set up affix tables. */ - if (NULL == dict->affix_table) return false; - prefix_list = AFCLASS(dict->affix_table, AFDICT_PRE); - p_strippable = prefix_list->length; - prefix = prefix_list->string; - suffix_list = AFCLASS(dict->affix_table, AFDICT_SUF); - s_strippable = suffix_list->length; - suffix = suffix_list->string; - - if (INT_MAX == s_strippable) return false; - - /* Go through once for each suffix; then go through one - * final time for the no-suffix case (i.e. to look for - * prefixes only, without suffixes). */ - for (i = 0; i <= s_strippable; i++, suffix++) - { - bool did_split = false; - size_t suflen = 0; - if (i < s_strippable) - { - suflen = strlen(*suffix); - /* The remaining w is too short for a possible match. - * In addition, don't allow empty stems. */ - if ((wend-suflen) < (w+1)) continue; - - /* A lang like Russian allows empty suffixes, which have a real - * morphological linkage. In the following check, the empty suffix - * always matches. 
*/ - if (0 == strncmp(wend-suflen, *suffix, suflen)) - { - size_t sz = (wend-w)-suflen; - strncpy(newword, w, sz); - newword[sz] = '\0'; - - /* Check if the remainder is in the dictionary. - * In case we try to split a contracted word, the first word - * may match a regex. Hence find_word_in_dict() is used and - * not boolean_dictionary_lookup(). - * Note: Not like a previous version, stems cannot match a regex - * here, and stem capitalization need to be handled elsewhere. */ - if ((is_contraction_word(dict, w) && - find_word_in_dict(dict, newword)) || - boolean_dictionary_lookup(dict, newword)) - { - did_split = true; - word_can_split |= - add_alternative_with_subscr(sent, unsplit_word, - NULL, newword, *suffix); - } - } - } - else - { - suflen = 0; - suffix = &no_suffix; - } - - /* - * Try stripping off prefixes. Avoid double-counting and - * other trouble by doing this only if we split off a suffix, - * or if there is no suffix. - */ - if (did_split || 0==suflen) - { - for (j = 0; j < p_strippable; j++) - { - size_t prelen = strlen(prefix[j]); - /* The remaining w is too short for a possible match. - * NOTE: A zero length "stem" is not allowed here. In any - * case, it cannot be handled (yet) by the rest of the code. */ - if ((wend-w) - suflen <= prelen) continue; - if (strncmp(w, prefix[j], prelen) == 0) - { - size_t sz = MIN((wend-w) - suflen - prelen, MAX_WORD); - - strncpy(newword, w+prelen, sz); - newword[sz] = '\0'; - /* ??? Do we need a regex match? */ - if (boolean_dictionary_lookup(dict, newword)) - { - word_can_split |= - add_alternative_with_subscr(sent, unsplit_word, prefix[j], - newword, *suffix); - } - } - } - } - } - - return word_can_split; -} - -#if defined HAVE_HUNSPELL || defined HAVE_ASPELL -/** - * Set the status of all the words in a given alternative. - * Currently used to mark words that are a result of a spelling. 
- */ -static void set_alt_word_status(Dictionary dict, Gword *altp, - unsigned int status) -{ - Gword *alternative_id = altp->alternative_id; - - for (; (NULL != altp) && (altp->alternative_id == alternative_id); - altp = altp->next[0]) - { - if (NULL == altp) break; /* just in case this is a dummy word */ - - /* WS_INDICT is to be used if we are bypassing separate_word(). */ - if ((altp->status & WS_INDICT) && - !boolean_dictionary_lookup(dict, altp->subword)) - { - status &= ~WS_INDICT; - } - - altp->status |= status; - - /* Is this needed? */ - if (MT_INFRASTRUCTURE == altp->unsplit_word->morpheme_type) break; - } -} -#endif /* HAVE_HUNSPELL */ - - -#define HEB_PRENUM_MAX 5 /* no more than 5 prefix "subwords" */ -#define HEB_UTF8_BYTES 2 /* Hebrew UTF8 characters are always 2-byte */ -#define HEB_CHAREQ(s, c) (strncmp(s, c, HEB_UTF8_BYTES) == 0) -/** - * Handle "formative letters" ב, ה, ו, כ, ל, מ, ש. - * Split word into multiple prefix "subwords" (1-3 characters each) - * and an unprefixed word (which must be in the dictionary or be null) - * in all possible ways (even when the prefix combination is not valid, - * the LG rules will resolve that). - * If the whole word (i.e. including the prefixes) is in the dictionary, - * the word will be added in separate_word(). - * Add all the alternatives. - * The assumptions used prevent a large number of false splits. - * They may be relaxed later. - * - * XXX Because the grammatical rules of which prefixes are valid for the - * remaining word are not checked, non-existing words may get split. In such a - * case there is no opportunity for a regex or spell check of this unknown word. - * FIXME Before issuing an alternative, validate that the combination is - * supported by the dict. - * - * Note: This function currently does more than absolutely needed for LG, - * in order to simplify the initial Hebrew dictionary. - * It may be latter replaced by a simpler version. 
- * - * These algorithm is most probably very Hebrew-specific. - * These assumptions are used: - * - the prefix consists of subwords - * - longer subwords have priority over shorter ones - * - subwords in a prefix are unique ('ככ' is considered here as one "subword") - * - input words with length <= 2 don't have a prefix - * - each character uses 2 bytes (can be fixed) - * - the input word contains only Hebrew characters - * - the letter "ו" (vav) can only be the first prefix subword - * - if the last prefix subword is not "ו" and the word (length>2) starts - * with 2 "ו", the actual word to be looked up starts with one "ו" - * (see also TBD there) - * - a prefix can be stand-alone (an input word that consists of prefixes) - * - * To implement this function in a way which is appropriate for more languages, - * Hunspell-like definitions (but more general) are needed. - */ -static bool mprefix_split(Sentence sent, Gword *unsplit_word, const char *word) -{ - int i; - Afdict_class *mprefix_list; - int mp_strippable; - const char **mprefix; - const char *newword; - const char *w; - int sz = 0; - bool word_is_in_dict = false; - int split_prefix_i = 0; /* split prefix index */ - const char *split_prefix[HEB_PRENUM_MAX]; /* the whole prefix */ - bool *pseen; /* prefix "subword" seen (not allowed again) */ - int pfound; /* index of longer prefix found at a prefix level */ - Dictionary dict = sent->dict; - int wordlen; - int wlen; - int plen = 0; - Gword *altp; - bool split_check = (NULL == unsplit_word); - - /* Set up affix table */ - if (NULL == dict->affix_table) return false; - mprefix_list = AFCLASS(dict->affix_table, AFDICT_MPRE); - mp_strippable = mprefix_list->length; - if (0 == mp_strippable) return false; - /* The mprefix list is revered-sorted according to prefix length. - * The code here depends on that. */ - mprefix = mprefix_list->string; - - pseen = alloca(mp_strippable * sizeof(*pseen)); - /* Assuming zeroed-out bytes are interpreted as false. 
*/ - memset(pseen, 0, mp_strippable * sizeof(*pseen)); - - w = word; - wordlen = strlen(word); /* guaranteed < MAX_WORD by separate_word() */ - do - { - pfound = -1; - - for (i=0; i 0) && - HEB_CHAREQ(mprefix[i], "ו") && (HEB_CHAREQ(w, "ו"))) - { - continue; - } - - plen = strlen(mprefix[i]); - wlen = strlen(w); - sz = wlen - plen; - if (strncmp(w, mprefix[i], plen) == 0) - { - if (-1 == pfound) pfound = i; - newword = w + plen; - /* Check for non-vav before vav */ - if (!HEB_CHAREQ(mprefix[i], "ו") && (HEB_CHAREQ(newword, "ו"))) - { - /* Non-vav before a single-vav - not in a prefix */ - if (!HEB_CHAREQ(newword+HEB_UTF8_BYTES, "ו")) - continue; - - /* Non-vav before 2-vav */ - if (newword[HEB_UTF8_BYTES+1]) - newword += HEB_UTF8_BYTES; /* strip one 'ו' */ - /* TBD: check word also without stripping. */ - } - pseen[i] = true; - split_prefix[split_prefix_i] = mprefix[i]; - if (0 == sz) /* stand-alone prefix */ - { - word_is_in_dict = true; - /* Add the prefix alone */ - lgdebug(+D_UN, "Whole-word prefix: %s\n", word); - if (split_check) return true; - altp = issue_word_alternative(sent, unsplit_word, "MPW", - split_prefix_i+1,split_prefix, 0,NULL, 0,NULL); - tokenization_done(dict, altp); - /* If the prefix is a valid word, - * It has been added in separate_word() as a word */ - break; - } - if (find_word_in_dict(dict, newword)) - { - word_is_in_dict = true; - lgdebug(+D_UN, "Splitting off a prefix: %.*s-%s\n", - wordlen-sz, word, newword); - if (split_check) return true; - altp = issue_word_alternative(sent, unsplit_word, "MPS", - split_prefix_i+1,split_prefix, 1,&newword, 0,NULL); - tokenization_done(dict, altp); - } - } - } - if ((-1 != pfound) && (i != pfound)) - { - /* A previous prefix is the longer one - use it */ - split_prefix[split_prefix_i] = mprefix[pfound]; - plen = strlen(mprefix[pfound]); - w += plen; - } - else - { - w = newword; - } - split_prefix_i++; - /* "wlen + sz < wordlen" is true if a vav has been stripped */ - } while ((sz > 0) && (-1 != 
pfound) && (split_prefix_i < HEB_PRENUM_MAX)); - - return word_is_in_dict; -} - -/* Return true if the word might be capitalized by convention: - * -- if its the first word of a sentence - * -- if its the first word following a colon, a period, a question mark, - * or any bullet (For example: VII. Ancient Rome) - * -- if its the first word of a quote - * - * XXX FIXME: These rules are rather English-centric. Someone should - * do something about this someday. - */ -static bool is_capitalizable(const Dictionary dict, const Gword *word) -{ - /* Words at the start of sentences are capitalizable */ - if (MT_WALL == word->prev[0]->morpheme_type) return true; - if (MT_INFRASTRUCTURE == word->prev[0]->morpheme_type) return true; - - /* Words following colons are capitalizable. */ - /* Mid-text periods and question marks are sentence-splitters. */ - if (strcmp(":", word->prev[0]->subword) == 0 || - strcmp(".", word->prev[0]->subword) == 0 || - strcmp("?", word->prev[0]->subword) == 0 || - strcmp("!", word->prev[0]->subword) == 0 || - strcmp("?", word->prev[0]->subword) == 0 || - strcmp("!", word->prev[0]->subword) == 0 ) - return true; - if (in_afdict_class(dict, AFDICT_BULLETS, word->prev[0]->subword)) - return true; - if (in_afdict_class(dict, AFDICT_QUOTES, word->prev[0]->subword)) - return true; - - return false; -} - -#define D_MS 6 -/* - * Split the given word "word" to morphemes. - * If unsplit_word is not NULL then issue alternatives. - * Else only check the word can split (to validate a spell guess). 
- */ -static bool morpheme_split(Sentence sent, Gword *unsplit_word, const char *word) -{ - bool word_can_split; - - if (0 < AFCLASS(sent->dict->affix_table, AFDICT_MPRE)->length) - { - word_can_split = mprefix_split(sent, unsplit_word, word); - lgdebug(+D_MS, "Tried mprefix_split word=%s, can_split=%d\n", - word, word_can_split); - } - else - { - word_can_split = suffix_split(sent, unsplit_word, word); - lgdebug(+D_MS, "Tried to split word=%s, can_split=%d\n", - word, word_can_split); - - /* XXX WS_FIRSTUPPER marking is missing here! */ - if ((NULL != unsplit_word) && is_utf8_upper(word, sent->dict->lctype) && - is_capitalizable(sent->dict, unsplit_word) && - !(unsplit_word->status & (WS_SPELL|WS_RUNON))) - { - int downcase_size = strlen(word)+MB_LEN_MAX+1; - char *const downcase = alloca(downcase_size); - - downcase_utf8_str(downcase, word, downcase_size, sent->dict->lctype); - word_can_split |= - suffix_split(sent, unsplit_word, downcase); - lgdebug(+D_MS, "Tried to split lc=%s, now can_split=%d\n", - downcase, word_can_split); - } - } - - return word_can_split; -} - -#if defined HAVE_HUNSPELL || defined HAVE_ASPELL -static bool is_known_word(Sentence sent, const char *word) -{ - return (boolean_dictionary_lookup(sent->dict, word) || - morpheme_split(sent, NULL, word)); -} - -/** - * Try to spell guess an unknown word, and issue the results as alternatives. - * There are two kind of guesses: - * - Separating run-on words into an exact combination of words, usually 2. - * - Find similar words. These are limited to use_spell_guess alternatives. - * - * Return true if corrections have been issued, else false. - * - * Note: spellcheck_suggest(), which is invoked by this function, returns - * guesses for words containing numbers (including words consisting of digits - * only). Hence this function should not be called for such words. - * - * Note that a lowercase word can be spell-corrected to an uppercase word. - * FIXME? 
Should we allow that only if the lc version of the corrected word - * is the same? - */ -static bool guess_misspelled_word(Sentence sent, Gword *unsplit_word, - Parse_Options opts) -{ - Dictionary dict = sent->dict; - int runon_word_corrections = 0; - int num_guesses = 0; - int j, n; - char *sp = NULL; - const char *wp; - char **alternates = NULL; - const char *word = unsplit_word->subword; - - /* If the spell-checker knows about this word, and we don't ... - * Dang. We should fix it someday. Accept it as such. */ - if (spellcheck_test(dict->spell_checker, word)) return false; - - /* Else, ask the spell-checker for alternate spellings - * and see if these are in the dict. */ - n = spellcheck_suggest(dict->spell_checker, &alternates, word); - if (verbosity_level(+D_SW)) - { - lgdebug(0, "spellcheck_suggest for %s:\\", word); - if (0 == n) - lgdebug(0, " (nothing)\n"); - else - lgdebug(0, "\n\\"); - - for (j=0; jdict, altp, WS_RUNON); - runon_word_corrections++; - } - free(runon_word); - } - else - { - /* A spell guess. - */ - if (is_known_word(sent, alternates[j])) - { - wp = alternates[j]; - altp = issue_word_alternative(sent, unsplit_word, "SP", - 0,NULL, 1,&wp, 0,NULL); - set_alt_word_status(sent->dict, altp, WS_SPELL); - num_guesses++; - } - //else prt_error("Debug: Spell guess '%s' ignored\n", alternates[j]); - } - - if (num_guesses >= opts->use_spell_guess) break; - } - if (alternates) spellcheck_free_suggest(dict->spell_checker, alternates, n); - - return ((num_guesses > 0) || (runon_word_corrections > 0)); -} -#endif /* HAVE_HUNSPELL */ - -/** - * Strip off punctuation, etc. on the left-hand side. 
- */ -static const char *strip_left(Sentence sent, const char * w, - const char *r_stripped[], - size_t *n_r_stripped) -{ - const Dictionary afdict = sent->dict->affix_table; - const Afdict_class * lpunc_list; - const char * const * lpunc; - size_t l_strippable; - size_t i; - - if (NULL == afdict) return (w); - lpunc_list = AFCLASS(afdict, AFDICT_LPUNC); - l_strippable = lpunc_list->length; - lpunc = lpunc_list->string; - - *n_r_stripped = 0; - - do - { - for (i=0; idict; - Dictionary afdict = dict->affix_table; - const char * temp_wend = *wend; - char *word = alloca(temp_wend-w+1); - size_t sz; - size_t i; - size_t nrs = 0; - size_t len = 0; - bool stripped = false; - - Afdict_class *rword_list; - size_t rword_num; - const char * const * rword; - - if (*n_r_stripped >= MAX_STRIP-1) return false; - - assert(temp_wend>w, "strip_right: unexpected empty word"); - if (NULL == afdict) return false; - - rword_list = AFCLASS(afdict, classnum); - rword_num = rword_list->length; - rword = rword_list->string; - - do - { - for (i = 0; i < rword_num; i++) - { - const char *t = rword[i]; - - len = strlen(t); - /* The remaining w is too short for a possible match */ - if ((temp_wend-w) < (int)len) continue; - - if (strncmp(temp_wend-len, t, len) == 0) - { - lgdebug(D_UN, "%d: strip_right(%s): w='%s' rword '%s'\n", - p, afdict_classname[classnum], temp_wend-len, t); - r_stripped[*n_r_stripped+nrs] = t; - nrs++; - temp_wend -= len; - break; - } - } - } while ((i < rword_num) && (temp_wend > w) && rootdigit && - (*n_r_stripped+nrs < MAX_STRIP)); - assert(w <= temp_wend, "A word should never start after its end..."); - - sz = temp_wend-w; - strncpy(word, w, sz); - word[sz] = '\0'; - - /* If there is a non-null root, we require that it ends with a number, - * to ensure we stripped off all units. This prevents striping - * off "h." from "20th.". - * FIXME: is_utf8_digit(temp_wend-1, dict) here can only check ASCII digits, - * since it is invoked with the last byte... 
*/ - if (rootdigit && (temp_wend > w) && !is_utf8_digit(temp_wend-1, dict->lctype)) - { - lgdebug(D_UN, "%d: strip_right(%s): return FALSE; root='%s' (%c is not a digit)\n", - p, afdict_classname[classnum], word, temp_wend[-1]); - return false; - } - - stripped = nrs > 0; - if (temp_wend == w) - { - /* Null root - undo the last strip */ - nrs--; - temp_wend += len; - } - - lgdebug(D_UN, "%d: strip_right(%s): return %s; n_r_stripped=%d+%d, wend='%s' temp_wend='%s'\n", -p, afdict_classname[classnum],stripped?"TRUE":"FALSE",(int)*n_r_stripped,(int)nrs,*wend,temp_wend); - - *n_r_stripped += nrs; - *wend = temp_wend; - return stripped; -} - -/** - * Issue an alternative that starts with w and continue with r_stripped[]. - * If wend is NULL, w is Null-terminated. - */ -static void issue_r_stripped(Sentence sent, - Gword *unsplit_word, - const char *w, - const char *wend, - const char *r_stripped[], - size_t n_r_stripped, - const char *nalt) -{ - size_t sz = (NULL==wend) ? strlen(w) : (size_t)(wend-w); - char *const word = alloca(sz+1); - const char **rtokens = NULL; - size_t ntokens = 1; - int i; - - strncpy(word, w, sz); - word[sz] = '\0'; - - altappend(sent, &rtokens, word); - lgdebug(+D_SW, "Issue stripped word w='%s' (alt %s)\n", word, nalt); - for (i = n_r_stripped - 1; i >= 0; i--) - { - lgdebug(+D_SW, "Issue r_stripped w='%s' (alt %s)\n", r_stripped[i], nalt); - altappend(sent, &rtokens, r_stripped[i]); - ntokens++; - } - issue_word_alternative(sent, unsplit_word, nalt, - 0,NULL, ntokens,rtokens, 0,NULL); - free(rtokens); -} - -static void issue_dictcap(Sentence sent, bool is_cap, - Gword *unsplit_word, const char *word) -{ - const char *dictcap[2]; - Gword *altp; - - dictcap[0] = is_cap ? CAP1st : CAPnon; - dictcap[1] = word; - lgdebug(+D_SW, "Adding %s word=%s RE=%s\n", dictcap[0], word, - NULL == unsplit_word->regex_name ? 
"" : unsplit_word->regex_name); - altp = issue_word_alternative(sent, unsplit_word, dictcap[0], - 0,NULL, 2,dictcap, 0,NULL); - - /* Set the dictcap[0] word fields */ - altp->status |= WS_INDICT; /* already checked to be in the dict */ - altp->morpheme_type = MT_FEATURE; - altp->tokenizing_step = TS_DONE; /* no further tokeniation */ - - /* Set the alternative word fields. */ - if(is_cap && (NULL != unsplit_word->regex_name)) - { - /* This is the uc word. */ - altp->next[0]->status |= WS_REGEX; - altp->next[0]->regex_name = unsplit_word->regex_name; - /* issue_word_alternative() will mark it as TS_DONE because it appears in - * an alternative of itself. */ - } - else - { - /* This is the lc version. The original word can be restored later, if - * needed, through the unsplit word. */ - altp->status |= WS_FIRSTUPPER; - } -} - -/* r_stripped debug printout */ -static const char *print_rev_word_array(Sentence sent, const char **w, - size_t size) -{ - String *s = string_new(); - int i; - const char *r; - - for (i = size - 1; i >= 0; i--) - append_string(s, "[%d]='%s'%s", i, w[i], i>0 ? "," : ""); - - r = string_set_add(string_value(s), sent->string_set); - string_delete(s); - return r; -} - -/** - * Check if the word is capitalized according to the regex definitions. - * XXX Not nice - try to avoid the need of using it. - */ -static bool is_re_capitalized(const char *regex_name) -{ - return ((NULL != regex_name) && (NULL != strstr(regex_name, "CAPITALIZED"))); -} - -/** - * Separate a word to subwords in all the possible ways. - * unsplit_word is the current Wordgraph word to be separated to subwords. - * This function splits up the word if necessary, and calls - * "issue_word_alternatives()" on each of the resulting parts ("subwords"), - * creating an "alternative" to the original unsplit_word. - * - * This is used to, e.g, split Russian words into stem+suffix, issuing a - * separate subword for each. 
In addition, there are many English - * constructions that need splitting: - * - * 86mm -> 86 + mm (millimeters, measurement) - * $10 -> $ + 10 (dollar sign plus a number) - * Surprise! -> surprise + ! (pry the punctuation off the end of the word) - * you've -> you + 've (undo contraction, treat 've as synonym for 'have') - * - * The original separate_word() function directly created the 2D-word-array used - * by the parser. This version of separate_word() is a rewrite that creates a - * word graph, referred in the comments as Wordgraph. It is later converted to - * the said 2D-word-array by flatten_wordgraph(). - * - * The current separate_word() code is still too similar to the old one, even - * though some principles of operation are radically different: the separated - * subwords are now put in a central word queue, from which they are pulled out - * one by one. If a word is marked by TS_DONE, it will be removed from - * the word queue without further processing. - * - * The function gets each word in the queue, separates it to subwords and create - * alternatives from each such separation, until all the separating - * possibilities are exhausted. - * - * FIXME: The old code, although working, is convoluted and contains redundant - * parts. It needs much cleanup efforts, also to make it more flexible and - * efficient, and at the same time prevent extra splitting (i.e. prevent issuing - * alternatives which create graph paths with the same sequence of subwords as - * existing parallel graph paths). - * A test case: By the '50s, he was very prosperous. - * - * XXX This function is being rewritten (work in progress). - */ -static void separate_word(Sentence sent, Gword *unsplit_word, Parse_Options opts) -{ - Dictionary dict = sent->dict; - bool word_is_known = false; - bool word_can_split; - bool word_can_lrsplit = false; /* This is needed to prevent spelling on - * compound subwords, like "Word." 
while - * still allowing capitalization handling - * and regex match. */ - bool lc_word_is_in_dict = false; - bool stripped; - const char *wp; - const char *temp_wend; - - size_t n_r_stripped = 0; - const char *r_stripped[MAX_STRIP]; /* these were stripped from the right */ - - /* For units alternative */ - const char *units_wend = NULL; /* end of string consisting of units */ - size_t units_n_r_stripped = 0; - - size_t sz = strlen(unsplit_word->subword); - const char *word = unsplit_word->subword; - const char *wend = &unsplit_word->subword[sz]; - - /* Dynamic allocation of working buffers. */ - int downcase_size = sz+MB_LEN_MAX+1; /* pessimistic max. size of dc buffer */ - char *const downcase = alloca(downcase_size); /* downcasing buffer */ - char *const temp_word = alloca(downcase_size); /* tmp word buffer */ - char *const seen_word = alloca(downcase_size); /* loop-prevention buffer */ - - downcase[0] = '\0'; - - lgdebug(+D_SW, "Processing word: '%s'\n", word); - - if (unsplit_word->status & (WS_SPELL|WS_RUNON)) - { - /* The word is a result of spelling. - * So it it is in the dict, and doesn't need right/left stripping. */ - unsplit_word->status |= WS_INDICT; - word_is_known = true; - } - else - { - if (boolean_dictionary_lookup(dict, word)) - { - lgdebug(+D_SW, "0: Adding '%s' as is, before split tries\n", word); - issue_word_alternative(sent, unsplit_word, "W", - 0,NULL, 1,&word, 0,NULL); - unsplit_word->status |= WS_INDICT; - word_is_known = true; - } - - if ((MT_CONTR == unsplit_word->morpheme_type)) - { - /* The word is the contracted part of a contraction. It was most - * probably been marked as dict word by the check above (unless there - * is a definition error and it is only PRE or SUF without being in the - * dict). - * It should also not pass any more handling, so return here. - * Especially it should not pass right-strip. 
Else y' gets split to - * y ' and 'll gets split as units to ' l l - * FIXME This prevents separating double contraction (that still may - * not be done even otherwise). - * http://en.wiktionary.org/wiki/Category:English_double_contractions*/ - if (!word_is_known) - { - /* Note: If we are here it means dict->affix_table is not NULL. */ - prt_error("Warning: Contracted word part %s is in '%s/%s' " - "but not in '%s/%s'\n", word, - dict->lang, dict->affix_table->name, - dict->lang, dict->name); - } - return; - } - - /* - * This is essentially the old LR stripping code, from the pre-Wordgraph - * version. It still seems to work fine. Work should be done here in - * order to simplify it. - */ - - wp = strip_left(sent, word, r_stripped, &n_r_stripped); - if (wp != word) - { - /* If n_r_stripped exceed max, the "word" is most likely includes a long - * sequence of periods. Just accept it as an unknown "word", - * and move on. - * FIXME: Word separation may still be needed, e.g. for a table of - * contents: - * ............................something - * FIXME: "return" here prevents matching a regex. - */ - if (n_r_stripped >= MAX_STRIP-1) - { - lgdebug(+D_SW, "Left-strip of >= %d tokens\n", MAX_STRIP-1); - return; /* XXX */ - } - - if ('\0' != *wp) - r_stripped[n_r_stripped++] = wp; - - issue_word_alternative(sent, unsplit_word, "rL", - 0,NULL, n_r_stripped,r_stripped, 0,NULL); - - /* Its possible that the token consisted entirely of - * left-punctuation, in which case, wp is an empty string. - * In case this is a single token (n_r_stripped == 1), we have - * to continue processing, because it may match a regex. */ - if ('\0' == *wp && n_r_stripped != 1) - { - /* Suppose no more alternatives in such a case. 
*/ - lgdebug(+D_SW, "1: Word '%s' all left-puncts - done\n", - unsplit_word->subword); - return; - } - - n_r_stripped = 0; - word_can_lrsplit = true; - } - - lgdebug(+D_SW, "1: Continue with word %s status=%s\n", - word, gword_status(sent, unsplit_word)); - - /* Strip off punctuation and units, etc. on the right-hand side. Try - * rpunc, then units, then rpunc, then units again, in a loop. We do this - * to handle expressions such as 12sqft. or 12lbs. (notice the period at - * end). That is, we want to strip off the "lbs." with the dot, first, - * rather than stripping the dot as punctuation, and then coming up - * empty-handed for "sq.ft" (without the dot) in the dict. But if we are - * NOT able to strip off any units, then we try punctuation, and then - * units. This allows commas to be removed (e.g. 7grams,). */ - - seen_word[0] = '\0'; - do - { - int temp_n_r_stripped; - /* First, try to strip off a single punctuation, typically a comma or - * period, and see if the resulting word is in the dict (but not the - * regex). This allows "sin." and "call." to be recognized. If we don't - * do this now, then the next stage will split "sin." into - * seconds-inches, and "call." into calories-liters. */ - temp_n_r_stripped = n_r_stripped; - temp_wend = wend; - stripped = strip_right(sent, word, &wend, r_stripped, &n_r_stripped, - AFDICT_RPUNC, /*rootdigit*/false, 2); - if (stripped) - { - /* "wend" points to the end of the remaining word. */ - sz = wend-word; - strncpy(temp_word, word, sz); - temp_word[sz] = '\0'; - - /* If the resulting word is in the dict, we are done. */ - if (boolean_dictionary_lookup(dict, temp_word)) break; - /* Undo the check. */ - wend = temp_wend; - n_r_stripped = temp_n_r_stripped; - } - - /* Remember the results, for a potential alternative. */ - units_wend = wend; - units_n_r_stripped = n_r_stripped; - - /* Strip off all units, if possible. It is not likely that we strip - * here a string like "in." 
which is not a unit since we require a - * number before it when only a single component is stripped off. */ - temp_wend = wend; - stripped = strip_right(sent, word, &wend, r_stripped, &n_r_stripped, - AFDICT_UNITS, /*rootdigit*/true, 3); - if (!stripped) - { - units_wend = NULL; - /* Try to strip off punctuation, typically a comma or period. */ - stripped = strip_right(sent, word, &wend, r_stripped, &n_r_stripped, - AFDICT_RPUNC, /*rootdigit*/false, 4); - } - - /* w points to the remaining word, - * "wend" to the end of the word. */ - sz = wend-word; - strncpy(temp_word, word, sz); - temp_word[sz] = '\0'; - - /* Avoid an infinite loop in case of a repeating unknown remaining word */ - if (0 == strcmp(temp_word, seen_word)) break; - strcpy(seen_word, temp_word); - - /* Any remaining dict word stops the right-punctuation stripping. */ - } while (NULL == units_wend && stripped && - !boolean_dictionary_lookup(dict, temp_word)); - - lgdebug(+D_SW, "After strip_right: n_r_stripped=(%s) " - "word='%s' wend='%s' units_wend='%s' temp_word='%s'\n", - print_rev_word_array(sent, r_stripped, n_r_stripped), - word, wend, units_wend, temp_word); - - /* If n_r_stripped exceed max, the "word" most likely includes a long - * sequence of periods. Just accept it as an unknown "word", - * and move on. - * FIXME: Word separation may still be needed, e.g. for a table of - * contents: - * 10............................ - */ - if (n_r_stripped >= MAX_STRIP-1) - { - lgdebug(+D_SW, "Right-strip of >= %d tokens\n", MAX_STRIP-1); - return; /* XXX */ - } - - /* Check whether the "word" is in the dict (including - * regex). In such a case we need to generate an alternative. This happens - * if it is a part number, like "1234-567A". 
- */ - - if (units_n_r_stripped && units_wend) /* units found */ - { - sz = units_wend-word; - strncpy(temp_word, word, sz); - temp_word[sz] = '\0'; - - if (find_word_in_dict(dict, temp_word)) - { - issue_r_stripped(sent, unsplit_word, temp_word, NULL, - r_stripped, units_n_r_stripped, "rR2"); - word_can_lrsplit = true; - } - } - - - /* Add the strip result as an alternative if one of these conditions is - * true: - * - If the root word (temp_word) is known. - * - If the unsplit_word is unknown. This happens with an unknown word - * that has punctuation after it). */ - if (n_r_stripped > 0) - { - sz = wend-word; - strncpy(temp_word, word, sz); - temp_word[sz] = '\0'; - - if (!find_word_in_dict(dict, unsplit_word->subword) || - find_word_in_dict(dict, temp_word)) - { - issue_r_stripped(sent, unsplit_word, temp_word, NULL, - r_stripped, n_r_stripped, "rR3"); - word_can_lrsplit = true; - } - } - } - - lgdebug(+D_SW, "2: Continue with word %s, can_lrsplit=%d status=%s\n", - word, word_can_lrsplit, gword_status(sent, unsplit_word)); - - /* Generate random morphology */ - if (dict->affix_table && dict->affix_table->anysplit) - anysplit(sent, unsplit_word); - - /* OK, now try to strip affixes. */ - word_can_split = morpheme_split(sent, unsplit_word, word); - - /* If the word is unknown, then try to guess its category by regexes. - * A word that can split is considered known, unless it is a contraction, - * in which case we need a regex for things like 1960's. - * The first regex which matches (if any) is used. - * An alternative consisting of the word has already been generated. */ - if (!word_is_known && (!word_can_split || is_contraction_word(dict, word))) - { - const char *regex_name = match_regex(dict->regex_root, word); - if ((NULL != regex_name) && boolean_dictionary_lookup(dict, regex_name)) - { - unsplit_word->status |= WS_REGEX; - unsplit_word->regex_name = regex_name; - /* Don't set word_is_known=true yet. 
*/ - } - } - - lgdebug(+D_SW, "After split step, word=%s can_split=%d is_known=%d RE=%s\n", - word, word_can_split, word_is_known, - (NULL == unsplit_word->regex_name) ? "" : unsplit_word->regex_name); - - if (is_utf8_upper(word, dict->lctype)) - { - if (!test_enabled("dictcap")) - { - /** Hard-coded English-centric capitalization handling. - * - * FIXME: Capitalization handling should be done using the dict. - * - * If the word is capitalized, then issue as alternatives: - * - Issue its lowercase version if it is in a capitalizable - * position and also it is in the dict. - * - Issue it (capitalized) too as a word to regex (so the - * capitalized-words regex disjuncts will be used), in these - * conditions (cumulative): - * -- It could not be split (else capitalization has been - * handled XXX). - * -- It is not in the dict (it has already been issued in - * that case). - * -- It is not in a capitalizable position in the sentence. - * -- Its lowercase version is in the dict file (not regex) and - * it is an entity (checked capitalized) or a common entity - * (checked as lowercase). - * - * Comments from a previous release: - * - * * Common entity (checked as lowercase): This allows common - * nouns and adjectives to be used for entity names: e.g. "Great - * Southern Union declares bankruptcy", allowing Great to be - * capitalized, while preventing an upper-case "She" being used - * as a proper name in "She declared bankruptcy". - * - * * Entity (checked capitalized): We need to *add* Sue.f (female - * name Sue) even though sue.v (the verb "to sue") is in the - * dict. So test for capitalized entity names. FIXME: [ap] - * Since capitalized words which are in the dict file are now - * issued anyway as uppercase, and the capitalized-words regexes - * are not marked in the dict as entities, this may have effect - * only for capitalized words that match non-capitalized-words - * regexes that are marked as entities. 
I don't know about such, - * and if there are indeed no such regexes, it looks like the - * is_entity() check is redundant. A test "is_entity" added - * below to check if there is any sentence in the batches that - * contradicts that. - */ - bool word_is_capitalizable = is_capitalizable(dict, unsplit_word); - - if ('\0' == downcase[0]) - downcase_utf8_str(downcase, word, downcase_size, dict->lctype); - lc_word_is_in_dict = boolean_dictionary_lookup(dict, downcase); - - if (word_is_capitalizable) - { - if (lc_word_is_in_dict) - { - /* Issue the lowercase version of the word. */ - Gword *lc; - - wp = downcase; - lgdebug(+D_SW, "Adding lc=%s, is_capitalizable=1\n", wp); - lc = issue_word_alternative(sent, unsplit_word, "LC", - 0,NULL, 1,&wp, 0,NULL); - /* This is the lc version. The original word can be restored - * later, if needed, through the unsplit word. */ - lc->status |= WS_FIRSTUPPER; - } - else /* for a comment */ - { - /* If we are here, it is a capitalized word in a capitalized - * position which its lowercase version is not in the dict file. - * Should we try a regex match if the word is unknown? 
*/ - } - } - - lgdebug(+D_SW, "Word=%s lc=%s in_dict=%d is_known=%d can_split=%d " - "is_capitalizable=%d lc_is_in_dict=%d " - "is_entity=%d is_common_entity=%d\n", - word, downcase, !!(unsplit_word->status & WS_INDICT), - word_is_known, word_can_split, - word_is_capitalizable, lc_word_is_in_dict, - is_entity(dict, word), is_common_entity(dict, downcase)); - - if (!word_can_split && !word_is_known && - (!word_is_capitalizable || (lc_word_is_in_dict && - (is_common_entity(dict, downcase) || is_entity(dict, word))))) - { - /* Issue it (capitalized) too */ - if ((NULL != unsplit_word->regex_name)) - { - lgdebug(+D_SW, "Adding uc word=%s RE=%s\n", word, - unsplit_word->regex_name); - issue_word_alternative(sent, unsplit_word, "REuc", - 0,NULL, 1,&word, 0,NULL); - word_is_known = true; - - if (test_enabled("is_entity") && is_entity(dict, word)) - prt_error("is_entity(%s): %s\n", word, sent->orig_sentence); - } - } - word_is_known |= lc_word_is_in_dict; - } - else - { - /* - * Experimental dictionary handling for capitalized words. - */ - - if (!boolean_dictionary_lookup(dict, CAP1st) || - !boolean_dictionary_lookup(dict, CAPnon)) - { - /* FIXME Move this check. Make it once. */ - prt_error("Error: Missing " CAP1st "/" CAPnon "in the dict\n"); - return; - } - - /* - If the (uc) word is in the dict, it has already been issued. - * - If the word is not a capitalized word according to the regex file, - * it also should not be issued, even if is_utf8_upper(word, dict), - * e.g Y'gonna or Let's. */ - if (!(unsplit_word->status & WS_INDICT) && - is_re_capitalized(unsplit_word->regex_name)) - { - issue_dictcap(sent, /*is_cap*/true, unsplit_word, word); - } - - downcase_utf8_str(downcase, word, downcase_size, dict->lctype); - /* Issue the lc version if it is known. - * FIXME? Issuing only known lc words prevents using the unknown-word - * device for words in capitalizable position (when the word is a uc - * version of an unknown word). 
*/ - if (find_word_in_dict(sent->dict, downcase)) - issue_dictcap(sent, /*is_cap*/false, unsplit_word, downcase); - - word_is_known = true; /* We could just return */ - } - } - - /* Handle regex match. This is done for words which are not in the dict - * and cannot morpheme split. - * - * Contracted words, like 1960's should be tried - words that contain - * punctuation are not going to match). - * - * However, capital LR-split words which their lc version is in the dict, - * such as "As" (gets split to A s) shouldn't be tried here, as their - * capitalization handling has already been handled before we arrived here, - * and if a capital-word regex has not been issued there, we should prevent - * issuing it here. */ - if (!(word_is_known || lc_word_is_in_dict || - (word_can_split && !is_contraction_word(dict, word)))) - { - if ((NULL != unsplit_word->regex_name)) - { - lgdebug(+D_SW, "Adding word '%s' for regex, match=%s\n", - word, unsplit_word->regex_name); - issue_word_alternative(sent, unsplit_word, "RE", - 0,NULL, 1,&word, 0,NULL); - - word_is_known = true; /* make sure we skip spell guess */ - } - } - - word_is_known |= word_can_split; - -#if defined HAVE_HUNSPELL || defined HAVE_ASPELL - /* If the word is unknown, then it might be a run-on of two words or a - * misspelled word. Ask the spell-checker to split the word, if possible, - * and/or offer guesses. - * - * Do all of this only if the word is not a proper name, and if - * spell-checking is enabled and spell-checker is specified. A word which - * contains digits is considered a proper name (maybe of a part number). - * - * ??? Should we add spell guesses as alternatives in case: - * 1. The word if not in the main dict but matches a regex. - * 2. The word an unknown capitalized word. 
- */ - if (!word_can_lrsplit && !word_is_known && - !contains_digits(word, dict->lctype) && - !is_proper_name(word, dict->lctype) && - opts->use_spell_guess && dict->spell_checker) - { - bool spell_suggest = guess_misspelled_word(sent, unsplit_word, opts); - lgdebug(+D_SW, "Spell suggest=%d\n", spell_suggest); - } -#endif /* defined HAVE_HUNSPELL || defined HAVE_ASPELL */ - - lgdebug(+D_SW, "END: Word '%s' in_dict=%d is_known=%d status=%s\n", - unsplit_word->subword, !!(unsplit_word->status & WS_INDICT), - word_is_known, gword_status(sent, unsplit_word)); -#if 0 - if (!word_is_known && - !(unsplit_word->status & (WS_INDICT|WS_REGEX))) - unsplit_word->status |= WS_UNKNOWN; -#endif -} - -/** - * Make the string 's' be the next word of the sentence. - * - * Do not issue the empty string. - * - * FIXME: We need it for now in order to initially populate the wordgraph. - */ -static Gword *issue_sentence_word(const Sentence sent, const char *const s) -{ - Gword *new_word; - Gword *last_word = sent->last_word; - - assert(NULL!=last_word); - assert(NULL!=s, "subword must not be NULL"); - assert('\0'!=s[0], "subword must not be empty: Last subword issued: '%s'", - last_word->subword); - - new_word = gword_new(sent, s); - new_word->unsplit_word = sent->wordgraph; - new_word->label = "S"; /* a sentence word */ - - gwordlist_append(&last_word->next, new_word); - gwordlist_append(&new_word->prev, last_word); - - gwordqueue_add(sent, new_word); - - return new_word; -} - -static void add_gword(Sentence sent, const char *w, const char *wend, - Morpheme_type morpheme_type) -{ - const size_t sz = (NULL == wend) ? 
strlen(w) : (size_t)(wend - w); - char *const word = alloca(sz+1); - Gword *new_word; - - strncpy(word, w, sz); - word[sz] = '\0'; - - new_word = issue_sentence_word(sent, word); - new_word->morpheme_type = morpheme_type; - new_word->alternative_id = sent->wordgraph; - if (MT_WORD != morpheme_type) - { - /* Skip tokenizing this word */ - new_word->tokenizing_step = TS_DONE; - if (MT_WALL == morpheme_type) new_word->status |= WS_INDICT; - } -} - -/** - * Create the first node of the wordgraph. - * Its subword is the whole original sentence. - * It also serves as a leading dummy "word" for the sentence, like a "margin - * mark", in order to simplify the wordgraph pointer manipulation (no need to - * check for NULL "prev" pointer). The unsplit_word field is NULL only for the - * leading and trailing margin words. - */ -static void wordgraph_create(Sentence const sent) -{ - Gword *new_word; - - assert(NULL==sent->last_word, "wordgraph_create(): wordgraph exists"); - new_word = gword_new(sent, sent->orig_sentence); - - assert(NULL!=sent->orig_sentence, "wordgraph_create()"); - assert(NULL==sent->wordgraph, "wordgraph_create(): wordgraph exists"); - - sent->wordgraph = sent->last_word = new_word; - new_word->label = "D"; /* dummy word */ - new_word->morpheme_type = MT_INFRASTRUCTURE; -} - -/** - * Create a trailing dummy "word" for the sentence. - * It is a sentence "margin" trailing mark, in order to simplify the wordgraph - * pointer manipulation (no need to check for NULL "next" pointer). - * The unsplit_word field is NULL only for the margin words. 
*/ -static void wordgraph_terminator(Sentence const sent) -{ - assert(NULL != sent->last_word, "wordgraph_terminator(): no wordgraph"); - add_gword(sent, "(T)", NULL, MT_INFRASTRUCTURE); /* cannot use "" */ - sent->last_word->unsplit_word = NULL; /* no unsplit word */ - sent->last_word->label = "D"; /* dummy word */ - sent->last_word->tokenizing_step = TS_DONE; /* not to be tokenized */ -} - -/** - * The string s has just been read in from standard input. - * This function breaks it up into words and stores these words in - * the sent->word[] array. Returns true if all is well, false otherwise. - */ -bool separate_sentence(Sentence sent, Parse_Options opts) -{ - const char * word_end; - //bool quote_found; - Dictionary dict = sent->dict; - mbstate_t mbs; - const char * word_start = sent->orig_sentence; - Gword *word; - - sent->length = 0; - - wordgraph_create(sent); - - if (dict->left_wall_defined) - add_gword(sent, LEFT_WALL_WORD, NULL, MT_WALL); - - /* Reset the multibyte shift state to the initial state */ - memset(&mbs, 0, sizeof(mbs)); - -#ifdef DEBUG - /* Skip a synthetic sentence mark, if any. See synthetic_split(). */ - if (SYNTHETIC_SENTENCE_MARK == sent->orig_sentence[0]) word_start++; -#endif - - for(;;) - { - wchar_t c; - int nb = mbrtowc(&c, word_start, MB_CUR_MAX, &mbs); - if (0 > nb) goto failure; - - - while (is_space(c, dict->lctype)) - { - word_start += nb; - nb = mbrtowc(&c, word_start, MB_CUR_MAX, &mbs); - if (0 == nb) break; - if (0 > nb) goto failure; - } - - if ('\0' == *word_start) break; - - /* Loop over non-blank characters until word-end is found. */ - word_end = word_start; - nb = mbrtowc(&c, word_end, MB_CUR_MAX, &mbs); - if (0 > nb) goto failure; - while (!is_space(c, dict->lctype) && (c != 0) && (0 < nb)) - { - word_end += nb; - nb = mbrtowc(&c, word_end, MB_CUR_MAX, &mbs); - if (0 > nb) goto failure; - } - - /* FIXME: Morpheme type of initial bad-sentence word may be wrong. - * E.g: He 's here. 
(Space before ' so 's is classified as MT_WORD). */ - add_gword(sent, word_start, word_end, MT_WORD); - word_start = word_end; - if ('\0' == *word_start) break; - } - - if (dict->right_wall_defined) - add_gword(sent, RIGHT_WALL_WORD, NULL, MT_WALL); - - wordgraph_terminator(sent); - - while ((word = wordgraph_getqueue_word(sent))) - { - if (TS_DONE == word->tokenizing_step) - { - remqueue_gword(sent); - continue; - } - - /* Perform prefix, suffix splitting, if needed */ -#ifdef DEBUG - if (SYNTHETIC_SENTENCE_MARK == sent->orig_sentence[0]) - synthetic_split(sent, word); -#else - if (0) - ; -#endif - else - separate_word(sent, word, opts); - - word->tokenizing_step = TS_DONE; - } - - /* Return true if at least one sentence word has been issued */ - for (word = sent->wordgraph; NULL != word->next; word = word->next[0]) - { - if ((word->morpheme_type != MT_INFRASTRUCTURE) && - (word->morpheme_type != MT_WALL)) - { - /* !test=wg or !test=wg:flags (for flags see wordgraph.h) */ - if (test_enabled("wg")) - wordgraph_show(sent, test_enabled("wg")); - return true; - } - } - return false; /* Something is wrong */ - -failure: -#ifdef _WIN32 - prt_error("Unable to process UTF8 input string.\n"); -#else - prt_error("Unable to process UTF8 input string in current locale %s\n", - nl_langinfo(CODESET)); -#endif - return false; -} - -static Word *word_new(Sentence sent) -{ - const size_t len = sent->length; - - sent->word = realloc(sent->word, (len+1)*sizeof(*sent->word)); - sent->word[len].d= NULL; - sent->word[len].x= NULL; - sent->word[len].unsplit_word = NULL; - sent->word[len].alternatives = NULL; - sent->word[len].optional = false; - sent->length++; - - return &sent->word[len]; -} - -/** - * Build the expression lists for a given word at the current word-array word. - * - * The resulted word-array is later used as an input to the parser. - * - * Algorithm: - * Apply the following step to all words w: - * - If w is in the dictionary, use it. 
- * - Else if w is identified by regex matching, use the appropriately - * matched disjunct collection. - * - Otherwise w is unknown - use the disjunct collection of UNKNOWN_WORD. - * - * FIXME For now, also add an element to the alternatives array, so the rest of - * program will work fine (print_sentence_word_alternatives(), - * sentence_in_dictionary(), verr_msg()). - */ -#define D_X_NODE 9 -#define D_DWE 8 -static bool determine_word_expressions(Sentence sent, Gword *w, - unsigned int *ZZZ_added) -{ - Dictionary dict = sent->dict; - const size_t wordpos = sent->length - 1; - - const char *s = w->subword; - X_node * we = NULL; - - lgdebug(+D_DWE, "Word %zu subword %zu:'%s' status %s", - wordpos, w->node_num, s, gword_status(sent, w)); - if (NULL != sent->word[wordpos].unsplit_word) - lgdebug(D_DWE, " (unsplit '%s')", sent->word[wordpos].unsplit_word); - - /* Generate an "alternatives" component. */ - altappend(sent, &sent->word[wordpos].alternatives, s); - - if (w->status & WS_INDICT) - { - we = build_word_expressions(sent, w, NULL); - } - else if (w->status & WS_REGEX) - { - we = build_word_expressions(sent, w, w->regex_name); - } - else if (dict->unknown_word_defined && dict->use_unknown_word) - { - we = build_word_expressions(sent, w, UNKNOWN_WORD); - assert(we, UNKNOWN_WORD " supposed to be defined in the dictionary!"); - w->status |= WS_UNKNOWN; - } - else - { - /* The word is unknown, but UNKNOWN_WORD cannot be used. - * An error message will eventually be printed. */ - prt_error("Error: Word '%s': word is unknown\n", w->subword); - return false; - } - -#ifdef DEBUG - assert(NULL != we, "Word '%s': NULL X-node", w->subword); -#else - if (NULL == we) - { - /* FIXME Change it to assert() when the Wordgraph version is mature. 
*/ - prt_error("Error: Word '%s': Internal error: NULL X_node", w->subword); - return false; - } -#endif - - /* If the current word is an empty-word (or like it), add a - * connector for an empty-word (EMPTY_CONNECTOR - ZZZ+) to the - * previous word. See the comments at add_empty_word(). - * As a shortcut, only the first x-node is checked here for ZZZ-, - * supposing that the word has it in all of its dict entries - * (in any case, currently there is only 1 entry for each such word). - * Note that ZZZ_added starts by 0 and so also wordpos, and that the - * first sentence word (usually LEFT-WALL) doesn't need a check. */ - if ((wordpos != *ZZZ_added) && is_exp_like_empty_word(dict, we->exp)) - { - lgdebug(D_DWE, " (has ZZZ-)"); - add_empty_word(dict, sent->word[wordpos-1].x); - *ZZZ_added = wordpos; /* Remember it for not doing it again */ - } - lgdebug(D_DWE, "\n"); - - /* At last .. concatenate the word expressions we build for - * this alternative. */ - sent->word[wordpos].x = catenate_X_nodes(sent->word[wordpos].x, we); - if (verbosity_level(D_X_NODE)) - { - /* Print the X_node details for the word. */ - prt_error("Debug: Tokenize word/alt=%zu/%zu '%s' re=%s\n\\", - wordpos, altlen(sent->word[wordpos].alternatives), s, - w->regex_name ? w->regex_name : ""); - while (we) - { - prt_error("Debug: string='%s' expr=", we->string); - print_expression(we->exp); - we = we->next; - } - } - - return true; -} -#undef D_DWE - -#if 0 /* unused */ -/** - * Find whether w1 and w2 have been generated together in the same alternative. 
- */ -static bool is_alternative_next_word(const Gword *w1, const Gword *w2) -{ - assert(NULL != w1->alternative_id, "Word '%s' NULL alternative_id", - w1->subword); - lgdebug(+6, "w1='%s' (%p=%s) w2='%s' (%p=%s) \n", - w1->subword, w1->alternative_id, w1->alternative_id->subword, - w2->subword, w2->alternative_id, w2->alternative_id->subword); - return (w1->alternative_id == w2->alternative_id); -} -#endif - -#ifdef FIXIT /* unused */ -/* XXX WS_UNSPLIT */ -static bool same_unsplit_word(Sentence sent, const Gword *w1, const Gword *w2) -{ - return ((w1->unsplit_word == w2->unsplit_word) && - (w1->unsplit_word != sent->wordgraph)); -} -#endif - -/** - * "Flatten" the wordgraph into a word array. - * Return false if an error was encountered. - */ -#define D_FW 8 -bool flatten_wordgraph(Sentence sent, Parse_Options opts) -{ - Wordgraph_pathpos *wp_new = NULL; - Wordgraph_pathpos *wp_old = NULL; - Wordgraph_pathpos *wpp_new, *wpp_old; - Gword *wg_word; /* A wordgraph word */ - Gword **next; /* The next words */ - const Gword *last_unsplit_word = NULL; - size_t max_words = 0; - bool error_encountered = false; - bool right_wall_encountered = false; - unsigned int ZZZ_added = 0; /* ZZZ+ has been added to previous word */ - - assert(0 == sent->length, "flatten_wordgraph(): Word array already exists."); - - /* Establish an upper bound on the total number of words, to prevent an - * infinite loop in case of a bug. At the same time, calculate the - * hierarchy position of the word. */ - for (wg_word = sent->wordgraph->chain_next; wg_word; - wg_word = wg_word->chain_next) - { - wordgraph_hier_position(wg_word); - max_words++; - } - - /* Populate the pathpos word queue */ - for (next = sent->wordgraph->next; *next; next++) - { - wordgraph_pathpos_add(&wp_new, *next, - false/* used */, false/* same_word */, - true/* diff_alternative */); - } - - /* Scan the wordgraph and flatten it. 
*/ - do - { - Word *wa_word; /* A word-array word (for the parsing stage) */ - const Gword *unsplit_word; - - assert(NULL != wp_new, "pathpos word queue is empty"); - wp_old = wp_new; - wp_new = NULL; - print_wordgraph_pathpos(wp_old); - - /* Add a new word to the sentence word array. - */ - assert(0 < max_words--, "Too many words (it may be an infinite loop)"); - wa_word = word_new(sent); - - /* Go upward and find the sentence word. */ - unsplit_word = wp_old->word; - if (MT_INFRASTRUCTURE != unsplit_word->morpheme_type) - { - while (unsplit_word->unsplit_word != sent->wordgraph) - { - assert(NULL != unsplit_word, "'%s': Unsplit word not found", - wg_word->subword); - unsplit_word = unsplit_word->unsplit_word; - } - - assert(NULL != unsplit_word->subword, "Unsplit word not found"); - - if (unsplit_word != last_unsplit_word) - { - /* This is a new sentence word - use it as the unsplit word. */ - wa_word->unsplit_word = unsplit_word->subword; - last_unsplit_word = unsplit_word; - } - } - - /* Generate the X-nodes. */ - for (wpp_old = wp_old; NULL != wpp_old->word; wpp_old++) - { - wg_word = wpp_old->word; - if (NULL == wg_word->next) continue; /* XXX avoid termination */ - - if (wpp_old->same_word) - { - /* We haven't advanced to the next wordgraph word, so its X-node - * has already been generated in a previous word of the word - * array. This means we are in a longer alternative which has - * "extra" words that may not have links, and this is one of - * them. Mark it as "optional", so we consider that while - * parsing, and then remove it in case it doesn't have links. */ - sent->word[sent->length - 1].optional = true; - } - else - { - /* Words are not supposed to get issued more than once. */ - assert(!wpp_old->used, "Word %zu:%s has been used", - wg_word->node_num, wpp_old->word->subword); - - /* This is a new wordgraph word. 
- */ - assert(!right_wall_encountered, "Extra word"); - if (!determine_word_expressions(sent, wg_word, &ZZZ_added)) - error_encountered = true; - if ((MT_WALL == wg_word->morpheme_type) && - 0== strcmp(wg_word->subword, RIGHT_WALL_WORD)) - right_wall_encountered = true; - wpp_old->used = true; - } - } - - /* Scan the old pathpos queue, and check for which words we can advance - * in the wordgraph. Do it in two passes: - * 1. Advance to next words that are next in the alternative of old - * words. - * 2. Advance to next words that are in a different alternative than the - * words that are already in the new pathpos queue. - */ - - for (wpp_old = wp_old; NULL != wpp_old->word; wpp_old++) - { - wg_word = wpp_old->word; - if (NULL == wg_word->next) continue; /* XXX avoid termination word */ - - /* Here wg_word->next cannot be NULL. */ - assert(NULL != wg_word->next[0], "Bad wordgraph: " - "'%s'->next[0]==NULL", wg_word->subword); - assert((NULL != wg_word->next[0]->prev) - || (NULL != wg_word->next[0]->next), "Bad wordgraph: " - "'%s'->next[0]->prev/next==NULL", wg_word->subword); - assert(NULL != wg_word->next[0]->prev[0], "Bad wordgraph: " - "'%s'->next[0]->prev[0]==NULL", wg_word->subword); - - for (next = wg_word->next; NULL != *next; next++) - { - if (wg_word->hier_depth <= (*next)->hier_depth && - (NULL == (*next)->prev[1])) - { - lgdebug(+D_FW, "Word %zu:%s(%zu) next %zu:%s(%zu) next_ok\n", - wg_word->node_num, wg_word->subword, wg_word->hier_depth, - (*next)->node_num, (*next)->subword, (*next)->hier_depth); - wpp_old->next_ok = true; - break; - } - } - - if (wpp_old->next_ok) - { - lgdebug(+D_FW, "Advancing %zu:%s next_ok\n", wg_word->node_num, - wg_word->subword); - for (next = wg_word->next; NULL != *next; next++) - { - wordgraph_pathpos_add(&wp_new, *next, - false/* used */, false/* same_word */, - true/* diff_alternative */); - } - } - } - - for (wpp_old = wp_old; NULL != wpp_old->word; wpp_old++) - { - wg_word = wpp_old->word; - - if (!wpp_old->next_ok) 
/* next_ok words have got handled above */ - { - bool same_alternative = false; - - if (NULL == wg_word->next) continue; /* termination word */ - - if (NULL != wp_new) - { - for (next = wg_word->next; NULL != *next; next++) - { - for (wpp_new = wp_new; NULL != wpp_new->word; wpp_new++) - { - if ((wpp_new->word != *next) && - in_same_alternative(wpp_new->word, *next)) - { - lgdebug(+D_FW, "same_alternative: %zu:%s and %zu:%s\n", - wpp_new->word->node_num, wpp_new->word->subword, - (*next)->node_num, (*next)->subword); - same_alternative = true; - break; - } - } - if (same_alternative) break; /* shortcut */ - } - } - - /* If there are already words in the pathpos queue from the same - * alternative of the common ancestor of the next word, we cannot - * put it yet in the queue, because we should not put in the same - * slot of the word-array, words from the same alternative since - * else only one of them can be chosen by the linkage. Hence put - * again in the pathpos queue the current word, marking it was - * "same_word". This will cause generation of an empty word in the - * next round. 
*/ - lgdebug(+D_FW, "Advancing %zu:%s: ", wg_word->node_num, - wg_word->subword); - - if (same_alternative) - { - lgdebug(D_FW, "No (same alt) used=%d\n", wpp_old->used); - wordgraph_pathpos_add(&wp_new, wg_word, - wpp_old->used, true/* same_word */, - true/* diff_alternative */); - } - else - { - bool added = false; - - for (next = wg_word->next; NULL != *next; next++) - added |= wordgraph_pathpos_add(&wp_new, *next, - false/* used */, - false/* same_word */, - true/* diff_alternative */); - if (added) - { - lgdebug(D_FW, "Yes\n"); - } - else - { - lgdebug(D_FW, "No (existing)\n"); - } - } - } - } - - free(wp_old); - } while ((NULL != wp_new[1].word) || - (wp_new[0].word->morpheme_type != MT_INFRASTRUCTURE)); - - free(wp_new); - lgdebug(+D_FW, "sent->length %zu\n", sent->length); - if (verbosity_level(D_SW)) - print_sentence_word_alternatives(sent, true, NULL, NULL); - - return !error_encountered; -} -#undef D_FW - -/** - * This just looks up all the words in the sentence, and builds - * up an appropriate error message in case some are not there. - * It has no side effect on the sentence. Returns true if all - * went well. - * - * This code is called only if the 'unknown-words' flag is set. 
- */ -bool sentence_in_dictionary(Sentence sent) -{ - bool ok_so_far; - size_t w; - const char * s; - Dictionary dict = sent->dict; - char temp[1024]; - - ok_so_far = true; - for (w=0; wlength; w++) - { - size_t ialt; - for (ialt=0; NULL != sent->word[w].alternatives[ialt]; ialt++) - { - s = sent->word[w].alternatives[ialt]; - if (!find_word_in_dict(dict, s)) - { - if (ok_so_far) - { - safe_strcpy(temp, "The following words are not in the dictionary:", sizeof(temp)); - ok_so_far = false; - } - safe_strcat(temp, " \"", sizeof(temp)); - safe_strcat(temp, s, sizeof(temp)); - safe_strcat(temp, "\"", sizeof(temp)); - } - } - } - if (!ok_so_far) - { - err_ctxt ec = { sent }; - err_msgc(&ec, lg_Error, "Error: Sentence not in dictionary\n%s", temp); - } - return ok_so_far; -} diff -Nru link-grammar-5.3.16/link-grammar/tokenize.h link-grammar-5.5.0/link-grammar/tokenize.h --- link-grammar-5.3.16/link-grammar/tokenize.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/tokenize.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,27 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _TOKENIZE_H -#define _TOKENIZE_H - -#include "api-types.h" - -bool separate_sentence(Sentence, Parse_Options); -bool sentence_in_dictionary(Sentence); -bool flatten_wordgraph(Sentence, Parse_Options); - -Gword *issue_word_alternative(Sentence sent, Gword *unsplit_word, - const char *label, - int prefnum, const char * const *prefix, - int stemnum, const char * const *stem, - int suffnum, const char * const *suffix); -#endif /* _TOKENIZE_H */ diff -Nru link-grammar-5.3.16/link-grammar/utilities.c link-grammar-5.5.0/link-grammar/utilities.c --- link-grammar-5.3.16/link-grammar/utilities.c 2017-01-30 02:44:10.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/utilities.c 2018-03-06 02:44:41.000000000 +0000 @@ -12,11 +12,16 @@ /*************************************************************************/ #include +#ifdef HAVE_POSIX_MEMALIGN +#include +#endif #include +#ifdef _WIN32 +#define _CRT_RAND_S +#endif /* _WIN32 */ #include #include #include -#include #include #include #ifdef HAVE_LOCALE_T_IN_XLOCALE_H @@ -24,37 +29,17 @@ #endif /* HAVE_LOCALE_T_IN_XLOCALE_H */ #ifndef _WIN32 - #include + // #include #include #else #include - #include /* For PathRemoveFileSpecA(). */ - #include /* For getcwd(). */ #endif /* _WIN32 */ -#include "string-set.h" -#include "structures.h" #include "utilities.h" -#ifdef _WIN32 - #define DIR_SEPARATOR "\\" -#else - #define DIR_SEPARATOR "/" -#endif /*_WIN32 */ - -#define IS_DIR_SEPARATOR(ch) (DIR_SEPARATOR[0] == (ch)) -#if !defined(DICTIONARY_DIR) || defined(__MINGW32__) - #define DEFAULTPATH NULL -#else - #define DEFAULTPATH DICTIONARY_DIR -#endif - -/* This file contains certain general utilities. 
*/ -int verbosity; -/* debug and test should not be NULL since they can be used before they - * are assigned a value by parse_options_get_...() */ -char * debug = (char *)""; -char * test = (char *)""; +/* This file contains general utilities that fix, enhance OS-provided + * API's, esp ones that the OS forgot to provide, or managed to break. + */ /* ============================================================= */ /* String utilities */ @@ -103,26 +88,13 @@ u[usize-1] = '\0'; } -/** - * Prints string `s`, aligned to the left, in a field width `w`. - * If the width of `s` is shorter than `w`, then the remainder of - * field is padded with blanks (on the right). - */ -void left_print_string(FILE * fp, const char * s, int w) -{ - int width = w + strlen(s) - utf8_strwidth(s); - fprintf(fp, "%-*s", width, s); -} - #ifndef HAVE_STRNDUP /* Emulates glibc's strndup() */ char * strndup (const char *str, size_t size) { size_t len; - char *result = (char *) NULL; - - if ((char *) NULL == str) return (char *) NULL; + char *result; len = strlen (str); if (!len) return strdup (""); @@ -135,61 +107,33 @@ } #endif /* !HAVE_STRNDUP */ -/* ============================================================= */ -/* UTF8 utilities */ - -/** Returns length of UTF8 character. - * Current algo is based on the first character only. - * If pointer is not pointing at first char, no not a valid value, returns 0. - * Returns 0 for NULL as well. - */ -int utf8_charlen(const char *xc) -{ - unsigned char c; - - c = (unsigned char) *xc; - - if (c == 0) return 0; - if (c < 0x80) return 1; - if ((c >= 0xc2) && (c < 0xe0)) return 2; /* First byte of a code point U +0080 - U +07FF */ - if ((c >= 0xe0) && (c < 0xf0)) return 3; /* First byte of a code point U +0800 - U +FFFF */ - if ((c >= 0xf0) && (c <= 0xf4)) return 4; /* First byte of a code point U +10000 - U +10FFFF */ - return -1; /* Fallthrough -- not the first byte of a code-point. 
*/ -} - -/* Implemented n wcwidth.c */ -extern int mk_wcwidth(wchar_t); - -/** - * Return the width, in text-column-widths, of the utf8-encoded - * string. This is needed when printing formatted strings. - * European langauges will typically have widths equal to the - * `mblen` value below (returned by mbsrtowcs); they occupy one - * column-width per code-point. The CJK ideographs occupy two - * column-widths per code-point. No clue about what happens for - * Arabic, or others. See wcwidth.c for details. +#ifndef HAVE_STRTOK_R +/* + * public domain strtok_r() by Charlie Gordon + * from comp.lang.c 9/14/2007 + * http://groups.google.com/group/comp.lang.c/msg/2ab1ecbb86646684 + * + * Declaration that it's public domain: + * http://groups.google.com/group/comp.lang.c/msg/7c7b39328fefab9c */ -size_t utf8_strwidth(const char *s) +char* strtok_r(char *str, const char *delim, char **nextp) { - mbstate_t mbss; - wchar_t ws[MAX_LINE]; - size_t mblen, glyph_width=0, i; + char *ret; - memset(&mbss, 0, sizeof(mbss)); + if (str == NULL) str = *nextp; + str += strspn(str, delim); + if (*str == '\0') return NULL; + ret = str; + str += strcspn(str, delim); + if (*str) *str++ = '\0'; + *nextp = str; -#ifdef _WIN32 - mblen = MultiByteToWideChar(CP_UTF8, 0, s, -1, ws, MAX_LINE) - 1; -#else - mblen = mbsrtowcs(ws, &s, MAX_LINE, &mbss); -#endif /* _WIN32 */ - - for (i=0; i". + */ +int rand_r(unsigned int *s) +{ + rand_s(s); + if (*s > INT_MAX) *s -= INT_MAX; + + return *s; +} #endif /* _WIN32 */ static int wctomb_check(char *s, wchar_t wc) @@ -242,7 +202,7 @@ * because the byte-counts might not match up, e.g. German ß and SS. * The correct long-term fix is to use ICU or glib g_utf8_strup(), etc. 
*/ -void downcase_utf8_str(char *to, const char * from, size_t usize, locale_t locale_t) +void downcase_utf8_str(char *to, const char * from, size_t usize, locale_t locale) { wchar_t c; int i, nbl, nbh; @@ -259,7 +219,7 @@ prt_error("Error: Invalid UTF-8 string!\n"); return; } - c = towlower_l(c, locale_t); + c = towlower_l(c, locale); nbl = wctomb_check(low, c); /* Check for error on an in-place copy */ @@ -280,6 +240,7 @@ safe_strcpy(to, from, usize-nbl); } +#if 0 /** * Upcase the first letter of the word. * XXX FIXME This works 'most of the time', but is not technically correct. @@ -287,7 +248,7 @@ * because the byte-counts might not match up, e.g. German ß and SS. * The correct long-term fix is to use ICU or glib g_utf8_strup(), etc. */ -void upcase_utf8_str(char *to, const char * from, size_t usize, locale_t locale_t) +void upcase_utf8_str(char *to, const char * from, size_t usize, locale_t locale) { wchar_t c; int i, nbl, nbh; @@ -301,7 +262,7 @@ prt_error("Error: Invalid UTF-8 string!\n"); return; } - c = towupper_l(c, locale_t); + c = towupper_l(c, locale); nbl = wctomb_check(low, c); /* Check for error on an in-place copy */ @@ -321,6 +282,22 @@ to += nbl; safe_strcpy(to, from, usize-nbl); } +#endif + +#ifdef NO_ALIGNED_MALLOC +#if __GNUC__ +#warning No aligned alloc found (using malloc() instead). +#endif +#endif /* NO_ALIGNED_MALLOC */ + +#ifdef HAVE_POSIX_MEMALIGN +void *aligned_alloc(size_t alignment, size_t size) +{ + void *ptr; + errno = posix_memalign(&ptr, alignment, size); + return ptr; +} +#endif /* HAVE_POSIX_MEMALIGN */ /* ============================================================= */ /* Memory alloc routines below. 
These routines attempt to keep @@ -487,6 +464,13 @@ free(ds); } +char * dyn_str_take(dyn_str* ds) +{ + char * rv = ds->str; + free(ds); + return rv; +} + void dyn_strcat(dyn_str* ds, const char *str) { size_t l = strlen(str); @@ -499,311 +483,21 @@ ds->end += l; } -/* =========================================================== */ -/* File path and dictionary open routines below */ - -char * join_path(const char * prefix, const char * suffix) +/// Trim away trailing whitespace. +void dyn_trimback(dyn_str* ds) { - char * path; - size_t path_len, prel; - - path_len = strlen(prefix) + 1 /* len(DIR_SEPARATOR) */ + strlen(suffix); - path = (char *) malloc(path_len + 1); - - strcpy(path, prefix); + size_t tail = ds->end; + while (0 < tail && ' ' == ds->str[--tail]) {} - /* Windows is allergic to multiple path separators, so append one - * only if the prefix isn't already terminated by a path sep. - */ - prel = strlen(path); - if (0 < prel && path[prel-1] != DIR_SEPARATOR[0]) - { - path[prel] = DIR_SEPARATOR[0]; - path[prel+1] = '\0'; - } - strcat(path, suffix); - - return path; + ds->end = ++tail; + ds->str[tail] = 0x0; } -/* global - but that's OK, since this is set only during initialization, - * and is is thenceforth a read-only item. So it doesn't need to be - * locked. - */ -static char * custom_data_dir = NULL; - -void dictionary_set_data_dir(const char * path) +const char * dyn_str_value(dyn_str* s) { - if (custom_data_dir) free (custom_data_dir); - custom_data_dir = safe_strdup(path); + return s->str; } -char * dictionary_get_data_dir(void) -{ - char * data_dir = NULL; - - if (custom_data_dir != NULL) { - data_dir = safe_strdup(custom_data_dir); - return data_dir; - } - -#ifdef _WIN32 - /* Dynamically locate invocation directory of our program. - * Non-ASCII characters are not supported (files will not be found). 
*/ - char prog_path[MAX_PATH_NAME]; - - if (!GetModuleFileNameA(NULL, prog_path, sizeof(prog_path))) - { - prt_error("Warning: GetModuleFileName error %d\n", (int)GetLastError()); - } - else - { - if (NULL == prog_path) - { - /* Can it happen? */ - prt_error("Warning: GetModuleFileName returned a NULL program path!\n"); - } - else - { - if (!PathRemoveFileSpecA(prog_path)) - { - prt_error("Warning: Cannot get directory from program path '%s'!\n", - prog_path); - } - else - { - /* Unconvertible characters are marked as '?' */ - const char *unsupported = (NULL != strchr(prog_path, '?')) ? - " (containing unsupported character)" : ""; - - lgdebug(D_USER_FILES, "Debug: Directory of executable: %s%s\n", - unsupported, prog_path); - data_dir = safe_strdup(prog_path); - } - } - } -#endif /* _WIN32 */ - - return data_dir; -} - -/** - * Locate a data file and open it. - * - * This function is used to open a dictionary file or a word file, - * or any associated data file (like a post process knowledge file). - * - * It works as follows. If the file name begins with a "/", then - * it's assumed to be an absolute file name and it tries to open - * that exact file. - * - * Otherwise, it looks for the file in a sequence of directories, as - * specified in the dictpath array, until it finds it. - * - * If it is still not found, it may be that the user specified a relative - * path, so it tries to open the exact file. - * - * Associated data files are looked in the *same* directory in which the - * first one was found (typically "en/4.0.dict"). The private static - * "path_found" serves as a directory path cache which records where the - * first file was found. The goal here is to avoid insanity due to - * user's fractured installs. - * If the filename argument is NULL, the function just invalidates this - * directory path cache. - */ -#define NOTFOUND(fp) ((NULL == (fp)) ? 
" (Not found)" : "") -void * object_open(const char *filename, - void * (*opencb)(const char *, const void *), - const void * user_data) -{ - static char *path_found; /* directory path cache */ - char *completename = NULL; - void *fp = NULL; - char *data_dir = NULL; - const char **path = NULL; - - if (NULL == filename) - { - /* Invalidate the directory path cache */ - free(path_found); - path_found = NULL; - return NULL; - } - - if (NULL == path_found) - { - data_dir = dictionary_get_data_dir(); - if (verbosity_level(D_USER_FILES)) - { - char cwd[MAX_PATH_NAME]; - char *cwdp = getcwd(cwd, sizeof(cwd)); - prt_error("Debug: Current directory: %s\n", NULL == cwdp ? "NULL": cwdp); - prt_error("Debug: Last-resort data directory: %s\n", - data_dir ? data_dir : "NULL"); - } - } - - /* Look for absolute filename. - * Unix: starts with leading slash. - * Windows: starts with C:\ except that the drive letter may differ. - * Note that only native windows C library uses backslashes; mingw - * seems to use forward-slash, from what I can tell. - */ - if ((filename[0] == '/') -#ifdef _WIN32 - || ((filename[1] == ':') - && ((filename[2] == '\\') || (filename[2] == '/'))) - || (filename[0] == '\\') /* UNC path */ -#endif /* _WIN32 */ - ) - { - /* opencb() returns NULL if the file does not exist. */ - fp = opencb(filename, user_data); - lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", filename, NOTFOUND(fp)); - } - else - { - /* A path list in which to search for dictionaries. - * path_found, data_dir or DEFAULTPATH may be NULL. */ - const char *dictpath[] = - { - path_found, - ".", - "." DIR_SEPARATOR "data", - "..", - ".." 
DIR_SEPARATOR "data", - data_dir, - DEFAULTPATH, - }; - size_t i = sizeof(dictpath)/sizeof(dictpath[0]); - - for (path = dictpath; i-- > 0; path++) - { - if (NULL == *path) continue; - - free(completename); - completename = join_path(*path, filename); - fp = opencb(completename, user_data); - lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", completename, NOTFOUND(fp)); - if ((NULL != fp) || (NULL != path_found)) break; - } - } - - if (NULL == fp) - { - fp = opencb(filename, user_data); - lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", filename, NOTFOUND(fp)); - } - else if (NULL == path_found) - { - size_t i; - - path_found = strdup((NULL != completename) ? completename : filename); - if (0 < verbosity) - prt_error("Info: Dictionary found at %s\n", path_found); - for (i = 0; i < 2; i++) - { - char *root = strrchr(path_found, DIR_SEPARATOR[0]); - if (NULL != root) *root = '\0'; - } - } - - free(data_dir); - free(completename); - return fp; -} -#undef NOTFOUND - -static void *dict_file_open(const char *fullname, const void *how) -{ - return fopen(fullname, how); -} - -FILE *dictopen(const char *filename, const char *how) -{ - return object_open(filename, dict_file_open, how); -} - -/* ======================================================== */ - -/** - * Check to see if a file exists. - */ -bool file_exists(const char * dict_name) -{ - bool retval = false; - int fd; - struct stat buf; - - /* On Windows, 'b' (binary mode) is mandatory, otherwise fstat file length - * is confused by crlf counted as one byte. POSIX systems just ignore it. */ - FILE *fp = dictopen(dict_name, "rb"); - - if (fp == NULL) - return false; - - /* Get the file size, in bytes. */ - fd = fileno(fp); - fstat(fd, &buf); - if (0 < buf.st_size) retval = true; - - fclose(fp); - return retval; -} - -/** - * Read in the whole stinkin file. This routine returns - * malloced memory, which should be freed as soon as possible. 
- */ -char *get_file_contents(const char * dict_name) -{ - int fd; - size_t tot_size; - int left; - struct stat buf; - char * contents, *p; - - /* On Windows, 'b' (binary mode) is mandatory, otherwise fstat file length - * is confused by crlf counted as one byte. POSIX systems just ignore it. */ - FILE *fp = dictopen(dict_name, "rb"); - - if (fp == NULL) - return NULL; - - /* Get the file size, in bytes. */ - fd = fileno(fp); - fstat(fd, &buf); - tot_size = buf.st_size; - - contents = (char *) malloc(sizeof(char) * (tot_size+7)); - - /* Now, read the whole file. */ - p = contents; - *p = '\0'; - left = tot_size + 7; - while (1) - { - char *rv = fgets(p, left, fp); - if (NULL == rv || feof(fp)) - break; - while (*p != '\0') { p++; left--; } - if (left < 0) - break; - } - - fclose(fp); - - if (left < 0) - { - prt_error("Error: File size is insane!\n"); - free(contents); - return NULL; - } - - return contents; -} - - /* ======================================================== */ /* Locale routines */ @@ -967,13 +661,6 @@ /* ============================================================= */ /* Alternatives utilities */ -static const char ** resize_alts(const char **arr, size_t len) -{ - arr = realloc(arr, (len+2) * sizeof(char *)); - arr[len+1] = NULL; - return arr; -} - size_t altlen(const char **arr) { size_t len = 0; @@ -982,14 +669,6 @@ return len; } -void altappend(Sentence sent, const char ***altp, const char *w) -{ - size_t n = altlen(*altp); - - *altp = resize_alts(*altp, n); - (*altp)[n] = string_set_add(w, sent->string_set); -} - /* ============================================================= */ #ifdef __MINGW32__ diff -Nru link-grammar-5.3.16/link-grammar/utilities.h link-grammar-5.5.0/link-grammar/utilities.h --- link-grammar-5.3.16/link-grammar/utilities.h 2017-01-19 18:44:36.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/utilities.h 2018-04-22 14:57:33.000000000 +0000 @@ -19,9 +19,6 @@ #include #include -#ifdef _WIN32 -#define _CRT_RAND_S -#endif /* 
_WIN32 */ #include #include #include @@ -33,7 +30,6 @@ #include "error.h" #include "lg_assert.h" -#include "wcwidth.h" #ifdef HAVE_ALLOCA_H # include @@ -62,11 +58,6 @@ #endif /* _MSC_VER */ #endif /* !TLS */ -#ifndef strdupa -/* In the following, the argument should not have side effects. */ -#define strdupa(s) strcpy(alloca(strlen(s)+1), s) -#endif - /* Windows, POSIX and GNU have different ideas about thread-safe strerror(). */ #ifdef _WIN32 #define strerror_r(errno, buf, len) strerror_s(buf, len, errno) @@ -90,6 +81,8 @@ #define vsnprintf _vsnprintf #endif +#define HAVE__ALIGNED_MALLOC 1 + /* Avoid plenty of: warning C4090: 'function': different 'const' qualifiers. * This happens, for example, when the argument is "const void **". */ #define free(x) free((void *)x) @@ -110,14 +103,22 @@ #ifndef strncasecmp #define strncasecmp(a,b,s) strnicmp((a),(b),(s)) #endif -/* Note that "#define _CRT_RAND_S" is needed before "#include " */ -#define rand_r(seedp) rand_s(seedp) +int rand_r(unsigned int *); +#ifndef __MINGW32__ +/* No strtok_s in XP/2003 and their strtok_r is incompatible. + * Hence HAVE_STRTOK_R will not be defined and our own one will be used. */ +#if _WINVER != 0x501 /* XP */ && _WINVER != 0x502 /* Server 2003 */ #define strtok_r strtok_s +#define HAVE_STRTOK_R +#endif /* _WINVER != XP|2003 */ + +/* There is no ssize_t definition in native Windows. */ +#include +typedef SSIZE_T ssize_t; /* Native windows has locale_t, and hence HAVE_LOCALE_T is defined here. * However, MinGW currently doesn't have locale_t. If/when it has locale_t, * "configure" will define HAVE_LOCALE_T for it. */ -#ifndef __MINGW32__ #define HAVE_LOCALE_T #endif @@ -147,41 +148,10 @@ #endif /* _WIN32 */ /* MSVC isspace asserts in debug mode, and mingw sometime returns true, - * when passed utf8. Thus, limit to 7 bits for windows. 
*/ -#ifdef _WIN32 - #define lg_isspace(c) ((0 < c) && (c < 127) && isspace(c)) -#else - #define lg_isspace isspace -#endif - -#if __APPLE__ -/* It appears that fgetc on Mac OS 10.11.3 "El Capitan" has a weird - * or broken version of fgetc() that flubs reads of utf8 chars when - * the locale is not set to "C" -- in particular, it fails for the - * en_US.utf8 locale; see bug report #293 - * https://github.com/opencog/link-grammar/issues/293 - */ -static inline int lg_fgetc(FILE *stream) -{ - char c[4]; /* general overflow paranoia */ - size_t nr = fread(c, 1, 1, stream); - if (0 == nr) return EOF; - return (int) c[0]; -} - -static inline int lg_ungetc(int c, FILE *stream) -{ - /* This should work, because we never unget past the newline char. */ - int rc = fseek(stream, -1, SEEK_CUR); - if (rc) return EOF; - return c; -} - -#else -#define lg_fgetc fgetc -#define lg_ungetc ungetc -#endif - + * when passed utf8. OSX returns TRUE on char values 0x85 and 0xa0). + * Since it is defined to return TRUE only on 6 characters, all of which + * are in the range [0..127], just limit its arguments to 7 bits. */ +#define lg_isspace(c) ((0 < c) && (c < 127) && isspace(c)) #if defined(__sun__) int strncasecmp(const char *s1, const char *s2, size_t n); @@ -203,6 +173,29 @@ #define freelocale(l) #endif /* HAVE_LOCALE_T */ +#if HAVE__ALIGNED_MALLOC +#define aligned_alloc(alignment, size) _aligned_malloc (size, alignment) +#define aligned_free(p) _aligned_free(p) +#undef HAVE_POSIX_MEMALIGN + +#elif HAVE_ALIGNED_ALLOC +#define aligned_free(p) free(p) +#undef HAVE_POSIX_MEMALIGN + +#elif HAVE_POSIX_MEMALIGN +/* aligned_alloc() emulation will be defined in utilities.c. */ +void *aligned_alloc(size_t alignment, size_t size); +#define aligned_free(p) free(p) + +#else +/* Fallback to just malloc(), as alignment is not critical here. */ +#define NO_ALIGNED_MALLOC /* For generating a warning in utilities.c. 
*/ +#define aligned_alloc(alignment, size) malloc(size) +#define aligned_free(p) free(p) +#endif /* HAVE__ALIGNED_MALLOC */ + +#define ALIGN(size, alignment) (((size)+(alignment-1))&~(alignment-1)) + #define STR(x) #x #define STRINGIFY(x) STR(x) @@ -217,12 +210,28 @@ #define MAX(X,Y) ( ((X) > (Y)) ? (X) : (Y)) #endif +/* In the following, the arguments should not have side effects. + * FIXME: Detect in "configure" and check HAVE_* */ +#ifndef strdupa +#define strdupa(s) strcpy(alloca(strlen(s)+1), s) +#endif +#ifndef strndupa +#define strndupa(s, n) _strndupa3(alloca((n)+1), s, n) +static inline char *_strndupa3(char *new_s, const char *s, size_t n) +{ + strncpy(new_s, s, n); + new_s[n] = '\0'; + + return new_s; +} +#endif + /* From ccan array_size.h and build_assert.h, which are under a CC0 license */ #define BUILD_ASSERT_OR_ZERO(cond) (sizeof(char [1 - 2*!(cond)]) - 1) #if !defined(ARRAY_SIZE) /** - * ARRAY_SIZE - get the number of elements in a visible array - * @arr: the array whose size you want. + * ARRAY_SIZE: Get the number of elements in a visible array + * @param arr The array whose size you want. * * This does not work on pointers, or arrays declared as [], or * function parameters. With correct compiler support, such usage @@ -241,20 +250,48 @@ #endif #endif /* !defined(ARRAY_SIZE) */ +/* The GCC version we need must be >= 4.7, because it has to + * support C11. So it already supports all the features below. */ + /* Optimizations etc. that only gcc understands */ -#if __GNUC__ > 2 +#if __GNUC__ +#define GCC_DIAGNOSTIC +#define UNREACHABLE(x) (__extension__ ({if (x) __builtin_unreachable();})) #define GNUC_MALLOC __attribute__ ((malloc)) #define GNUC_UNUSED __attribute__ ((unused)) #else +#define UNREACHABLE(x) #define GNUC_MALLOC #define GNUC_UNUSED #endif -/** - * Return the width, in text-column-widths, of the utf8-encoded - * string. This is needed when printing formatted strings. 
- */ -size_t utf8_strwidth(const char *); + +/* Apply a pragma to a specific code section only. + * XXX According to the GCC docs, we cannot use here something like + * "#ifdef HAVE_x". Also -Wunknown-pragmas & -Wno-unknown-warning-option + * don't work in this situation. So "-Wmaybe-uninitialized", which + * is not recognized by clang, is defined separately. */ +#ifdef GCC_DIAGNOSTIC + +#ifdef HAVE_MAYBE_UNINITIALIZED +#define PRAGMA_MAYBE_UNINITIALIZED \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wmaybe-uninitialized\"") +#else +#define PRAGMA_MAYBE_UNINITIALIZED \ + _Pragma("GCC diagnostic push") +#endif /* HAVE_MAYBE_UNINITIALIZED */ + +#define PRAGMA_START(x) \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") \ + _Pragma(#x) +#define PRAGMA_END _Pragma("GCC diagnostic pop") +#else +#define PRAGMA_START(x) +#define PRAGMA_END +#define PRAGMA_MAYBE_UNINITIALIZED +#endif /* GCC_DIAGNOSTIC */ /** * Return the length, in codepoints/glyphs, of the utf8-encoded @@ -265,55 +302,31 @@ { mbstate_t mbss; memset(&mbss, 0, sizeof(mbss)); -#if defined(_MSC_VER) || defined(__MINGW32__) +#if _WIN32 return MultiByteToWideChar(CP_UTF8, 0, s, -1, NULL, 0)-1; #else return mbsrtowcs(NULL, &s, 0, &mbss); -#endif -} - -/** - * Return the distance, in bytes, to the next character, in the - * input utf8-encoded string. - */ -static inline size_t utf8_next(const char *s) -{ -#ifdef _WIN32 - /* mbrlen does not work correctly on Windows. See issue #285 */ - /* https://github.com/opencog/link-grammar/issues/285 */ - size_t len = 0; - while (0 != *s) - { - if ((0x80 <= ((unsigned char) *s)) && - (((unsigned char) *s) < 0xc0)) { s++; len++; } - else return len+1; - } - return len; -#else - size_t len; - mbstate_t mbs; - memset(&mbs, 0, sizeof(mbs)); - len = mbrlen(s, MB_CUR_MAX, &mbs); - if (len == (size_t)(-1) || len == (size_t)(-2)) { - /* Too long or malformed sequence, step one byte. 
*/ - return 1; - } - return len; #endif /* _WIN32 */ } -/** - * Return the length, in codepoints/glyphs, of the utf8-encoded - * string. The string is assumed to be at least `len` code-points - * long. This is needed when splitting words into morphemes. +/** Returns length of UTF8 character. + * Current algo is based on the first character only. + * If pointer is not pointing at first char, or not a valid value, returns -1. + * Returns 0 for NULL. */ -static inline size_t utf8_strnlen(const char *s, size_t len) +static inline int utf8_charlen(const char *xc) { - size_t by = 0; - while (0 < len) { by += utf8_next(&s[by]); } - return by; -} + unsigned char c; + + c = (unsigned char) *xc; + if (c == 0) return 0; + if (c < 0x80) return 1; + if ((c >= 0xc2) && (c < 0xe0)) return 2; /* First byte of a code point U +0080 - U +07FF */ + if ((c >= 0xe0) && (c < 0xf0)) return 3; /* First byte of a code point U +0800 - U +FFFF */ + if ((c >= 0xf0) && (c <= 0xf4)) return 4; /* First byte of a code point U +10000 - U +10FFFF */ + return -1; /* Fallthrough -- not the first byte of a code-point. */ +} /** * Copy `n` utf8 characters from `src` to `dest`. @@ -325,7 +338,8 @@ size_t b = 0; while (0 < n) { - size_t k = utf8_next(src); + size_t k = utf8_charlen(src); + if (0 > (ssize_t)k) return 0; /* XXX Maybe print error. */ b += k; while (0 < k) { *dest = *src; dest++; src++; k--; } n--; @@ -436,8 +450,9 @@ #endif /* Not in use. */ void downcase_utf8_str(char *to, const char * from, size_t usize, locale_t); +#if 0 void upcase_utf8_str(char *to, const char * from, size_t usize, locale_t); -int utf8_charlen(const char *); +#endif size_t lg_strlcpy(char * dest, const char *src, size_t size); void safe_strcpy(char *u, const char * v, size_t usize); @@ -447,17 +462,19 @@ /* Simple, cheap, easy dynamic string. 
*/ typedef struct { - char *str; - size_t end; - size_t len; + char *str; + size_t end; + size_t len; } dyn_str; dyn_str* dyn_str_new(void); void dyn_str_delete(dyn_str*); void dyn_strcat(dyn_str*, const char*); +void dyn_trimback(dyn_str*); +char * dyn_str_take(dyn_str*); +const char * dyn_str_value(dyn_str*); size_t altlen(const char **); -void altappend(Sentence, const char ***, const char *); /* routines for allocating basic objects */ void init_memusage(void); @@ -478,26 +495,17 @@ char * get_default_locale(void); -char * join_path(const char * prefix, const char * suffix); - -FILE * dictopen(const char *filename, const char *how); -void * object_open(const char *filename, - void * (*opencb)(const char *, const void *), - const void * user_data); - -bool file_exists(const char * dict_name); -char * get_file_contents(const char *filename); void set_utf8_program_locale(void); bool try_locale(const char *); /** * Returns the smallest power of two that is at least i and at least 1 */ -static inline unsigned int next_power_of_two_up(unsigned int i) +static inline size_t next_power_of_two_up(size_t i) { - unsigned int j=1; - while (j - * Copyright (C) Fredrik Fornwall 2016. - * Copyright (c) 2017 Linas Vepstas - updated for Unicode 9. 
- * - * Original version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c - * - * Updated with latest Unicode-9 tables from - * https://github.com/termux/wcwidth/blob/master/wcwidth.c - */ - -#include -#include -#include "wcwidth.h" - -struct interval { - int first; - int last; -}; - -/* auxiliary function for binary search in interval table */ -static bool bisearch(wchar_t ucs, const struct interval *table, int max) -{ - int min = 0; - int mid; - - if (ucs < table[0].first || ucs > table[max].last) - return false; - while (max >= min) { - mid = (min + max) / 2; - if (ucs > table[mid].last) - min = mid + 1; - else if (ucs < table[mid].first) - max = mid - 1; - else - return true; - } - - return false; -} - - -/* The following two functions define the column width of an ISO 10646 - * character as follows: - * - * - The null character (U+0000) has a column width of 0. - * - * - Other C0/C1 control characters and DEL will lead to a return - * value of -1. - * - * - Non-spacing and enclosing combining characters (general - * category code Mn or Me in the Unicode database) have a - * column width of 0. - * - * - SOFT HYPHEN (U+00AD) has a column width of 1. - * - * - Other format characters (general category code Cf in the Unicode - * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0. - * - * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) - * have a column width of 0. - * - * - Spacing characters in the East Asian Wide (W) or East Asian - * Full-width (F) category as defined in Unicode Technical - * Report #11 have a column width of 2. - * - * - All remaining characters (including all printable - * ISO 8859-1 and WGL4 characters, Unicode control characters, - * etc.) have a column width of 1. - * - * This implementation assumes that wchar_t characters are encoded - * in ISO 10646. 
- */ - -int mk_wcwidth(wchar_t ucs) -{ - /* sorted list of non-overlapping intervals of non-spacing characters */ - /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ - - // From https://github.com/jquast/wcwidth/blob/master/wcwidth/table_zero.py - // at commit 0d7de112202cc8b2ebe9232ff4a5c954f19d561a (2016-07-02): - // Original source: DerivedGeneralCategory-9.0.0.txt - static const struct interval combining[] = { - {0x0300, 0x036f}, // Combining Grave Accent ..Combining Latin Small Le - {0x0483, 0x0489}, // Combining Cyrillic Titlo..Combining Cyrillic Milli - {0x0591, 0x05bd}, // Hebrew Accent Etnahta ..Hebrew Point Meteg - {0x05bf, 0x05bf}, // Hebrew Point Rafe ..Hebrew Point Rafe - {0x05c1, 0x05c2}, // Hebrew Point Shin Dot ..Hebrew Point Sin Dot - {0x05c4, 0x05c5}, // Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot - {0x05c7, 0x05c7}, // Hebrew Point Qamats Qata..Hebrew Point Qamats Qata - {0x0610, 0x061a}, // Arabic Sign Sallallahou ..Arabic Small Kasra - {0x064b, 0x065f}, // Arabic Fathatan ..Arabic Wavy Hamza Below - {0x0670, 0x0670}, // Arabic Letter Superscrip..Arabic Letter Superscrip - {0x06d6, 0x06dc}, // Arabic Small High Ligatu..Arabic Small High Seen - {0x06df, 0x06e4}, // Arabic Small High Rounde..Arabic Small High Madda - {0x06e7, 0x06e8}, // Arabic Small High Yeh ..Arabic Small High Noon - {0x06ea, 0x06ed}, // Arabic Empty Centre Low ..Arabic Small Low Meem - {0x0711, 0x0711}, // Syriac Letter Superscrip..Syriac Letter Superscrip - {0x0730, 0x074a}, // Syriac Pthaha Above ..Syriac Barrekh - {0x07a6, 0x07b0}, // Thaana Abafili ..Thaana Sukun - {0x07eb, 0x07f3}, // Nko Combining Sh||t High..Nko Combining Double Dot - {0x0816, 0x0819}, // Samaritan Mark In ..Samaritan Mark Dagesh - {0x081b, 0x0823}, // Samaritan Mark Epentheti..Samaritan Vowel Sign A - {0x0825, 0x0827}, // Samaritan Vowel Sign Sho..Samaritan Vowel Sign U - {0x0829, 0x082d}, // Samaritan Vowel Sign Lon..Samaritan Mark Nequdaa - {0x0859, 0x085b}, // Mandaic 
Affrication Mark..Mandaic Gemination Mark - {0x08d4, 0x08e1}, // (nil) .. - {0x08e3, 0x0902}, // Arabic Turned Damma Belo..Devanagari Sign Anusvara - {0x093a, 0x093a}, // Devanagari Vowel Sign Oe..Devanagari Vowel Sign Oe - {0x093c, 0x093c}, // Devanagari Sign Nukta ..Devanagari Sign Nukta - {0x0941, 0x0948}, // Devanagari Vowel Sign U ..Devanagari Vowel Sign Ai - {0x094d, 0x094d}, // Devanagari Sign Virama ..Devanagari Sign Virama - {0x0951, 0x0957}, // Devanagari Stress Sign U..Devanagari Vowel Sign Uu - {0x0962, 0x0963}, // Devanagari Vowel Sign Vo..Devanagari Vowel Sign Vo - {0x0981, 0x0981}, // Bengali Sign Candrabindu..Bengali Sign Candrabindu - {0x09bc, 0x09bc}, // Bengali Sign Nukta ..Bengali Sign Nukta - {0x09c1, 0x09c4}, // Bengali Vowel Sign U ..Bengali Vowel Sign Vocal - {0x09cd, 0x09cd}, // Bengali Sign Virama ..Bengali Sign Virama - {0x09e2, 0x09e3}, // Bengali Vowel Sign Vocal..Bengali Vowel Sign Vocal - {0x0a01, 0x0a02}, // Gurmukhi Sign Adak Bindi..Gurmukhi Sign Bindi - {0x0a3c, 0x0a3c}, // Gurmukhi Sign Nukta ..Gurmukhi Sign Nukta - {0x0a41, 0x0a42}, // Gurmukhi Vowel Sign U ..Gurmukhi Vowel Sign Uu - {0x0a47, 0x0a48}, // Gurmukhi Vowel Sign Ee ..Gurmukhi Vowel Sign Ai - {0x0a4b, 0x0a4d}, // Gurmukhi Vowel Sign Oo ..Gurmukhi Sign Virama - {0x0a51, 0x0a51}, // Gurmukhi Sign Udaat ..Gurmukhi Sign Udaat - {0x0a70, 0x0a71}, // Gurmukhi Tippi ..Gurmukhi Addak - {0x0a75, 0x0a75}, // Gurmukhi Sign Yakash ..Gurmukhi Sign Yakash - {0x0a81, 0x0a82}, // Gujarati Sign Candrabind..Gujarati Sign Anusvara - {0x0abc, 0x0abc}, // Gujarati Sign Nukta ..Gujarati Sign Nukta - {0x0ac1, 0x0ac5}, // Gujarati Vowel Sign U ..Gujarati Vowel Sign Cand - {0x0ac7, 0x0ac8}, // Gujarati Vowel Sign E ..Gujarati Vowel Sign Ai - {0x0acd, 0x0acd}, // Gujarati Sign Virama ..Gujarati Sign Virama - {0x0ae2, 0x0ae3}, // Gujarati Vowel Sign Voca..Gujarati Vowel Sign Voca - {0x0b01, 0x0b01}, // ||iya Sign Candrabindu ..||iya Sign Candrabindu - {0x0b3c, 0x0b3c}, // ||iya Sign Nukta 
..||iya Sign Nukta - {0x0b3f, 0x0b3f}, // ||iya Vowel Sign I ..||iya Vowel Sign I - {0x0b41, 0x0b44}, // ||iya Vowel Sign U ..||iya Vowel Sign Vocalic - {0x0b4d, 0x0b4d}, // ||iya Sign Virama ..||iya Sign Virama - {0x0b56, 0x0b56}, // ||iya Ai Length Mark ..||iya Ai Length Mark - {0x0b62, 0x0b63}, // ||iya Vowel Sign Vocalic..||iya Vowel Sign Vocalic - {0x0b82, 0x0b82}, // Tamil Sign Anusvara ..Tamil Sign Anusvara - {0x0bc0, 0x0bc0}, // Tamil Vowel Sign Ii ..Tamil Vowel Sign Ii - {0x0bcd, 0x0bcd}, // Tamil Sign Virama ..Tamil Sign Virama - {0x0c00, 0x0c00}, // Telugu Sign Combining Ca..Telugu Sign Combining Ca - {0x0c3e, 0x0c40}, // Telugu Vowel Sign Aa ..Telugu Vowel Sign Ii - {0x0c46, 0x0c48}, // Telugu Vowel Sign E ..Telugu Vowel Sign Ai - {0x0c4a, 0x0c4d}, // Telugu Vowel Sign O ..Telugu Sign Virama - {0x0c55, 0x0c56}, // Telugu Length Mark ..Telugu Ai Length Mark - {0x0c62, 0x0c63}, // Telugu Vowel Sign Vocali..Telugu Vowel Sign Vocali - {0x0c81, 0x0c81}, // Kannada Sign Candrabindu..Kannada Sign Candrabindu - {0x0cbc, 0x0cbc}, // Kannada Sign Nukta ..Kannada Sign Nukta - {0x0cbf, 0x0cbf}, // Kannada Vowel Sign I ..Kannada Vowel Sign I - {0x0cc6, 0x0cc6}, // Kannada Vowel Sign E ..Kannada Vowel Sign E - {0x0ccc, 0x0ccd}, // Kannada Vowel Sign Au ..Kannada Sign Virama - {0x0ce2, 0x0ce3}, // Kannada Vowel Sign Vocal..Kannada Vowel Sign Vocal - {0x0d01, 0x0d01}, // Malayalam Sign Candrabin..Malayalam Sign Candrabin - {0x0d41, 0x0d44}, // Malayalam Vowel Sign U ..Malayalam Vowel Sign Voc - {0x0d4d, 0x0d4d}, // Malayalam Sign Virama ..Malayalam Sign Virama - {0x0d62, 0x0d63}, // Malayalam Vowel Sign Voc..Malayalam Vowel Sign Voc - {0x0dca, 0x0dca}, // Sinhala Sign Al-lakuna ..Sinhala Sign Al-lakuna - {0x0dd2, 0x0dd4}, // Sinhala Vowel Sign Ketti..Sinhala Vowel Sign Ketti - {0x0dd6, 0x0dd6}, // Sinhala Vowel Sign Diga ..Sinhala Vowel Sign Diga - {0x0e31, 0x0e31}, // Thai Character Mai Han-a..Thai Character Mai Han-a - {0x0e34, 0x0e3a}, // Thai Character Sara I 
..Thai Character Phinthu - {0x0e47, 0x0e4e}, // Thai Character Maitaikhu..Thai Character Yamakkan - {0x0eb1, 0x0eb1}, // Lao Vowel Sign Mai Kan ..Lao Vowel Sign Mai Kan - {0x0eb4, 0x0eb9}, // Lao Vowel Sign I ..Lao Vowel Sign Uu - {0x0ebb, 0x0ebc}, // Lao Vowel Sign Mai Kon ..Lao Semivowel Sign Lo - {0x0ec8, 0x0ecd}, // Lao Tone Mai Ek ..Lao Niggahita - {0x0f18, 0x0f19}, // Tibetan Astrological Sig..Tibetan Astrological Sig - {0x0f35, 0x0f35}, // Tibetan Mark Ngas Bzung ..Tibetan Mark Ngas Bzung - {0x0f37, 0x0f37}, // Tibetan Mark Ngas Bzung ..Tibetan Mark Ngas Bzung - {0x0f39, 0x0f39}, // Tibetan Mark Tsa -phru ..Tibetan Mark Tsa -phru - {0x0f71, 0x0f7e}, // Tibetan Vowel Sign Aa ..Tibetan Sign Rjes Su Nga - {0x0f80, 0x0f84}, // Tibetan Vowel Sign Rever..Tibetan Mark Halanta - {0x0f86, 0x0f87}, // Tibetan Sign Lci Rtags ..Tibetan Sign Yang Rtags - {0x0f8d, 0x0f97}, // Tibetan Subjoined Sign L..Tibetan Subjoined Letter - {0x0f99, 0x0fbc}, // Tibetan Subjoined Letter..Tibetan Subjoined Letter - {0x0fc6, 0x0fc6}, // Tibetan Symbol Padma Gda..Tibetan Symbol Padma Gda - {0x102d, 0x1030}, // Myanmar Vowel Sign I ..Myanmar Vowel Sign Uu - {0x1032, 0x1037}, // Myanmar Vowel Sign Ai ..Myanmar Sign Dot Below - {0x1039, 0x103a}, // Myanmar Sign Virama ..Myanmar Sign Asat - {0x103d, 0x103e}, // Myanmar Consonant Sign M..Myanmar Consonant Sign M - {0x1058, 0x1059}, // Myanmar Vowel Sign Vocal..Myanmar Vowel Sign Vocal - {0x105e, 0x1060}, // Myanmar Consonant Sign M..Myanmar Consonant Sign M - {0x1071, 0x1074}, // Myanmar Vowel Sign Geba ..Myanmar Vowel Sign Kayah - {0x1082, 0x1082}, // Myanmar Consonant Sign S..Myanmar Consonant Sign S - {0x1085, 0x1086}, // Myanmar Vowel Sign Shan ..Myanmar Vowel Sign Shan - {0x108d, 0x108d}, // Myanmar Sign Shan Counci..Myanmar Sign Shan Counci - {0x109d, 0x109d}, // Myanmar Vowel Sign Aiton..Myanmar Vowel Sign Aiton - {0x135d, 0x135f}, // Ethiopic Combining Gemin..Ethiopic Combining Gemin - {0x1712, 0x1714}, // Tagalog Vowel Sign I 
..Tagalog Sign Virama - {0x1732, 0x1734}, // Hanunoo Vowel Sign I ..Hanunoo Sign Pamudpod - {0x1752, 0x1753}, // Buhid Vowel Sign I ..Buhid Vowel Sign U - {0x1772, 0x1773}, // Tagbanwa Vowel Sign I ..Tagbanwa Vowel Sign U - {0x17b4, 0x17b5}, // Khmer Vowel Inherent Aq ..Khmer Vowel Inherent Aa - {0x17b7, 0x17bd}, // Khmer Vowel Sign I ..Khmer Vowel Sign Ua - {0x17c6, 0x17c6}, // Khmer Sign Nikahit ..Khmer Sign Nikahit - {0x17c9, 0x17d3}, // Khmer Sign Muusikatoan ..Khmer Sign Bathamasat - {0x17dd, 0x17dd}, // Khmer Sign Atthacan ..Khmer Sign Atthacan - {0x180b, 0x180d}, // Mongolian Free Variation..Mongolian Free Variation - {0x1885, 0x1886}, // Mongolian Letter Ali Gal..Mongolian Letter Ali Gal - {0x18a9, 0x18a9}, // Mongolian Letter Ali Gal..Mongolian Letter Ali Gal - {0x1920, 0x1922}, // Limbu Vowel Sign A ..Limbu Vowel Sign U - {0x1927, 0x1928}, // Limbu Vowel Sign E ..Limbu Vowel Sign O - {0x1932, 0x1932}, // Limbu Small Letter Anusv..Limbu Small Letter Anusv - {0x1939, 0x193b}, // Limbu Sign Mukphreng ..Limbu Sign Sa-i - {0x1a17, 0x1a18}, // Buginese Vowel Sign I ..Buginese Vowel Sign U - {0x1a1b, 0x1a1b}, // Buginese Vowel Sign Ae ..Buginese Vowel Sign Ae - {0x1a56, 0x1a56}, // Tai Tham Consonant Sign ..Tai Tham Consonant Sign - {0x1a58, 0x1a5e}, // Tai Tham Sign Mai Kang L..Tai Tham Consonant Sign - {0x1a60, 0x1a60}, // Tai Tham Sign Sakot ..Tai Tham Sign Sakot - {0x1a62, 0x1a62}, // Tai Tham Vowel Sign Mai ..Tai Tham Vowel Sign Mai - {0x1a65, 0x1a6c}, // Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B - {0x1a73, 0x1a7c}, // Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue - {0x1a7f, 0x1a7f}, // Tai Tham Combining Crypt..Tai Tham Combining Crypt - {0x1ab0, 0x1abe}, // Combining Doubled Circum..Combining Parentheses Ov - {0x1b00, 0x1b03}, // Balinese Sign Ulu Ricem ..Balinese Sign Surang - {0x1b34, 0x1b34}, // Balinese Sign Rerekan ..Balinese Sign Rerekan - {0x1b36, 0x1b3a}, // Balinese Vowel Sign Ulu ..Balinese Vowel Sign Ra R - {0x1b3c, 0x1b3c}, // 
Balinese Vowel Sign La L..Balinese Vowel Sign La L - {0x1b42, 0x1b42}, // Balinese Vowel Sign Pepe..Balinese Vowel Sign Pepe - {0x1b6b, 0x1b73}, // Balinese Musical Symbol ..Balinese Musical Symbol - {0x1b80, 0x1b81}, // Sundanese Sign Panyecek ..Sundanese Sign Panglayar - {0x1ba2, 0x1ba5}, // Sundanese Consonant Sign..Sundanese Vowel Sign Pan - {0x1ba8, 0x1ba9}, // Sundanese Vowel Sign Pam..Sundanese Vowel Sign Pan - {0x1bab, 0x1bad}, // Sundanese Sign Virama ..Sundanese Consonant Sign - {0x1be6, 0x1be6}, // Batak Sign Tompi ..Batak Sign Tompi - {0x1be8, 0x1be9}, // Batak Vowel Sign Pakpak ..Batak Vowel Sign Ee - {0x1bed, 0x1bed}, // Batak Vowel Sign Karo O ..Batak Vowel Sign Karo O - {0x1bef, 0x1bf1}, // Batak Vowel Sign U F|| S..Batak Consonant Sign H - {0x1c2c, 0x1c33}, // Lepcha Vowel Sign E ..Lepcha Consonant Sign T - {0x1c36, 0x1c37}, // Lepcha Sign Ran ..Lepcha Sign Nukta - {0x1cd0, 0x1cd2}, // Vedic Tone Karshana ..Vedic Tone Prenkha - {0x1cd4, 0x1ce0}, // Vedic Sign Yajurvedic Mi..Vedic Tone Rigvedic Kash - {0x1ce2, 0x1ce8}, // Vedic Sign Visarga Svari..Vedic Sign Visarga Anuda - {0x1ced, 0x1ced}, // Vedic Sign Tiryak ..Vedic Sign Tiryak - {0x1cf4, 0x1cf4}, // Vedic Tone Candra Above ..Vedic Tone Candra Above - {0x1cf8, 0x1cf9}, // Vedic Tone Ring Above ..Vedic Tone Double Ring A - {0x1dc0, 0x1df5}, // Combining Dotted Grave A..Combining Up Tack Above - {0x1dfb, 0x1dff}, // (nil) ..Combining Right Arrowhea - {0x20d0, 0x20f0}, // Combining Left Harpoon A..Combining Asterisk Above - {0x2cef, 0x2cf1}, // Coptic Combining Ni Abov..Coptic Combining Spiritu - {0x2d7f, 0x2d7f}, // Tifinagh Consonant Joine..Tifinagh Consonant Joine - {0x2de0, 0x2dff}, // Combining Cyrillic Lette..Combining Cyrillic Lette - {0x302a, 0x302d}, // Ideographic Level Tone M..Ideographic Entering Ton - {0x3099, 0x309a}, // Combining Katakana-hirag..Combining Katakana-hirag - {0xa66f, 0xa672}, // Combining Cyrillic Vzmet..Combining Cyrillic Thous - {0xa674, 0xa67d}, // Combining Cyrillic 
Lette..Combining Cyrillic Payer - {0xa69e, 0xa69f}, // Combining Cyrillic Lette..Combining Cyrillic Lette - {0xa6f0, 0xa6f1}, // Bamum Combining Mark Koq..Bamum Combining Mark Tuk - {0xa802, 0xa802}, // Syloti Nagri Sign Dvisva..Syloti Nagri Sign Dvisva - {0xa806, 0xa806}, // Syloti Nagri Sign Hasant..Syloti Nagri Sign Hasant - {0xa80b, 0xa80b}, // Syloti Nagri Sign Anusva..Syloti Nagri Sign Anusva - {0xa825, 0xa826}, // Syloti Nagri Vowel Sign ..Syloti Nagri Vowel Sign - {0xa8c4, 0xa8c5}, // Saurashtra Sign Virama .. - {0xa8e0, 0xa8f1}, // Combining Devanagari Dig..Combining Devanagari Sig - {0xa926, 0xa92d}, // Kayah Li Vowel Ue ..Kayah Li Tone Calya Plop - {0xa947, 0xa951}, // Rejang Vowel Sign I ..Rejang Consonant Sign R - {0xa980, 0xa982}, // Javanese Sign Panyangga ..Javanese Sign Layar - {0xa9b3, 0xa9b3}, // Javanese Sign Cecak Telu..Javanese Sign Cecak Telu - {0xa9b6, 0xa9b9}, // Javanese Vowel Sign Wulu..Javanese Vowel Sign Suku - {0xa9bc, 0xa9bc}, // Javanese Vowel Sign Pepe..Javanese Vowel Sign Pepe - {0xa9e5, 0xa9e5}, // Myanmar Sign Shan Saw ..Myanmar Sign Shan Saw - {0xaa29, 0xaa2e}, // Cham Vowel Sign Aa ..Cham Vowel Sign Oe - {0xaa31, 0xaa32}, // Cham Vowel Sign Au ..Cham Vowel Sign Ue - {0xaa35, 0xaa36}, // Cham Consonant Sign La ..Cham Consonant Sign Wa - {0xaa43, 0xaa43}, // Cham Consonant Sign Fina..Cham Consonant Sign Fina - {0xaa4c, 0xaa4c}, // Cham Consonant Sign Fina..Cham Consonant Sign Fina - {0xaa7c, 0xaa7c}, // Myanmar Sign Tai Laing T..Myanmar Sign Tai Laing T - {0xaab0, 0xaab0}, // Tai Viet Mai Kang ..Tai Viet Mai Kang - {0xaab2, 0xaab4}, // Tai Viet Vowel I ..Tai Viet Vowel U - {0xaab7, 0xaab8}, // Tai Viet Mai Khit ..Tai Viet Vowel Ia - {0xaabe, 0xaabf}, // Tai Viet Vowel Am ..Tai Viet Tone Mai Ek - {0xaac1, 0xaac1}, // Tai Viet Tone Mai Tho ..Tai Viet Tone Mai Tho - {0xaaec, 0xaaed}, // Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign - {0xaaf6, 0xaaf6}, // Meetei Mayek Virama ..Meetei Mayek Virama - {0xabe5, 0xabe5}, // Meetei 
Mayek Vowel Sign ..Meetei Mayek Vowel Sign - {0xabe8, 0xabe8}, // Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign - {0xabed, 0xabed}, // Meetei Mayek Apun Iyek ..Meetei Mayek Apun Iyek - {0xfb1e, 0xfb1e}, // Hebrew Point Judeo-spani..Hebrew Point Judeo-spani - {0xfe00, 0xfe0f}, // Variation Select||-1 ..Variation Select||-16 - {0xfe20, 0xfe2f}, // Combining Ligature Left ..Combining Cyrillic Titlo - {0x101fd, 0x101fd}, // Phaistos Disc Sign Combi..Phaistos Disc Sign Combi - {0x102e0, 0x102e0}, // Coptic Epact Thousands M..Coptic Epact Thousands M - {0x10376, 0x1037a}, // Combining Old Permic Let..Combining Old Permic Let - {0x10a01, 0x10a03}, // Kharoshthi Vowel Sign I ..Kharoshthi Vowel Sign Vo - {0x10a05, 0x10a06}, // Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O - {0x10a0c, 0x10a0f}, // Kharoshthi Vowel Length ..Kharoshthi Sign Visarga - {0x10a38, 0x10a3a}, // Kharoshthi Sign Bar Abov..Kharoshthi Sign Dot Belo - {0x10a3f, 0x10a3f}, // Kharoshthi Virama ..Kharoshthi Virama - {0x10ae5, 0x10ae6}, // Manichaean Abbreviation ..Manichaean Abbreviation - {0x11001, 0x11001}, // Brahmi Sign Anusvara ..Brahmi Sign Anusvara - {0x11038, 0x11046}, // Brahmi Vowel Sign Aa ..Brahmi Virama - {0x1107f, 0x11081}, // Brahmi Number Joiner ..Kaithi Sign Anusvara - {0x110b3, 0x110b6}, // Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai - {0x110b9, 0x110ba}, // Kaithi Sign Virama ..Kaithi Sign Nukta - {0x11100, 0x11102}, // Chakma Sign Candrabindu ..Chakma Sign Visarga - {0x11127, 0x1112b}, // Chakma Vowel Sign A ..Chakma Vowel Sign Uu - {0x1112d, 0x11134}, // Chakma Vowel Sign Ai ..Chakma Maayyaa - {0x11173, 0x11173}, // Mahajani Sign Nukta ..Mahajani Sign Nukta - {0x11180, 0x11181}, // Sharada Sign Candrabindu..Sharada Sign Anusvara - {0x111b6, 0x111be}, // Sharada Vowel Sign U ..Sharada Vowel Sign O - {0x111ca, 0x111cc}, // Sharada Sign Nukta ..Sharada Extra Sh||t Vowe - {0x1122f, 0x11231}, // Khojki Vowel Sign U ..Khojki Vowel Sign Ai - {0x11234, 0x11234}, // Khojki Sign Anusvara 
..Khojki Sign Anusvara - {0x11236, 0x11237}, // Khojki Sign Nukta ..Khojki Sign Shadda - {0x1123e, 0x1123e}, // (nil) .. - {0x112df, 0x112df}, // Khudawadi Sign Anusvara ..Khudawadi Sign Anusvara - {0x112e3, 0x112ea}, // Khudawadi Vowel Sign U ..Khudawadi Sign Virama - {0x11300, 0x11301}, // Grantha Sign Combining A..Grantha Sign Candrabindu - {0x1133c, 0x1133c}, // Grantha Sign Nukta ..Grantha Sign Nukta - {0x11340, 0x11340}, // Grantha Vowel Sign Ii ..Grantha Vowel Sign Ii - {0x11366, 0x1136c}, // Combining Grantha Digit ..Combining Grantha Digit - {0x11370, 0x11374}, // Combining Grantha Letter..Combining Grantha Letter - {0x11438, 0x1143f}, // (nil) .. - {0x11442, 0x11444}, // (nil) .. - {0x11446, 0x11446}, // (nil) .. - {0x114b3, 0x114b8}, // Tirhuta Vowel Sign U ..Tirhuta Vowel Sign Vocal - {0x114ba, 0x114ba}, // Tirhuta Vowel Sign Sh||t..Tirhuta Vowel Sign Sh||t - {0x114bf, 0x114c0}, // Tirhuta Sign Candrabindu..Tirhuta Sign Anusvara - {0x114c2, 0x114c3}, // Tirhuta Sign Virama ..Tirhuta Sign Nukta - {0x115b2, 0x115b5}, // Siddham Vowel Sign U ..Siddham Vowel Sign Vocal - {0x115bc, 0x115bd}, // Siddham Sign Candrabindu..Siddham Sign Anusvara - {0x115bf, 0x115c0}, // Siddham Sign Virama ..Siddham Sign Nukta - {0x115dc, 0x115dd}, // Siddham Vowel Sign Alter..Siddham Vowel Sign Alter - {0x11633, 0x1163a}, // Modi Vowel Sign U ..Modi Vowel Sign Ai - {0x1163d, 0x1163d}, // Modi Sign Anusvara ..Modi Sign Anusvara - {0x1163f, 0x11640}, // Modi Sign Virama ..Modi Sign Ardhacandra - {0x116ab, 0x116ab}, // Takri Sign Anusvara ..Takri Sign Anusvara - {0x116ad, 0x116ad}, // Takri Vowel Sign Aa ..Takri Vowel Sign Aa - {0x116b0, 0x116b5}, // Takri Vowel Sign U ..Takri Vowel Sign Au - {0x116b7, 0x116b7}, // Takri Sign Nukta ..Takri Sign Nukta - {0x1171d, 0x1171f}, // Ahom Consonant Sign Medi..Ahom Consonant Sign Medi - {0x11722, 0x11725}, // Ahom Vowel Sign I ..Ahom Vowel Sign Uu - {0x11727, 0x1172b}, // Ahom Vowel Sign Aw ..Ahom Sign Killer - {0x11c30, 0x11c36}, // (nil) 
.. - {0x11c38, 0x11c3d}, // (nil) .. - {0x11c3f, 0x11c3f}, // (nil) .. - {0x11c92, 0x11ca7}, // (nil) .. - {0x11caa, 0x11cb0}, // (nil) .. - {0x11cb2, 0x11cb3}, // (nil) .. - {0x11cb5, 0x11cb6}, // (nil) .. - {0x16af0, 0x16af4}, // Bassa Vah Combining High..Bassa Vah Combining High - {0x16b30, 0x16b36}, // Pahawh Hmong Mark Cim Tu..Pahawh Hmong Mark Cim Ta - {0x16f8f, 0x16f92}, // Miao Tone Right ..Miao Tone Below - {0x1bc9d, 0x1bc9e}, // Duployan Thick Letter Se..Duployan Double Mark - {0x1d167, 0x1d169}, // Musical Symbol Combining..Musical Symbol Combining - {0x1d17b, 0x1d182}, // Musical Symbol Combining..Musical Symbol Combining - {0x1d185, 0x1d18b}, // Musical Symbol Combining..Musical Symbol Combining - {0x1d1aa, 0x1d1ad}, // Musical Symbol Combining..Musical Symbol Combining - {0x1d242, 0x1d244}, // Combining Greek Musical ..Combining Greek Musical - {0x1da00, 0x1da36}, // Signwriting Head Rim ..Signwriting Air Sucking - {0x1da3b, 0x1da6c}, // Signwriting Mouth Closed..Signwriting Excitement - {0x1da75, 0x1da75}, // Signwriting Upper Body T..Signwriting Upper Body T - {0x1da84, 0x1da84}, // Signwriting Location Hea..Signwriting Location Hea - {0x1da9b, 0x1da9f}, // Signwriting Fill Modifie..Signwriting Fill Modifie - {0x1daa1, 0x1daaf}, // Signwriting Rotation Mod..Signwriting Rotation Mod - {0x1e000, 0x1e006}, // (nil) .. - {0x1e008, 0x1e018}, // (nil) .. - {0x1e01b, 0x1e021}, // (nil) .. - {0x1e023, 0x1e024}, // (nil) .. - {0x1e026, 0x1e02a}, // (nil) .. - {0x1e8d0, 0x1e8d6}, // Mende Kikakui Combining ..Mende Kikakui Combining - {0x1e944, 0x1e94a}, // (nil) .. 
- {0xe0100, 0xe01ef}, // Variation Select||-17 ..Variation Select||-256 - }; - - // https://github.com/jquast/wcwidth/blob/master/wcwidth/table_wide.py - // at commit 0d7de112202cc8b2ebe9232ff4a5c954f19d561a (2016-07-02): - // Original source: EastAsianWidth-9.0.0.txt - static struct interval WIDE_GLYPH[] = { - {0x1100, 0x115f}, // Hangul Choseong Kiyeok ..Hangul Choseong Filler - {0x231a, 0x231b}, // Watch ..Hourglass - {0x2329, 0x232a}, // Left-pointing Angle Brac..Right-pointing Angle Bra - {0x23e9, 0x23ec}, // Black Right-pointing Dou..Black Down-pointing Doub - {0x23f0, 0x23f0}, // Alarm Clock ..Alarm Clock - {0x23f3, 0x23f3}, // Hourglass With Flowing S..Hourglass With Flowing S - {0x25fd, 0x25fe}, // White Medium Small Squar..Black Medium Small Squar - {0x2614, 0x2615}, // Umbrella With Rain Drops..Hot Beverage - {0x2648, 0x2653}, // Aries ..Pisces - {0x267f, 0x267f}, // Wheelchair Symbol ..Wheelchair Symbol - {0x2693, 0x2693}, // Anch|| ..Anch|| - {0x26a1, 0x26a1}, // High Voltage Sign ..High Voltage Sign - {0x26aa, 0x26ab}, // Medium White Circle ..Medium Black Circle - {0x26bd, 0x26be}, // Soccer Ball ..Baseball - {0x26c4, 0x26c5}, // Snowman Without Snow ..Sun Behind Cloud - {0x26ce, 0x26ce}, // Ophiuchus ..Ophiuchus - {0x26d4, 0x26d4}, // No Entry ..No Entry - {0x26ea, 0x26ea}, // Church ..Church - {0x26f2, 0x26f3}, // Fountain ..Flag In Hole - {0x26f5, 0x26f5}, // Sailboat ..Sailboat - {0x26fa, 0x26fa}, // Tent ..Tent - {0x26fd, 0x26fd}, // Fuel Pump ..Fuel Pump - {0x2705, 0x2705}, // White Heavy Check Mark ..White Heavy Check Mark - {0x270a, 0x270b}, // Raised Fist ..Raised Hand - {0x2728, 0x2728}, // Sparkles ..Sparkles - {0x274c, 0x274c}, // Cross Mark ..Cross Mark - {0x274e, 0x274e}, // Negative Squared Cross M..Negative Squared Cross M - {0x2753, 0x2755}, // Black Question Mark ||na..White Exclamation Mark O - {0x2757, 0x2757}, // Heavy Exclamation Mark S..Heavy Exclamation Mark S - {0x2795, 0x2797}, // Heavy Plus Sign ..Heavy Division Sign - 
{0x27b0, 0x27b0}, // Curly Loop ..Curly Loop - {0x27bf, 0x27bf}, // Double Curly Loop ..Double Curly Loop - {0x2b1b, 0x2b1c}, // Black Large Square ..White Large Square - {0x2b50, 0x2b50}, // White Medium Star ..White Medium Star - {0x2b55, 0x2b55}, // Heavy Large Circle ..Heavy Large Circle - {0x2e80, 0x2e99}, // Cjk Radical Repeat ..Cjk Radical Rap - {0x2e9b, 0x2ef3}, // Cjk Radical Choke ..Cjk Radical C-simplified - {0x2f00, 0x2fd5}, // Kangxi Radical One ..Kangxi Radical Flute - {0x2ff0, 0x2ffb}, // Ideographic Description ..Ideographic Description - {0x3000, 0x303e}, // Ideographic Space ..Ideographic Variation In - {0x3041, 0x3096}, // Hiragana Letter Small A ..Hiragana Letter Small Ke - {0x3099, 0x30ff}, // Combining Katakana-hirag..Katakana Digraph Koto - {0x3105, 0x312d}, // Bopomofo Letter B ..Bopomofo Letter Ih - {0x3131, 0x318e}, // Hangul Letter Kiyeok ..Hangul Letter Araeae - {0x3190, 0x31ba}, // Ideographic Annotation L..Bopomofo Letter Zy - {0x31c0, 0x31e3}, // Cjk Stroke T ..Cjk Stroke Q - {0x31f0, 0x321e}, // Katakana Letter Small Ku..Parenthesized K||ean Cha - {0x3220, 0x3247}, // Parenthesized Ideograph ..Circled Ideograph Koto - {0x3250, 0x32fe}, // Partnership Sign ..Circled Katakana Wo - {0x3300, 0x4dbf}, // Square Apaato .. - {0x4e00, 0xa48c}, // Cjk Unified Ideograph-4e..Yi Syllable Yyr - {0xa490, 0xa4c6}, // Yi Radical Qot ..Yi Radical Ke - {0xa960, 0xa97c}, // Hangul Choseong Tikeut-m..Hangul Choseong Ssangyeo - {0xac00, 0xd7a3}, // Hangul Syllable Ga ..Hangul Syllable Hih - {0xf900, 0xfaff}, // Cjk Compatibility Ideogr.. 
- {0xfe10, 0xfe19}, // Presentation F||m F|| Ve..Presentation F||m F|| Ve - {0xfe30, 0xfe52}, // Presentation F||m F|| Ve..Small Full Stop - {0xfe54, 0xfe66}, // Small Semicolon ..Small Equals Sign - {0xfe68, 0xfe6b}, // Small Reverse Solidus ..Small Commercial At - {0xff01, 0xff60}, // Fullwidth Exclamation Ma..Fullwidth Right White Pa - {0xffe0, 0xffe6}, // Fullwidth Cent Sign ..Fullwidth Won Sign - {0x16fe0, 0x16fe0}, // (nil) .. - {0x17000, 0x187ec}, // (nil) .. - {0x18800, 0x18af2}, // (nil) .. - {0x1b000, 0x1b001}, // Katakana Letter Archaic ..Hiragana Letter Archaic - {0x1f004, 0x1f004}, // Mahjong Tile Red Dragon ..Mahjong Tile Red Dragon - {0x1f0cf, 0x1f0cf}, // Playing Card Black Joker..Playing Card Black Joker - {0x1f18e, 0x1f18e}, // Negative Squared Ab ..Negative Squared Ab - {0x1f191, 0x1f19a}, // Squared Cl ..Squared Vs - {0x1f200, 0x1f202}, // Square Hiragana Hoka ..Squared Katakana Sa - {0x1f210, 0x1f23b}, // Squared Cjk Unified Ideo.. - {0x1f240, 0x1f248}, // T||toise Shell Bracketed..T||toise Shell Bracketed - {0x1f250, 0x1f251}, // Circled Ideograph Advant..Circled Ideograph Accept - {0x1f300, 0x1f320}, // Cyclone ..Shooting Star - {0x1f32d, 0x1f335}, // Hot Dog ..Cactus - {0x1f337, 0x1f37c}, // Tulip ..Baby Bottle - {0x1f37e, 0x1f393}, // Bottle With Popping C||k..Graduation Cap - {0x1f3a0, 0x1f3ca}, // Carousel H||se ..Swimmer - {0x1f3cf, 0x1f3d3}, // Cricket Bat And Ball ..Table Tennis Paddle And - {0x1f3e0, 0x1f3f0}, // House Building ..European Castle - {0x1f3f4, 0x1f3f4}, // Waving Black Flag ..Waving Black Flag - {0x1f3f8, 0x1f43e}, // Badminton Racquet And Sh..Paw Prints - {0x1f440, 0x1f440}, // Eyes ..Eyes - {0x1f442, 0x1f4fc}, // Ear ..Videocassette - {0x1f4ff, 0x1f53d}, // Prayer Beads ..Down-pointing Small Red - {0x1f54b, 0x1f54e}, // Kaaba ..Men||ah With Nine Branch - {0x1f550, 0x1f567}, // Clock Face One Oclock ..Clock Face Twelve-thirty - {0x1f57a, 0x1f57a}, // (nil) .. 
- {0x1f595, 0x1f596}, // Reversed Hand With Middl..Raised Hand With Part Be - {0x1f5a4, 0x1f5a4}, // (nil) .. - {0x1f5fb, 0x1f64f}, // Mount Fuji ..Person With Folded Hands - {0x1f680, 0x1f6c5}, // Rocket ..Left Luggage - {0x1f6cc, 0x1f6cc}, // Sleeping Accommodation ..Sleeping Accommodation - {0x1f6d0, 0x1f6d2}, // Place Of W||ship .. - {0x1f6eb, 0x1f6ec}, // Airplane Departure ..Airplane Arriving - {0x1f6f4, 0x1f6f6}, // (nil) .. - {0x1f910, 0x1f91e}, // Zipper-mouth Face .. - {0x1f920, 0x1f927}, // (nil) .. - {0x1f930, 0x1f930}, // (nil) .. - {0x1f933, 0x1f93e}, // (nil) .. - {0x1f940, 0x1f94b}, // (nil) .. - {0x1f950, 0x1f95e}, // (nil) .. - {0x1f980, 0x1f991}, // Crab .. - {0x1f9c0, 0x1f9c0}, // Cheese Wedge ..Cheese Wedge - {0x20000, 0x2fffd}, // Cjk Unified Ideograph-20.. - {0x30000, 0x3fffd}, // (nil) .. -}; - - /* test for 8-bit control characters */ - if (ucs == 0) - return 0; - if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0)) - return -1; - - /* Fast return for latin. */ - if (ucs < 0x300) return 1; - - /* Binary search in table of non-spacing characters. */ - if (bisearch(ucs, combining, - sizeof(combining) / sizeof(struct interval) - 1)) - return 0; - - /* Other zero-width characters not in table above. 
*/ - if ((0x200B <= ucs && ucs <= 0x200F) || - ucs == 0x2028 || - ucs == 0x2029 || - (0x202A <= ucs && ucs <= 0x202E) || - (0x2060 <= ucs && ucs <= 0x2063)) - return 0; - - /* If we arrive here, ucs is not a combining or C0/C1 control character */ - if (bisearch(ucs, WIDE_GLYPH, - sizeof(WIDE_GLYPH) / sizeof(struct interval) - 1)) - return 2; - return 1; -} - - -int mk_wcswidth(const wchar_t *pwcs, size_t n) -{ - int w, width = 0; - - for (;*pwcs && n-- > 0; pwcs++) - if ((w = mk_wcwidth(*pwcs)) < 0) - return -1; - else - width += w; - - return width; -} diff -Nru link-grammar-5.3.16/link-grammar/wcwidth.h link-grammar-5.5.0/link-grammar/wcwidth.h --- link-grammar-5.3.16/link-grammar/wcwidth.h 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/wcwidth.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,8 +0,0 @@ -#ifndef _WCWIDTH_H -#define _WCWIDTH_H - -#include - -int mk_wcwidth(wchar_t); -int mk_wcswidth(const wchar_t *, size_t); -#endif /* _WCWIDTH_H */ diff -Nru link-grammar-5.3.16/link-grammar/wordgraph.c link-grammar-5.5.0/link-grammar/wordgraph.c --- link-grammar-5.3.16/link-grammar/wordgraph.c 2017-02-10 20:16:17.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/wordgraph.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1035 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2014 Amir Plivatsky */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifdef USE_WORDGRAPH_DISPLAY -#include -#include -#ifdef HAVE_FORK -#include /* fork() and execl() */ -#include /* waitpid() */ -#endif -#ifdef HAVE_PRCTL -#include /* prctl() */ -#endif -#include /* SIG* */ -#endif /* USE_WORDGRAPH_DISPLAY */ - -#include "error.h" -#include "externs.h" -#include "string-set.h" -#include "structures.h" -#include "print-util.h" -#include "wordgraph.h" - -#ifdef __APPLE__ -#define POPEN_DOT -#endif /* __APPLE__ */ - -/* === Gword utilities === */ -/* Many more Gword utilities, that are used only in particular files, - * are defined in these files statically. */ - -Gword *gword_new(Sentence sent, const char *s) -{ - Gword *gword = malloc(sizeof(*gword)); - - memset(gword, 0, sizeof(*gword)); - assert(NULL != gword, "Null-string subword"); - gword->subword = string_set_add(s, sent->string_set); - - if (NULL != sent->last_word) sent->last_word->chain_next = gword; - sent->last_word = gword; - gword->node_num = sent->gword_node_num++; - - gword->gword_set_head = (gword_set){0}; - gword->gword_set_head.o_gword = gword; - - return gword; -} - -/* FIXME: Remove it. */ -Gword *empty_word(void) -{ - /* - static Gword e = { - .subword = EMPTY_WORD_MARK, - .unsplit_word = &e, - .morpheme_type = MT_EMPTY, - .alternative_id = &e, - .status = WS_INDICT, - }; - */ - return NULL; -} - -static Gword **gwordlist_resize(Gword **arr, size_t len) -{ - arr = realloc(arr, (len+2) * sizeof(Gword *)); - arr[len+1] = NULL; - return arr; -} - -size_t gwordlist_len(const Gword **arr) -{ - size_t len = 0; - if (arr) - while (arr[len] != NULL) len++; - return len; -} - -void gwordlist_append(Gword ***arrp, Gword *p) -{ - size_t n = gwordlist_len((const Gword **)*arrp); - - *arrp = gwordlist_resize(*arrp, n); - (*arrp)[n] = p; -} - -/** - * Append a Gword list to a given Gword list (w/o duplicates). 
- */ -void gwordlist_append_list(const Gword ***to_word, const Gword **from_word) -{ - size_t to_word_arr_len = gwordlist_len(*to_word); - - for (const Gword **f = from_word; NULL != *f; f++) - { - size_t l; - - /* Note: Must use indexing because to_word may get realloc'ed. */ - for (l = 0; l < to_word_arr_len; l++) - if (*f == (*to_word)[l]) break; /* Filter duplicates. */ - - if (l == to_word_arr_len) - gwordlist_append((Gword ***)to_word, (Gword *)*f); - } -} - -#if 0 -/** - * Replace "count" words from the position "start" by word "wnew". - */ -static void wordlist_replace(Gword ***arrp, size_t start, size_t count, - const Gword *wnew) -{ - size_t n = gwordlist_len((const Gword **)(*arrp+start+count)); - - memmove(*arrp+start+1, *arrp+start+count, (n+1) * sizeof(Gword *)); - (*arrp)[start] = (Gword *)wnew; -} -#endif - -size_t wordgraph_pathpos_len(Wordgraph_pathpos *wp) -{ - size_t len = 0; - if (wp) - while (wp[len].word != NULL) len++; - return len; -} - -/** - * `len` is the new length, not counting the terminating null entry. - */ -/* FIXME (efficiency): Initially allocate more than 2 elements */ -Wordgraph_pathpos *wordgraph_pathpos_resize(Wordgraph_pathpos *wp, - size_t len) -{ - wp = realloc(wp, (len+1) * sizeof(*wp)); - wp[len].word = NULL; - return wp; -} - -/** - * Insert the gword into the path queue in reverse order of its hier_depth. - * - * The deepest wordgraph alternatives must be scanned first. - * Otherwise, this sentence causes a flattening mess: - * "T" this is a flattening test - * (The mess depends on both "T" and "T matching EMOTICON, and any - * 5 words after "T".) - * - * Parameters: - * same_word: mark that the same word is queued again. - * For validation code only (until the wordgraph version is mature): - * used: mark that the word has already been issued into the 2D-array. - * diff_alternative: validate we don't queue words from the same alternative. 
- */ -bool wordgraph_pathpos_add(Wordgraph_pathpos **wp, Gword *p, bool used, - bool same_word, bool diff_alternative) -{ - size_t n = wordgraph_pathpos_len(*wp); - Wordgraph_pathpos *wpt; - size_t insert_here = n; - - assert(NULL != p); - -#ifdef DEBUG - if (verbosity_level(+9)) print_hier_position(p); -#endif - - if (NULL != *wp) - { - for (wpt = *wp; NULL != wpt->word; wpt++) - { - if (p == wpt->word) - return false; /* already in the pathpos queue - nothing to do */ - - /* Insert in reverse order of hier_depth. */ - if ((n == insert_here) && (p->hier_depth >= wpt->word->hier_depth)) - insert_here = wpt - *wp; - - /* Validate that there are no words in the pathpos queue from the same - * alternative. This can be commented out when the wordgraph code is - * mature. FIXME */ - if (diff_alternative) - { - assert(same_word||wpt->same_word||!in_same_alternative(p,wpt->word), - "wordgraph_pathpos_add(): " - "Word%zu '%s' is from same alternative of word%zu '%s'", - p->node_num, p->subword, - wpt->word->node_num, wpt->word->subword); - } - } - } - - *wp = wordgraph_pathpos_resize(*wp, n+1); - - if (insert_here < n) - { - /* n+1 because n is the length of the array, not including the - * terminating null entry. We need to protect the terminating null. - */ - memmove(&(*wp)[insert_here+1], &(*wp)[insert_here], - (n+1 - insert_here) * sizeof (*wpt)); - } - - (*wp)[insert_here].word = p; - (*wp)[insert_here].same_word = same_word; - (*wp)[insert_here].used = used; - (*wp)[insert_here].next_ok = false; - - return true; -} - -#ifdef DEBUG -GNUC_UNUSED static const char *debug_show_subword(const Gword *w) -{ - return w->unsplit_word ? 
w->subword : "S"; -} - -GNUC_UNUSED void print_hier_position(const Gword *word) -{ - const Gword **p; - - err_msg(lg_Debug, "[Word %zu:%s hier_position(hier_depth=%zu): ", - word->node_num, word->subword, word->hier_depth); - assert(2*word->hier_depth==gwordlist_len(word->hier_position), "word '%s'", - word->subword); - - for (p = word->hier_position; NULL != *p; p += 2) - { - err_msg(lg_Debug, "(%zu:%s/%zu:%s)", - p[0]->node_num, debug_show_subword(p[0]), - p[1]->node_num, debug_show_subword(p[1])); - } - err_msg(lg_Debug, "]\n"); -} - -/* Debug printout of a wordgraph Gword list. */ -GNUC_UNUSED void gword_set_print(const gword_set *gs) -{ - printf("Gword list: "); - - if (NULL == gs) - { - printf("(null)\n"); - return; - } - - for (; NULL != gs; gs = gs->next) - { - printf("word %p '%s' unsplit '%s'%s", gs->o_gword, (gs->o_gword)->subword, - (gs->o_gword)->unsplit_word->subword, NULL==gs->next ? "" : ", "); - } - printf("\n"); - -} -#endif - -/** - * Given a word, find its alternative ID. - * An alternative is identified by a pointer to its first word, which is - * getting set at the time the alternative is created at - * issue_word_alternative(). (It could be any unique identifier - for coding - * convenience it is a pointer.) - * - * Return the alternative_id of this alternative. - */ -static Gword *find_alternative(Gword *word) -{ - assert(NULL != word, "find_alternative(NULL)"); - assert(NULL != word->alternative_id, "find_alternative(%s): NULL id", - word->subword); - -#if 0 - lgdebug(+0, "find_alternative(%s): '%s'\n", - word->subword, debug_show_subword(word->alternative_id)); -#endif - - return word->alternative_id; -} - -/** - * Generate an hierarchy-position vector for the given word. - * It consists of list of (unsplit_word, alternative_id) pairs, leading - * to the word, starting from a sentence word. It is NULL terminated. - * Original sentence words don't have any such pair. 
- */ -const Gword **wordgraph_hier_position(Gword *word) -{ - const Gword **hier_position; /* NULL terminated */ - size_t i = 0; - Gword *w; - bool is_leaf = true; /* the word is in the bottom of the hierarchy */ - - if (NULL != word->hier_position) return word->hier_position; /* from cache */ - - /* - * Compute the length of the hier_position vector. - */ - for (w = find_real_unsplit_word(word, true); NULL != w; w = w->unsplit_word) - i++; - if (0 == i) i = 1; /* Handle the dummy start/end words, just in case. */ - /* Original sentence words (i==1) have zero (i-1) elements. Each deeper - * unsplit word has an additional element. Each element takes 2 word pointers - * (first one the unsplit word, second one indicating the alternative in - * which it is found). The last +1 is for a terminating NULL. */ - word->hier_depth = i - 1; - i = (2 * word->hier_depth)+1; - hier_position = malloc(i * sizeof(*hier_position)); - - /* Stuff the hierarchical position in a reverse order. */ - hier_position[--i] = NULL; - w = word; - while (0 != i) - { - hier_position[--i] = find_alternative(w); - w = find_real_unsplit_word(w, is_leaf); - hier_position[--i] = w; - is_leaf = false; - } - - word->hier_position = hier_position; /* cache it */ - return hier_position; -} - -/** - * Find if 2 words are in the same alternative of their common ancestor - * unsplit_word. - * "Same alternative" means at the direct alternative or any level below it. - * A - * | - * +-B C D - * | - * +-E F - * | - * +-G H - * | - * +-I J - * J and E (but not J and B) are in the same alternative of their common - * ancestor unsplit_word A. - * J and G are not in the same alternative (common ancestor unsplit_word F). - * - * Return true if they are, false otherwise. 
- */ -bool in_same_alternative(Gword *w1, Gword *w2) -{ - const Gword **hp1 = wordgraph_hier_position(w1); - const Gword **hp2 = wordgraph_hier_position(w2); - size_t i; - -#if 0 /* DEBUG */ - print_hier_position(w1); print_hier_position(w2); -#endif - -#if 0 /* BUG */ - /* The following is wrong! Comparison to the hier_position of the - * termination word is actually needed when there are alternatives of - * different lengths at the end of a sentence. This check then prevents - * the generation of empty words on the shorter alternative. */ - if ((NULL == w1->next) || (NULL == w2->next)) return false;/* termination */ -#endif - - for (i = 0; (NULL != hp1[i]) && (NULL != hp2[i]); i++) - { - if (hp1[i] != hp2[i]) break; - } - - /* In the even positions we have an unsplit_word. - * In the odd positions we have an alternative_id. - * - * If we are here when i is even, it means the preceding alternative_id was - * the same in the two words - so they belong to the same alternative. If - * i is 0, it means these are sentence words, and sentence words are all in - * the same alternative (including the dummy termination word). - * If the hierarchy-position vectors are equal, i is also even, and words - * with equal hierarchy-position vectors are in the same alternative. - * - * If we are here when i is odd, it means the alternative_id at i is not - * the same in the given words, but their preceding unsplit_words are the - * same - so they clearly not in the same alternative. - */ - if (0 == i%2) return true; - - return false; -} - -/** - * Get the real unsplit word of the given word. - * While the Wordgraph is getting constructed, when a subword has itself as one - * of its own alternatives, it appears in the wordgraph only once, still - * pointing to its original unsplit_word. It appears once in order not to - * complicate the graph, and the unsplit_word is not changed in order not loss - * information (all of these are implementation decisions). 
However, for the - * hierarchy position of the word (when it is a word to be issued, i.e. a leaf - * node) the real unsplit word is needed, which is the word itself. It is fine - * since such a word cannot get split further. - */ -Gword *find_real_unsplit_word(Gword *word, bool is_leaf) -{ - /* For the terminating word, return something unique. */ - if (NULL == word->unsplit_word) - return word; - - if (is_leaf && (word->status & WS_UNSPLIT)) - return word; - - return word->unsplit_word; -} - -/* FIXME The following debug functions can be generated by a script running - * from a Makefile and taking the values from structures.h, instead of hard - * coding the strings as done here. */ - -/** - * Create a short form of flags summary for displaying in a word node. - */ -const char *gword_status(Sentence sent, const Gword *w) -{ - dyn_str *s = dyn_str_new(); - const char *r; - size_t len; - - if (w->status & WS_UNKNOWN) - dyn_strcat(s, "UNK|"); - if (w->status & WS_INDICT) - dyn_strcat(s, "IN|"); - if (w->status & WS_REGEX) - dyn_strcat(s, "RE|"); - if (w->status & WS_SPELL) - dyn_strcat(s, "SP|"); - if (w->status & WS_RUNON) - dyn_strcat(s, "RU|"); - if (w->status & WS_HASALT) - dyn_strcat(s, "HA|"); - if (w->status & WS_UNSPLIT) - dyn_strcat(s, "UNS|"); - if (w->status & WS_PL) - dyn_strcat(s, "PL|"); - - len = strlen(s->str); - if (len > 0) s->str[len-1] = '\0'; - r = string_set_add(s->str, sent->string_set); - dyn_str_delete(s); - return r; -} - -#if USE_WORDGRAPH_DISPLAY || defined(DEBUG) -GNUC_UNUSED const char *gword_morpheme(Sentence sent, const Gword *w) -{ - const char *mt; - char buff[64]; - - switch (w->morpheme_type) - { - case MT_INVALID: - mt = "MT_INVALID"; - break; - case MT_WORD: - mt = "MT_WORD"; - break; - case MT_FEATURE: - mt = "MT_FEATURE"; - break; - case MT_INFRASTRUCTURE: - mt = "MT_I-S"; - break; - case MT_WALL: - mt = "MT_WALL"; - break; - case MT_EMPTY: - mt = "MT_EMPTY"; - break; - case MT_UNKNOWN: - mt = "MT_UNKNOWN"; - break; - case 
MT_TEMPLATE: - mt = "MT_TEMPLATE"; - break; - case MT_ROOT: - mt = "MT_ROOT"; - break; - case MT_CONTR: - mt = "MT_CONTR"; - break; - case MT_PUNC: - mt = "MT_PUNC"; - break; - case MT_STEM: - mt = "MT_STEM"; - break; - case MT_PREFIX: - mt = "MT_PREFIX"; - break; - case MT_MIDDLE: - mt = "MT_MIDDLE"; - break; - case MT_SUFFIX: - mt = "MT_SUFFIX"; - break; - default: - /* No truncation is expected. */ - snprintf(buff, sizeof(buff), "MT_%d", w->morpheme_type); - mt = string_set_add(buff, sent->string_set); - } - - return mt; -} -#endif /* USE_WORDGRAPH_DISPLAY || defined(DEBUG) */ - -#if USE_WORDGRAPH_DISPLAY -/* === Wordgraph graphical representation === */ - -static void wordgraph_legend(String *wgd, unsigned int mode) -{ - size_t i; - static char const *wst[] = { - "RE", "Matched a regex", - "SP", "Result of spell guess", - "RU", "Separated run-on word", - "HA", "Has an alternative", - "UNS", "Also unsplit_word", - "IN", "In the dict file", - "FI", "First char is uppercase" - }; - - append_string(wgd, - "subgraph cluster_legend {\n" - "label=Legend;\n" - "%s" - "legend [label=\"subword\\n(status-flags)\\nmorpheme-type\"];\n" - "legend [xlabel=\"ordinal-number\\ndebug-label\"];\n" - "%s" - "legend_width [width=4.5 height=0 shape=none label=<\n" - "\n" - "\n", - (mode & WGR_SUB) ? "subgraph cluster_unsplit_word {\n" - "label=\"ordinal-number unsplit-word\";\n" : "", - (mode & WGR_SUB) ? "}\n" : "" - - ); - for (i = 0; i < sizeof(wst)/sizeof(wst[0]); i += 2) - { - append_string(wgd, - "\n", - wst[i], wst[i+1]); - } - - append_string(wgd, - "
status-flags
%s%s
>];" - "}\n" - "subgraph cluster_legend_top_space {\n" - "style=invis legend_dummy [style=invis height=0 shape=box]\n" - "};\n" - ); -} - -/** - * Graph node name: Add "Sentence:" for the main node; Convert SUBSCRIPT_MARK. - * Also escape " and \ with a \. - */ -static const char *wlabel(Sentence sent, const Gword *w) -{ - const char *s; - const char sentence_label[] = "Sentence:\\n"; - dyn_str *l = dyn_str_new(); - char c0[] = "\0\0"; - - assert((NULL != w) && (NULL != w->subword), "Word must exist"); - if ('\0' == *w->subword) - return string_set_add("(nothing)", sent->string_set); - - if (w == sent->wordgraph) dyn_strcat(l, sentence_label); - - for (s = w->subword; *s; s++) - { - switch (*s) - { - case SUBSCRIPT_MARK: - dyn_strcat(l, "."); - break; - case '\"': - dyn_strcat(l, "\\\""); - break; - case '\\': - dyn_strcat(l, "\\"); - break; - default: - *c0 = *s; - dyn_strcat(l, c0); - } - } - - s = string_set_add(l->str, sent->string_set); - dyn_str_delete(l); - return s; -} - -/** - * Generate the wordgraph in dot(1) format, for debug. - */ -static String *wordgraph2dot(Sentence sent, unsigned int mode, const char *modestr) -{ - const Gword *w; - Gword **wp; - String *wgd = string_new(); /* the wordgraph in dot representation */ - char nn[2*sizeof(char *) + 2 + 2 + 1]; /* \"%p\" node name: "0x..."+NUL*/ - - append_string(wgd, "# Mode: %s\n", modestr); - append_string(wgd, "digraph G {\nsize =\"30,20\";\nrankdir=LR;\n"); - if ((mode & (WGR_SUB)) && !(mode & WGR_COMPACT)) - append_string(wgd, "newrank=true;\n"); - if (mode & WGR_LEGEND) wordgraph_legend(wgd, mode); - append_string(wgd, "\"%p\" [shape=box,style=filled,color=\".7 .3 1.0\"];\n", - sent->wordgraph); - - for (w = sent->wordgraph; w; w = w->chain_next) - { - bool show_node; - - if (!(mode & WGR_UNSPLIT) && (MT_INFRASTRUCTURE != w->morpheme_type)) - { - Gword *wu; - - show_node = false; - /* In this mode nodes that are only unsplit_word are not shown. 
*/ - for (wu = sent->wordgraph; wu; wu = wu->chain_next) - { - if (NULL != wu->next) - { - for (wp = wu->next; *wp; wp++) - { - if (w == *wp) - { - show_node = true; - break; - } - } - } - } - - if (!show_node) continue; - } - - snprintf(nn, sizeof(nn), "\"%p\"", w); - - /* Subword node format: - * +------------------+ - * + + - * + w->subword + - * + (w->flags) + - * + w->morpheme_type + - * + + - * +------------------+ - * w->node_num } <- external node label - * w->label } - * - * The flags and morpheme type are printed symbolically. - * The node_num field is the ordinal number of word creation. - * The label shows the code positions that created the subword. - * The external node label may appear at other positions near the node. - * - * FIXME: Use HTML labels. - */ - - append_string(wgd, "%s [label=\"%s\\n(%s)\\n%s\"];\n", nn, - wlabel(sent, w), gword_status(sent, w), gword_morpheme(sent, w)); - - if (!(mode & WGR_DBGLABEL)) - { - append_string(wgd, "%s [xlabel=\"%zu", - nn, w->node_num); - } - else - { - append_string(wgd, "%s [xlabel=\"%zu\\n%s", - nn, w->node_num, w->label); - } - - /* For debugging this function: display also hex node names. 
*/ - if (mode & WGR_DOTDEBUG) - append_string(wgd, "\\n%p-%s", w, wlabel(sent, w)); - - append_string(wgd, "\"];\n"); - - if (NULL != w->next) - { - for (wp = w->next; *wp; wp++) - { - append_string(wgd, "%s->\"%p\" [label=next color=red];\n", - nn, *wp); - } - } - if (mode & WGR_PREV) - { - if (NULL != w->prev) - { - for (wp = w->prev; *wp; wp++) - { - append_string(wgd, "%s->\"%p\" [label=prev color=blue];\n", - nn, *wp); - } - } - } - if (mode & WGR_UNSPLIT) - { - if (!(mode & WGR_SUB) && (NULL != w->unsplit_word)) - { - append_string(wgd, "%s->\"%p\" [label=unsplit];\n", - nn, w->unsplit_word); - } - } - } - - if (mode & WGR_SUB) - { - const Gword *old_unsplit = NULL; - - for (w = sent->wordgraph; w; w = w->chain_next) - { - if (NULL != w->unsplit_word) - { - if (w->unsplit_word != old_unsplit) - { - if (NULL != old_unsplit) append_string(wgd, "}\n"); - append_string(wgd, "subgraph \"cluster-%p\" {", w->unsplit_word); - append_string(wgd, "label=\"%zu %s\"; \n", - w->unsplit_word->node_num, wlabel(sent, w->unsplit_word)); - - old_unsplit = w->unsplit_word; - } - snprintf(nn, sizeof(nn), "\"%p\"", w); - if (strstr(string_value(wgd), nn)) - append_string(wgd, "\"%p\"; ", w); - } - } - append_string(wgd, "}\n"); - } - else - { -#ifdef WGR_SHOW_TERMINATOR_AT_LHS /* not defined - not useful */ - const Gword *terminating_node = NULL; -#endif - - append_string(wgd, "{rank=same; "); - for (w = sent->wordgraph->chain_next; w; w = w->chain_next) - { - snprintf(nn, sizeof(nn), "\"%p\"", w); - if ((w->unsplit_word == sent->wordgraph) && - ((mode & WGR_UNSPLIT) || strstr(string_value(wgd), nn))) - { - append_string(wgd, "%s; ", nn); - } - -#ifdef WGR_SHOW_TERMINATOR_AT_LHS - if (NULL == w->next) terminating_node = w; -#endif - } - append_string(wgd, "}\n"); - -#ifdef WGR_SHOW_TERMINATOR_AT_LHS - if (terminating_node) - append_string(wgd, "{rank=sink; \"%p\"}\n", terminating_node); -#endif - } - - append_string(wgd, "\n}\n"); - - return wgd; -} - -#if defined(HAVE_FORK) && 
!defined(POPEN_DOT) -static pid_t pid; /* XXX not reentrant */ - -#ifndef HAVE_PRCTL -/** - * Cancel the wordgraph viewers, to be used if there is fork() but no prctl(). - */ -static void wordgraph_show_cancel(void) -{ - kill(pid, SIGTERM); -} -#endif /* HAVE_FORK */ -#endif /* HAVE_PRCTL */ - -#ifndef DOT_COMMNAD -#define DOT_COMMAND "dot" -#endif - -#ifndef DOT_DRIVER -#define DOT_DRIVER "-Txlib" -#endif - -/* In case files are used, their names are fixed. So more than one thread - * (or program) cannot use the word-graph display at the same time. This - * can be corrected, even though there is no much point to do that - * (displaying the word-graph is for debug). */ -#define DOT_FILENAME "lg-wg.vg" - -#define POPEN_DOT_CMD DOT_COMMAND" "DOT_DRIVER -#ifndef POPEN_DOT_CMD_NATIVE -# ifdef _WIN32 -# ifndef IMAGE_VIEWER -# define IMAGE_VIEWER "rundll32 PhotoViewer,ImageView_Fullscreen" -# endif -# define WGJPG "%TEMP%\\lg-wg.jpg" -# define POPEN_DOT_CMD_NATIVE \ - DOT_COMMAND" -Tjpg>"WGJPG"&"IMAGE_VIEWER" "WGJPG"&del "WGJPG -# elif __APPLE__ -# ifndef IMAGE_VIEWER -# define IMAGE_VIEWER "open -W" -# endif -# define WGJPG "$TMPDIR/lg-wg.jpg" -# define POPEN_DOT_CMD_NATIVE \ - DOT_COMMAND" -Tjpg>"WGJPG";"IMAGE_VIEWER" "WGJPG";rm "WGJPG -# else -# define POPEN_DOT_CMD_NATIVE POPEN_DOT_CMD -# endif -#endif - -#if !defined HAVE_FORK || defined POPEN_DOT -#ifdef _MSC_VER -#define popen _popen -#define pclose _pclose -#endif -/** - * popen a command with the given input. - * If the system doesn't have fork(), popen() is used to launch "dot". - * This is an inferior implementation than the one below that uses - * fork(), in which the window remains open and is updated automatically - * when new sentences are entered. With popen(), the program blocks at - * pclose() and the user needs to close the window after each sentence. 
- */ -static void x_popen(const char *cmd, const char *wgds) -{ - FILE *const cmdf = popen(cmd, "w"); - - if (NULL == cmdf) - { - prt_error("Error: popen of '%s' failed: %s\n", cmd, strerror(errno)); - } - else - { - if (fprintf(cmdf, "%s", wgds) == -1) - prt_error("Error: print to display command: %s\n", strerror(errno)); - if (pclose(cmdf) == -1) - prt_error("Error: pclose of display command: %s\n", strerror(errno)); - } -} -#else -static void x_forkexec(const char *const argv[], pid_t *pid) -{ - /* Fork/exec a graph viewer, and leave it in the background until we exit. - * On exit, send SIGHUP. If prctl() is not available and the program - * crashes, then it is left to the user to exit the viewer. */ - if (0 < *pid) - { - pid_t rpid = waitpid(*pid, NULL, WNOHANG); - - if (0 == rpid) return; /* viewer still active */ - if (-1 == rpid) - { - prt_error("Error: waitpid(%d): %s\n", *pid, strerror(errno)); - *pid = 0; - return; - } - } - - *pid = fork(); - switch (*pid) - { - case -1: - prt_error("Error: fork(): %s\n", strerror(errno)); - break; - case 0: -#ifdef HAVE_PRCTL - if (-1 == prctl(PR_SET_PDEATHSIG, SIGHUP)) - prt_error("Error: prctl: %s\n", strerror(errno)); -#endif - /* Not closing fd 0/1/2, to allow interaction with the program */ - execvp(argv[0], (char **)argv); - prt_error("Error: execlp of %s: %s\n", argv[0], strerror(errno)); - _exit(1); - default: -#ifndef HAVE_PRCTL - if (0 != atexit(wordgraph_show_cancel)) - prt_error("Warning: atexit(wordgraph_show_cancel) failed.\n"); -#endif - break; - } -} -#endif /* !defined HAVE_FORK || defined POPEN_DOT */ - -#ifdef _WIN32 -#define TMPDIR (getenv("TEMP") ? getenv("TEMP") : ".") -#else -#define TMPDIR (getenv("TMPDIR") ? 
getenv("TMPDIR") : "/tmp") -#endif - -#define concatfn(fn, fn1, fn2) \ - (fn=alloca(strlen(fn1)+strlen(fn2)+2),\ - strcpy(fn, fn1), strcat(fn, "/"), strcat(fn, fn2)) - -static void wordgraph_unlink_xtmpfile(void) -{ - char *fn; - - if (!test_enabled("gvfile")) - { - concatfn(fn, TMPDIR, DOT_FILENAME); - if (unlink(fn) == -1) - prt_error("Warning: Cannot unlink %s: %s\n", fn, strerror(errno)); - } -} - -/** - * Display the word-graph in the indicated mode. - * This is for debug. It is not reentrant due to the static pid and the - * possibly created fixed filenames. - * When Using X11, a "dot -Txlib" program is launched on the graph - * description file. The xlib driver refreshes the graph when the file is - * changed, displaying additional sentences in the same window. The viewer - * program exits on program end (see the comments in the code). When - * compiled with MSVC or MINGW, the system PhotoViewer is used by default, - * unless !wg=x is used (for using X11 when available). - * - * The "dot" and the "PhotoViewer" programs must be in the PATH. - * - * FIXME? "dot" may get a SEGV due to memory corruptions in it (a known - * problem - exists even in 2.38). This can be worked-around by trying it - * again until it succeeds (but the window size, if changed by the user, - * will not be preserved). - * - * modestr: a graph display mode as defined in wordgraph.h (default "ldu"). - */ -void wordgraph_show(Sentence sent, const char *modestr) -{ - String *wgd; - char *gvf_name = NULL; - bool generate_gvfile = test_enabled("gvfile"); /* keep it for debug */ - char *wgds; - bool gvfile = false; - unsigned int mode = 0; - const char *mp; - - /* No check is done for correct flags - at most "mode" will be nonsense. */ - for (mp = modestr; '\0' != *mp && ',' != *mp; mp++) mode |= 1<<(*mp-'a'); - /* test=wg: sets the mode to ":" (0x2000000) and thus no flags are set. 
*/ - if ((0 == mode) || (WGR_X11 == mode)) - mode |= WGR_LEGEND|WGR_DBGLABEL|WGR_UNSPLIT; - - wgd = wordgraph2dot(sent, mode, modestr); - wgds = string_copy(wgd); - string_delete(wgd); - -#if defined(HAVE_FORK) && !defined(POPEN_DOT) - gvfile = true; -#endif - - if (gvfile || generate_gvfile) - { - FILE *gvf; - bool gvf_error = false; - static bool wordgraph_unlink_xtmpfile_needed = true; - - concatfn(gvf_name, TMPDIR, DOT_FILENAME); - gvf = fopen(gvf_name, "w"); - if (NULL == gvf) - { - prt_error("Error: wordgraph_show: open %s failed: %s\n", - gvf_name, strerror(errno)); - } - else - { - if (fprintf(gvf, "%s", wgds) == -1) - { - gvf_error = true; - prt_error("Error: wordgraph_show: print to %s failed: %s\n", - gvf_name, strerror(errno)); - } - if (fclose(gvf) == EOF) - { - gvf_error = true; - prt_error("Error: wordgraph_show: close %s failed: %s\n", - gvf_name, strerror(errno)); - } - } - if (gvf_error && gvfile) /* we need it - cannot continue */ - { - free(wgds); - return; - } - - if (wordgraph_unlink_xtmpfile_needed) - { - /* The filename is fixed - removal needed only once. */ - wordgraph_unlink_xtmpfile_needed = false; - atexit(wordgraph_unlink_xtmpfile); - } - } - -#if _WIN32 -#define EXITKEY "ALT-F4" -#elif __APPLE__ -#define EXITKEY "⌘-Q" -#endif - -#ifdef EXITKEY - prt_error("Press "EXITKEY" in the graphical display window to continue\n"); -#endif - -#if !defined HAVE_FORK || defined POPEN_DOT - x_popen((mode & WGR_X11)? 
POPEN_DOT_CMD : POPEN_DOT_CMD_NATIVE, wgds); -#else - { - assert(NULL != gvf_name, "DOT filename not initialized (#define mess?)"); - const char *const args[] = { DOT_COMMAND, DOT_DRIVER, gvf_name, NULL }; - x_forkexec(args, &pid); - } -#endif - free(wgds); -} -#else -void wordgraph_show(Sentence sent, const char *modestr) -{ - prt_error("Error: Not configured with --enable-wordgraph-display\n"); -} -#endif /* USE_WORDGRAPH_DISPLAY */ diff -Nru link-grammar-5.3.16/link-grammar/wordgraph.h link-grammar-5.5.0/link-grammar/wordgraph.h --- link-grammar-5.3.16/link-grammar/wordgraph.h 2017-02-10 20:16:17.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/wordgraph.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,37 +0,0 @@ -#ifndef _WORDGRAPH_H -#define _WORDGRAPH_H - -#ifdef USE_WORDGRAPH_DISPLAY -/* Wordgraph display representation modes. */ -#define lo(l) (l-'a') -#define WGR_SUB (1< - -#include "dict-api.h" -#include "string-set.h" -#include "word-utils.h" - -/* ======================================================== */ -/* Exp utilities ... 
*/ - -void free_E_list(E_list *); -void free_Exp(Exp * e) -{ - if (e->type != CONNECTOR_type) { - free_E_list(e->u.l); - } - xfree((char *)e, sizeof(Exp)); -} - -void free_E_list(E_list * l) -{ - if (l == NULL) return; - free_E_list(l->next); - free_Exp(l->e); - xfree((char *)l, sizeof(E_list)); -} - -/* Returns the number of connectors in the expression e */ -int size_of_expression(Exp * e) -{ - int size; - E_list * l; - if (e->type == CONNECTOR_type) return 1; - size = 0; - for (l=e->u.l; l!=NULL; l=l->next) { - size += size_of_expression(l->e); - } - return size; -} - -/** - * Build a copy of the given expression (don't copy strings, of course) - */ -static E_list * copy_E_list(E_list * l); -Exp * copy_Exp(Exp * e) -{ - Exp * n; - if (e == NULL) return NULL; - n = (Exp *) xalloc(sizeof(Exp)); - *n = *e; - if (e->type != CONNECTOR_type) { - n->u.l = copy_E_list(e->u.l); - } - return n; -} - -static E_list * copy_E_list(E_list * l) -{ - E_list * nl; - if (l == NULL) return NULL; - nl = (E_list *) xalloc(sizeof(E_list)); - nl->next = copy_E_list(l->next); - nl->e = copy_Exp(l->e); - return nl; -} - -/** - * Compare two expressions, return 1 for equal, 0 for unequal - */ -static int exp_compare(Exp * e1, Exp * e2) -{ - E_list *el1, *el2; - - if ((e1 == NULL) && (e2 == NULL)) - return 1; /* they are equal */ - if ((e1 == NULL) || (e2 == NULL)) - return 0; /* they are not equal */ - if (e1->type != e2->type) - return 0; - if (fabs (e1->cost - e2->cost) > 0.001) - return 0; - if (e1->type == CONNECTOR_type) - { - if (e1->dir != e2->dir) - return 0; - /* printf("%s %s\n",e1->u.string,e2->u.string); */ - if (!string_set_cmp(e1->u.string, e2->u.string)) - return 0; - } - else - { - el1 = e1->u.l; - el2 = e2->u.l; - /* while at least 1 is non-null */ - for (;(el1!=NULL)||(el2!=NULL);) { - /*fail if 1 is null */ - if ((el1==NULL)||(el2==NULL)) - return 0; - /* fail if they are not compared */ - if (exp_compare(el1->e, el2->e) == 0) - return 0; - if (el1!=NULL) - el1 = 
el1->next; - if (el2!=NULL) - el2 = el2->next; - } - } - return 1; /* if never returned 0, return 1 */ -} - -/** - * Sub-expression matcher -- return 1 if sub is non-NULL and - * contained in super, 0 otherwise. - */ -static int exp_contains(Exp * super, Exp * sub) -{ - E_list * el; - -#if 0 /* DEBUG */ - printf("SUP: "); - if (super) print_expression(super); - printf("\n"); -#endif - - if (sub==NULL || super==NULL) - return 0; - if (exp_compare(sub,super)==1) - return 1; - if (super->type==CONNECTOR_type) - return 0; /* super is a leaf */ - - /* proceed through supers children and return 1 if sub - is contained in any of them */ - for(el = super->u.l; el!=NULL; el=el->next) { - if (exp_contains(el->e, sub)==1) - return 1; - } - return 0; -} - -/* ======================================================== */ -/* X_node utilities ... */ -/** - * frees the list of X_nodes pointed to by x, and all of the expressions - */ -void free_X_nodes(X_node * x) -{ - X_node * y; - for (; x!= NULL; x = y) { - y = x->next; - free_Exp(x->exp); - xfree((char *)x, sizeof(X_node)); - } -} - -/** - * Destructively catenates the two disjunct lists d1 followed by d2. - * Doesn't change the contents of the disjuncts. - * Traverses the first list, but not the second. - */ -X_node * catenate_X_nodes(X_node *d1, X_node *d2) -{ - X_node * dis = d1; - - if (d1 == NULL) return d2; - if (d2 == NULL) return d1; - while (dis->next != NULL) dis = dis->next; - dis->next = d2; - return d1; -} - -/* ======================================================== */ -/* Connector utilities ... 
*/ - -/** - * free_connectors() -- free the list of connectors pointed to by e - * (does not free any strings) - */ -void free_connectors(Connector *e) -{ - Connector * n; - for (; e != NULL; e = n) - { - n = e->next; - xfree((char *)e, sizeof(Connector)); - } -} - -Connector * connector_new(void) -{ - Connector *c = (Connector *) xalloc(sizeof(Connector)); - init_connector(c); - c->nearest_word = 0; - c->multi = false; - c->lc_start = 0; - c->uc_length = 0; - c->uc_start = 0; - c->next = NULL; - c->string = ""; - c->tableNext = NULL; - return c; -} - -/* ======================================================== */ -/* Connector-set utilities ... */ -/** - * This hash function only looks at the leading upper case letters of - * the string, and the direction, '+' or '-'. - */ -static unsigned int connector_set_hash(Connector_set *conset, const char * s, int d) -{ - unsigned int i; - if (islower((int)*s)) s++; /* skip head-dependent indicator */ - - /* djb2 hash */ - i = 5381; - i = ((i << 5) + i) + d; - while (isupper((int) *s)) /* connector tables cannot contain UTF8, yet */ - { - i = ((i << 5) + i) + *s; - s++; - } - return (i & (conset->table_size-1)); -} - -static void build_connector_set_from_expression(Connector_set * conset, Exp * e) -{ - E_list * l; - Connector * c; - unsigned int h; - if (e->type == CONNECTOR_type) - { - c = connector_new(); - c->string = e->u.string; - h = connector_set_hash(conset, c->string, e->dir); - c->next = conset->hash_table[h]; - conset->hash_table[h] = c; - } else { - for (l=e->u.l; l!=NULL; l=l->next) { - build_connector_set_from_expression(conset, l->e); - } - } -} - -Connector_set * connector_set_create(Exp *e) -{ - unsigned int i; - Connector_set *conset; - - conset = (Connector_set *) xalloc(sizeof(Connector_set)); - conset->table_size = next_power_of_two_up(size_of_expression(e)); - conset->hash_table = - (Connector **) xalloc(conset->table_size * sizeof(Connector *)); - for (i=0; itable_size; i++) conset->hash_table[i] = 
NULL; - build_connector_set_from_expression(conset, e); - return conset; -} - -void connector_set_delete(Connector_set * conset) -{ - unsigned int i; - if (conset == NULL) return; - for (i=0; itable_size; i++) free_connectors(conset->hash_table[i]); - xfree(conset->hash_table, conset->table_size * sizeof(Connector *)); - xfree(conset, sizeof(Connector_set)); -} - -/** - * Returns TRUE the given connector is in this conset. FALSE otherwise. - * d='+' means this connector is on the right side of the disjunct. - * d='-' means this connector is on the left side of the disjunct. - */ - -bool match_in_connector_set(Connector_set *conset, Connector * c) -{ - unsigned int h; - Connector * c1; - if (conset == NULL) return false; - h = connector_set_hash(conset, c->string, '+'); - for (c1 = conset->hash_table[h]; c1 != NULL; c1 = c1->next) - { - if (easy_match(c1->string, c->string)) return true; - } - return false; -} - -/* ======================================================== */ -/* More connector utilities ... */ - -/** - * word_has_connector() -- return TRUE if dictionary expression has connector - * This function takes a dict_node (corresponding to an entry in a - * given dictionary), a string (representing a connector), and a - * direction (+ = right-pointing, '-' = left-pointing); it returns true - * if the dictionary expression for the word includes the connector, - * false otherwise. This can be used to see if a word is in a certain - * category (checking for a category connector in a table), or to see - * if a word has a connector in a normal dictionary. The connector - * check uses a "smart-match", the same kind used by the parser. 
- */ -#if CRAZY_OBESE_CHECKING_AGLO -bool word_has_connector(Dict_node * dn, const char * cs, char direction) -{ - Connector * c2 = NULL; - Disjunct *d, *d0; - if (dn == NULL) return false; - d0 = d = build_disjuncts_for_dict_node(dn); - if (d == NULL) return false; - for (; d != NULL; d = d->next) { - if (direction == '+') c2 = d->right; - if (direction == '-') c2 = d->left; - for (; c2 != NULL; c2 = c2->next) { - if (easy_match(c2->string, cs)) { - free_disjuncts(d0); - return true; - } - } - } - free_disjuncts(d0); - return false; -} -#else /* CRAZY_OBESE_CHECKING_AGLO */ - -/** - * Return true if the given expression has the given connector. - * The connector cs argument must originally be in the dictionary string set. - */ -static bool exp_has_connector(const Exp * e, int depth, const char * cs, - char direction, bool smart_match) -{ - E_list * el; - if (e->type == CONNECTOR_type) - { - if (direction != e->dir) return false; - return smart_match ? easy_match(e->u.string, cs) - : string_set_cmp(e->u.string, cs); - } - - if (depth == 0) return false; - if (depth > 0) depth--; - - for (el = e->u.l; el != NULL; el = el->next) - { - if (exp_has_connector(el->e, depth, cs, direction, smart_match)) - return true; - } - return false; -} - -bool word_has_connector(Dict_node * dn, const char * cs, char direction) -{ - return exp_has_connector(dn->exp, -1, cs, direction, /*smart_match*/true); -} -#endif /* CRAZY_OBESE_CHECKING_AGLO */ - -/** - * Find if an expression has a connector ZZZ- (that an empty-word has). - * This is a costly way to find it. To reduce the overhead, the - * exp_has_connector() "depth" argument limits the expression depth check, - * supposing the ZZZ- connectors are not deep in the word expression. - * FIXME? A cheaper way is to have a dictionary entry which lists such - * words, or to mark such words at dictionary read time. 
- **/ -bool is_exp_like_empty_word(Dictionary dict, Exp *exp) -{ - const char *cs = string_set_lookup(EMPTY_CONNECTOR, dict->string_set); - if (NULL == cs) return false; - return exp_has_connector(exp, 2, cs, '-', /*smart_match*/false); -} - -/** - * If word has a connector, return it. - * If word has more than one connector, return NULL. - */ -const char * word_only_connector(Dict_node * dn) -{ - Exp * e = dn->exp; - if (CONNECTOR_type == e->type) - return e->u.string; - return NULL; -} - -/* ======================================================== */ -/* Dictionary utilities ... */ - -static bool dn_word_contains(Dictionary dict, - Dict_node * w_dn, const char * macro) -{ - Exp * m_exp; - Dict_node *m_dn; - - if (w_dn == NULL) return false; - - m_dn = dictionary_lookup_list(dict, macro); - if (m_dn == NULL) return false; - - m_exp = m_dn->exp; - -#if 0 /* DEBUG */ - printf("\nWORD: "); - print_expression(w_dn->exp); - printf("\nMACR: "); - print_expression(m_exp); - printf("\n"); -#endif - - for (;w_dn != NULL; w_dn = w_dn->right) - { - if (1 == exp_contains(w_dn->exp, m_exp)) - { - free_lookup_list(dict, m_dn); - return true; - } - } - free_lookup_list(dict, m_dn); - return false; -} - -/** - * word_contains: return true if the word may involve application of - * a rule. - * - * @return: true if word's expression contains macro's expression, - * false otherwise. - */ -bool word_contains(Dictionary dict, const char * word, const char * macro) -{ - Dict_node *w_dn = abridged_lookup_list(dict, word); - bool ret = dn_word_contains(dict, w_dn, macro); - free_lookup_list(dict, w_dn); - return ret; -} - -/** - * This hash function only looks at the leading upper case letters of - * the connector string, and the label fields. This ensures that if two - * strings match (formally), then they must hash to the same place. 
- */ -int calculate_connector_hash(Connector * c) -{ - const char *s; - unsigned int i; - - /* For most situations, all three hashes are very nearly equal; - * as to which is faster depends on the parsed text. - * For both English and Russian, there are about 100 pre-defined - * connectors, and another 2K-4K autogen'ed ones (the IDxxx idiom - * connectors, and the LLxxx suffix connectors for Russian). - * Turns out the cost of setting up the hash table dominates the - * cost of collisions. */ -#ifdef USE_DJB2 - /* djb2 hash */ - i = 5381; - s = c->string; - if (islower((int) *s)) s++; /* ignore head-dependent indicator */ - while (isupper((int) *s)) /* connector tables cannot contain UTF8, yet */ - { - i = ((i << 5) + i) + *s; - s++; - } - i += i>>14; -#endif /* USE_DJB2 */ - -#define USE_JENKINS -#ifdef USE_JENKINS - /* Jenkins one-at-a-time hash */ - i = 0; - s = c->string; - if (islower((int) *s)) s++; /* ignore head-dependent indicator */ - c->uc_start = s - c->string; - while (isupper((int) *s)) /* connector tables cannot contain UTF8, yet */ - { - i += *s; - i += (i<<10); - i ^= (i>>6); - s++; - } - i += (i << 3); - i ^= (i >> 11); - i += (i << 15); -#endif /* USE_JENKINS */ - -#ifdef USE_SDBM - /* sdbm hash */ - i = 0; - s = c->string; - if (islower((int) *s)) s++; /* ignore head-dependent indicator */ - c->uc_start = s - c->string; - while (isupper((int) *s)) - { - i = *s + (i << 6) + (i << 16) - i; - s++; - } -#endif /* USE_SDBM */ - - c->lc_start = ('\0' == *s) ? 
0 : s - c->string; - c->uc_length = s - c->string - c->uc_start; - c->hash = i; - return i; -} - -/* ========================= END OF FILE ============================== */ diff -Nru link-grammar-5.3.16/link-grammar/word-utils.h link-grammar-5.5.0/link-grammar/word-utils.h --- link-grammar-5.3.16/link-grammar/word-utils.h 2017-01-27 21:57:17.000000000 +0000 +++ link-grammar-5.5.0/link-grammar/word-utils.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,159 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2004 */ -/* Daniel Sleator, David Temperley, and John Lafferty */ -/* Copyright (c) 2009, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the link grammar parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _LINK_GRAMMAR_WORD_UTILS_H_ -#define _LINK_GRAMMAR_WORD_UTILS_H_ - -#include "structures.h" - -/* Exp utilities ... */ -void free_Exp(Exp *); -void free_E_list(E_list *); -int size_of_expression(Exp *); -Exp * copy_Exp(Exp *); -bool is_exp_like_empty_word(Dictionary dict, Exp *); -/* int exp_compare(Exp * e1, Exp * e2); */ -/* int exp_contains(Exp * super, Exp * sub); */ - - -/* X_node utilities ... */ -X_node * catenate_X_nodes(X_node *, X_node *); -void free_X_nodes(X_node *); - - -/* Connector utilities ... */ -Connector * connector_new(void); -void free_connectors(Connector *); - -static inline Connector * init_connector(Connector *c) -{ - c->hash = -1; - c->length_limit = UNLIMITED_LEN; - return c; -} - -/* Connector-set utilities ... 
*/ -Connector_set * connector_set_create(Exp *e); -void connector_set_delete(Connector_set * conset); -bool word_has_connector(Dict_node *, const char *, char); -const char * word_only_connector(Dict_node *); -bool match_in_connector_set(Connector_set*, Connector*); - - -/** - * Returns TRUE if s and t match according to the connector matching - * rules. The connector strings must be properly formed, starting with - * zero or one lower case letters, followed by one or more upper case - * letters, followed by some other letters. - * - * The algorithm is symmetric with respect to a and b. - * - * Connectors starting with lower-case letters match ONLY if the initial - * letters are DIFFERENT. Otherwise, connectors only match if the - * upper-case letters are the same, and the trailing lower case letters - * are the same (or have wildcards). - * - * The initial lower-case letters allow an initial 'h' (denoting 'head - * word') to match an initial 'd' (denoting 'dependent word'), while - * rejecting a match 'h' to 'h' or 'd' to 'd'. This allows the parser - * to work with catena, instead of just links. - */ -static inline bool easy_match(const char * s, const char * t) -{ - char is = 0, it = 0; - if (islower((int) *s)) { is = *s; s++; } - if (islower((int) *t)) { it = *t; t++; } - - if (is != 0 && it != 0 && is == it) return false; - - while (isupper((int)*s) || isupper((int)*t)) - { - if (*s != *t) return false; - s++; - t++; - } - - while ((*s!='\0') && (*t!='\0')) - { - if ((*s == '*') || (*t == '*') || (*s == *t)) - { - s++; - t++; - } - else - return false; - } - return true; -} - - -/* Dictionary utilities ... 
*/ -bool word_contains(Dictionary dict, const char * word, const char * macro); - -static inline int string_hash(const char *s) -{ - unsigned int i; - - /* djb2 hash */ - i = 5381; - while (*s) - { - i = ((i << 5) + i) + *s; - s++; - } - return i; -} - -int calculate_connector_hash(Connector *); - -static inline int connector_hash(Connector * c) -{ - if (-1 != c->hash) return c->hash; - return calculate_connector_hash(c); -} - -/** - * hash function. Based on some tests, this seems to be an almost - * "perfect" hash, in that almost all hash buckets have the same size! - */ -static inline unsigned int pair_hash(unsigned int table_size, - int lw, int rw, - const Connector *le, const Connector *re, - unsigned int cost) -{ - unsigned int i; - -#if 0 - /* hash function. Based on some tests, this seems to be - * an almost "perfect" hash, in that almost all hash buckets - * have the same size! */ - i = 1 << cost; - i += 1 << (lw % (log2_table_size-1)); - i += 1 << (rw % (log2_table_size-1)); - i += ((unsigned int) le) >> 2; - i += ((unsigned int) le) >> log2_table_size; - i += ((unsigned int) re) >> 2; - i += ((unsigned int) re) >> log2_table_size; - i += i >> log2_table_size; -#else - /* sdbm-based hash */ - i = cost; - i = lw + (i << 6) + (i << 16) - i; - i = rw + (i << 6) + (i << 16) - i; - i = ((int)(intptr_t)le) + (i << 6) + (i << 16) - i; - i = ((int)(intptr_t)re) + (i << 6) + (i << 16) - i; -#endif - - return i & (table_size-1); -} -#endif /* _LINK_GRAMMAR_WORD_UTILS_H_ */ diff -Nru link-grammar-5.3.16/link-grammar.spec link-grammar-5.5.0/link-grammar.spec --- link-grammar-5.3.16/link-grammar.spec 2017-04-15 21:51:08.000000000 +0000 +++ link-grammar-5.5.0/link-grammar.spec 2018-04-29 21:40:30.000000000 +0000 @@ -1,5 +1,5 @@ %define name link-grammar -%define version 5.3.16 +%define version 5.5.0 %define release 1 Summary: A Natural Language Parser based on Link Grammar Theory diff -Nru link-grammar-5.3.16/link-parser/command-line.c 
link-grammar-5.5.0/link-parser/command-line.c --- link-grammar-5.3.16/link-parser/command-line.c 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-parser/command-line.c 2018-04-26 02:38:14.000000000 +0000 @@ -11,14 +11,14 @@ /* */ /*************************************************************************/ +#include #include #include #include #include -#include "parser-utilities.h" #include "command-line.h" -#include +#include "parser-utilities.h" static struct { @@ -52,31 +52,26 @@ int display_disjuncts; int display_senses; int display_morphology; -} local; + int display_wordgraph; +} local, local_saved; -typedef enum +static const char *value_type[] = { - Int, - Bool, - Float, - String, -} ParamType; + "(integer) ", "(Boolean) ", "(float) ", "(string) ", "(command) ", "" +}; -typedef struct -{ - const char *string; - ParamType param_type; - const char *description; - void *ptr; -} Switch; +static int variables_cmd(const Switch*, int); +static int file_cmd(const Switch*, int); +static int help_cmd(const Switch*, int); +static int exit_cmd(const Switch*, int); -static Switch default_switches[] = +Switch default_switches[] = { {"bad", Bool, "Display of bad linkages", &local.display_bad}, {"batch", Bool, "Batch mode", &local.batch_mode}, - {"cluster", Bool, "Use clusters to loosen parsing", &local.use_cluster_disjuncts}, + {"cluster", Bool, UNDOC "Use clusters to loosen parsing", &local.use_cluster_disjuncts}, {"constituents", Int, "Generate constituent output", &local.display_constituents}, - {"cost-model", Int, "Cost model used for ranking", &local.cost_model}, + {"cost-model", Int, UNDOC "Cost model used for ranking", &local.cost_model}, {"cost-max", Float, "Largest cost to be considered", &local.max_cost}, {"disjuncts", Bool, "Display of disjuncts used", &local.display_disjuncts}, {"echo", Bool, "Echoing of input sentence", &local.echo_on}, @@ -84,14 +79,14 @@ {"islands-ok", Bool, "Use of null-linked islands", &local.islands_ok}, {"limit", Int, "The 
maximum linkages processed", &local.linkage_limit}, {"links", Bool, "Display of complete link data", &local.display_links}, - {"memory", Int, "Max memory allowed", &local.memory}, + {"memory", Int, UNDOC "Max memory allowed", &local.memory}, {"morphology", Bool, "Display word morphology", &local.display_morphology}, {"null", Bool, "Allow null links", &local.allow_null}, {"panic", Bool, "Use of \"panic mode\"", &local.panic_mode}, {"postscript", Bool, "Generate postscript output", &local.display_postscript}, {"ps-header", Bool, "Generate postscript header", &local.display_ps_header}, {"rand", Bool, "Use repeatable random numbers", &local.repeatable_rand}, - {"senses", Bool, "Display of word senses", &local.display_senses}, + {"senses", Bool, UNDOC "Display of word senses", &local.display_senses}, {"short", Int, "Max length of short links", &local.short_length}, #if defined HAVE_HUNSPELL || defined HAVE_ASPELL {"spell", Int, "Up to this many spell-guesses per unknown word", &local.spell_guess}, @@ -101,26 +96,40 @@ {"use-sat", Bool, "Use Boolean SAT-based parser", &local.use_sat_solver}, #endif /* USE_SAT_SOLVER */ {"verbosity", Int, "Level of detail in output", &local.verbosity}, - {"debug", String, "comma-separated function list to debug", &local.debug}, - {"test", String, "comma-separated features to test", &local.test}, + {"debug", String, "Comma-separated function names to debug", &local.debug}, + {"test", String, "Comma-separated test features", &local.test}, #ifdef USE_VITERBI {"viterbi", Bool, "Use Viterbi-based parser", &local.use_viterbi}, #endif {"walls", Bool, "Display wall words", &local.display_walls}, {"width", Int, "The width of the display", &local.screen_width}, - {NULL, Bool, NULL, NULL} + {"wordgraph", Int, "Display sentence word-graph", &local.display_wordgraph}, + {"help", Cmd, "List the commands and what they do", help_cmd}, + {"variables", Cmd, "List user-settable variables and their functions", variables_cmd}, + {"file", Cmd, "Read input from 
the specified filename", file_cmd}, + {"exit", Cmd, "Exit the program", exit_cmd}, + {"quit", Cmd, UNDOC "Exit the program", exit_cmd}, + {NULL, Cmd, NULL, NULL} }; -struct {const char * s; const char * str;} user_command[] = +static void put_opts_in_local_vars(Command_Options *); + +/* + * A way to record the options default values. + */ +void save_default_opts(Command_Options *copts) { - {"variables", "List user-settable variables and their functions"}, - {"help", "List the commands and what they do"}, - {"file", "Read input from the specified filename"}, - {NULL, NULL} -}; + put_opts_in_local_vars(copts); + local_saved = local; +} + +static void restore_default_local_vars(void) +{ + local = local_saved; +} /** - * Gets rid of all the white space in the string s. Changes s + * Gets rid of all the white space in the string s. */ static void clean_up_string(char * s) { @@ -135,7 +144,12 @@ while(*x != '\0') { w = mbrtowc(&p, x, len, &state); - if ((0 == w) || ((size_t)-1 == w)) break; + if (0 == w) break; + if (0 > (ssize_t)w) + { + prt_error("Unable to process UTF8 command input string.\n"); + break; + } len -= w; if (!iswspace(p)) { @@ -168,7 +182,11 @@ { w = mbrtowc(&p, s, len, &state); if (0 == w) break; - if ((size_t)-1 == w) return false; + if (0 > (ssize_t)w) + { + prt_error("Unable to process UTF8 command input string.\n"); + break; + } len -= w; if (!iswdigit(p)) return false; } @@ -185,22 +203,461 @@ *((int *) s.ptr) = val; } +/** + * Return the value description for the given switch. + */ +static const char *switch_value_description(const Switch *as) +{ + if (Bool == as->param_type) + return ival(*as) ? " (On)" : " (Off)"; + if (Int == as->param_type) + return (-1 == ival(*as)) ? " (Unlimited)" : ""; + + return ""; +} + +/** + * Return a static buffer with a string value of the given switch. + * + * Since the static buffer is overwritten on each call, this function + * should not use more than once as an argument of the same function. 
+ */ +static const char *switch_value_string(const Switch *as) +{ + static char buf[128]; /* Size of buf is much more than we need */ + + switch (as->param_type) + { + case Float: /* Float point print! */ + snprintf(buf, sizeof(buf), "%.2f", *((double *)as->ptr)); + break; + case Bool: + /* FALLTHRU */ + case Int: + /* FALLTHRU (why another one is needed?) */ + snprintf(buf, sizeof(buf), "%d", ival(*as)); + break; + case String: + if ((NULL == *(char **)as->ptr) || ('\0' == **(char **)as->ptr)) + strcpy(buf, " (not set)"); + else + snprintf(buf, sizeof(buf), "%s", *(char **)as->ptr); + break; + case Cmd: + buf[0] = '\0'; /* No value to print. */ + break; + default: + /* Internal error. */ + snprintf(buf, sizeof(buf), "Unknown type %d\n", as->param_type); + } + + return buf; +} + +#define HELPFILE_BASE "command-help-" +#define HELPFILE_EXT ".txt" +#define HELPFILE_LANG_TEMPLATE "LL" /* we use only the 2-letter language code */ +#define HELPFILE_LANG_TEMPLATE_SIZE (sizeof(HELPFILE_LANG_TEMPLATE)-1) +#define HELPFILE_TEMPLATE_SIZE \ + (sizeof(HELPFILE_BASE HELPFILE_EXT)+HELPFILE_LANG_TEMPLATE_SIZE) +#define D_USER_FILES 3 /* Debug level for files */ +#define DEFAULT_HELP_LANG "en" + +static FILE *help_file; +/* Used in atexit() below. */ +static void close_help_file(void) +{ + fclose(help_file); +} + +/** + * On each call, return the next locale to try for constructing the help + * file name (NULL if no more). + * + * @param nextlang true on calls to get the next language. + * @return Locale to try, NULL if no more. + * + * This function should be called until it returns NULL (it order to + * free memory that it may allocate). + * After it returns NULL, the code that uses it below doesn't call it again. 
+ * + * See "Specifying a Priority List of Languages": + * www.gnu.org/software/gettext/manual/html_node/The-LANGUAGE-variable.html + */ +static const char *get_next_locale(void) +{ + enum state + {Initial_s, Language_s, Next_language_s, Default_language_s, Final_s}; + static int state = Initial_s; + static char *language; + char *lc_all; + const char *lang = NULL; + + while (1) + { + switch (state) + { + case Initial_s: + lc_all = getenv("LC_ALL"); + if ((NULL != lc_all) && (0 == strcmp(lc_all, "C"))) + { + /* LC_ALL=C */ + state = Default_language_s; + continue; + } + + if ((NULL == lc_all) || ('\0' == *lc_all)) + { + /* LC_ALL= */ + lang = getenv("LANG"); + if ((NULL != lang) && (0 == strcmp(lang, "C"))) + { + /* LANG=C */ + state = Default_language_s; + continue; + } + } + /* LC_ALL=x */ + state = Language_s; + continue; + + case Language_s: + language = getenv("LANGUAGE"); + if ((language == NULL) || ('\0' == *language)) + { + /* LANGUAGE= */ + language = NULL; /* so it doesn't get freed at Finals_s */ + state = Default_language_s; + if ((NULL != lang) && ('\0' != *lang)) + { + /* LANG=x */ + return lang; + } + /* LANG= */ + continue; + } + + /* LANGUAGE=x:y:z */ + state = Next_language_s; + language = strdup(language); /* strdup() for strtok() */ + return strtok(language, ":"); + break; + + case Next_language_s: + { + char *next_language = strtok(NULL, ":"); + if (NULL == next_language) + { + /* LANGUAGE tokens exhausted */ + state = Default_language_s; + continue; + } + /* Unchanged state. */ + return next_language; + } + break; + + case Default_language_s: + state = Final_s; + return DEFAULT_HELP_LANG; + + case Final_s: + free(language); + state = Initial_s; + break; + /* NULL is returned below. 
*/ + } + break; + } + + return NULL; +} + +static FILE *open_help_file(int verbosity) +{ + char *help_filename; + + if (NULL != help_file) + { + rewind(help_file); + return help_file; + } + atexit(close_help_file); + + /* Construct the help filename from its template. */ + help_filename = malloc(HELPFILE_TEMPLATE_SIZE); + strcpy(help_filename, HELPFILE_BASE); + char *ll_pos = &help_filename[strlen(help_filename)]; + strcpy(ll_pos, HELPFILE_LANG_TEMPLATE HELPFILE_EXT); + + const char *ll; + while ((ll = get_next_locale())) + { + if (NULL != help_file) + continue; /* until get_next_locale() returns NULL */ + strncpy(ll_pos, ll, HELPFILE_LANG_TEMPLATE_SIZE); + help_file = linkgrammar_open_data_file(help_filename); + } + + if ((NULL == help_file) && (verbosity > D_USER_FILES)) + { + prt_error("Error: Cannot open help file '%s': %s\n", + help_filename, strerror(errno)); + } + + free(help_filename); + return help_file; +} + +/** + * Print basic info: name, description, current value and type. + * Iff is_completion is true, display also the variable value, and use + * fixed fields for the value related info and the description. + * This is intended for use from the command completion code. + * + * The display format is ([] denotes optional - if is_completion is true): + * varname[=varvalue] (vartype) - description + */ +void display_1line_help(const Switch *sp, bool is_completion) +{ + int undoc = !!(UNDOC[0] == sp->description[0]); + int vtw = 0; /* value_type field width */ + int vnw = 0; /* varname field width */ + bool display_eq = is_completion && (Cmd != sp->param_type); + const char *value = ""; + + if (is_completion) + { + vtw = 10; + vnw = 18; + if (Cmd != sp->param_type) + value = switch_value_string(sp); + } + + int n; /* actual varname and optional varvalue print length */ + printf("%s%s%s%n", sp->string, display_eq ? 
"=" : " ", value, &n); + if (is_completion) printf("%*s", MAX(0, vnw - n), ""); + printf("%*s- %s\n", vtw, value_type[sp->param_type], sp->description+undoc); +} + +static void display_help(const Switch *sp, Command_Options *copts) +{ + char line[MAX_INPUT]; /* Maximum number of character in a help file line */ + + display_1line_help(sp, /*is_completion*/false); + if (Cmd != sp->param_type) + { + printf("Current value: %s\n",switch_value_string(sp)); + restore_default_local_vars(); + printf("Default value: %s\n", switch_value_string(sp)); + put_opts_in_local_vars(copts); + } + + FILE *hf = open_help_file(local.verbosity); + if (NULL == hf) + { + prt_error("Warning: Help file not found\n"); + return; + } + + bool help_found = false; + while (!help_found && (NULL != fgets(line, sizeof(line), hf))) + { + if ('[' != line[0]) continue; + { +#define CMDTAG_SEP ", \t]" + /* Allow for several names to map to the same help text, + * in a hope that the same help text can be used for the + * language bindings too (which have different option names + * from historical reasons). + * Note: We suppose the lines are not longer than MAX_INPUT. + * Longer lines may render the help text incorrectly. + * FIXME: Add command reference notation in the help text if + * needed. + * Help file format: + * % comment + * [cmd1 cmd2 ...] + * text ... + * [nextcmd ...] + * text ... 
+ */ + char *t = strtok(line+1, CMDTAG_SEP); + if (NULL == t) continue; + do { + if (0 == strcasecmp(t, sp->string)) + { + help_found = true; + break; + } + t = strtok(NULL, CMDTAG_SEP); + } while (NULL != t); + } + } + + if (ferror(hf)) + { + prt_error("Error: Reading help file: %s\n", strerror(errno)); + return; + } + + if (feof(hf)) + { + if (local.verbosity >= D_USER_FILES) + prt_error("Error: Cannot find command \"%s\" in help file\n", + sp->string); + } + else + { + help_found = false; + bool issue_blank_line = false; + while (NULL != fgets(line, sizeof(line), hf)) + { + const size_t len = strlen(line); + if ((MAX_INPUT == len-1) && '\n' != line[MAX_INPUT-2]) + { + prt_error("Warning: Help-file text line too long at offset %ld\n", + ftell(hf)); + } + if (COMMENT_CHAR == line[0]) continue; + if ('[' == line[0]) break; + + /* Suppress the ending blank lines of the help text. */ + if (strspn(line, WHITESPACE) == len) /* we encountered a blank line */ + { + issue_blank_line = true; + continue; + } + + if (!help_found) + { + printf("\n"); /* issue a blank line separator after basic info */ + help_found = true; + } + + /* We have a line to print. Print a blank line before it if needed. */ + if (issue_blank_line) + { + issue_blank_line = false; + printf("\n"); + } + printf("%s", line); + } + if (!help_found) /* the command tag has no help text */ + prt_error("Info: No help text found for command \"%s\"\n", sp->string); + } +} + +static int help_cmd(const Switch *uc, int n) +{ + printf("Special commands always begin with \"!\". Command and variable names\n"); + printf("can be abbreviated. Here is a list of the commands:\n\n"); + + printf(" !help command Show a detailed help for the given command\n"); + for (int i = 0; uc[i].string != NULL; i++) + { + if (Cmd != uc[i].param_type) continue; + if (UNDOC[0] == uc[i].description[0]) continue; + printf(" !%-14s ", uc[i].string); + printf("%s\n", uc[i].description); + } + + printf("\n"); + printf(" !! 
Print all the dictionary words that match .\n"); + printf(" A wildcard * may be used to find multiple matches.\n"); + printf("\n"); + printf(" ! Toggle the specified Boolean variable.\n"); + printf(" != Assign that value to that variable.\n"); + + return 'c'; +} + +static int variables_cmd(const Switch *uc, int n) +{ + printf(" Variable Controls Value\n"); + printf(" -------- -------- -----\n"); + for (int i = 0; uc[i].string != NULL; i++) + { + if (Cmd == uc[i].param_type) continue; + if (UNDOC[0] == uc[i].description[0]) continue; + printf(" %-13s", uc[i].string); + printf("%-*s", FIELD_WIDTH(uc[i].description, 46), uc[i].description); + printf("%5s", switch_value_string(&uc[i])); + printf("%s\n", switch_value_description(&uc[i])); + } + + printf("\n"); + printf("Toggle a Boolean variable as in \"!batch\"; "); + printf("Set a variable as in \"!width=100\".\n"); + printf("Get a more detailed help on a variable as in \"!help var\".\n"); + return 'c'; +} + +static int exit_cmd(const Switch *uc, int n) +{ + return 'e'; +} + +static int file_cmd(const Switch *uc, int n) +{ + return 'f'; +} + static int x_issue_special_command(char * line, Command_Options *copts, Dictionary dict) { char *s, *x, *y; - int i, count, j, k; - Switch * as = default_switches; - Parse_Options opts = copts->popts; + int count, j; + const Switch *as = default_switches; + + /* Handle a request for a particular command help. */ + if (NULL != dict) + { + /* If we are here, it is not a command-line parameter. */ + s = strtok(line, WHITESPACE); + if ((s != NULL) && strncasecmp(s, "help", strlen(s)) == 0) + { + s = strtok(NULL, WHITESPACE); + if (s != NULL) + { + /* This is a help request for the command name at s. */ + j = -1; /* command index */ + count = 0; /* number of matching commands */ + + /* Is it a unique abbreviation? 
*/ + for (int i = 0; as[i].string != NULL; i++) + { + if (strncasecmp(s, as[i].string, strlen(s)) == 0) + { + count++; + j = i; + } + } + + if (count == 1) + { + display_help(&as[j], copts); + return 'c'; + } + + if (count > 1) + prt_error("Ambiguous command: \"%s\". ", s); + else + prt_error("Undefined command: \"%s\". ", s); + + prt_error("Type \"!help\" or \"!variables\"\n"); + return -1; + } + } + } clean_up_string(line); s = line; - j = k = -1; + j = -1; count = 0; - /* Look for boolean flippers */ - for (i=0; as[i].string != NULL; i++) + /* Look for Boolean flippers or command abbreviations. */ + for (int i = 0; as[i].string != NULL; i++) { - if ((Bool == as[i].param_type) && + if (((Bool == as[i].param_type) || (Cmd == as[i].param_type)) && strncasecmp(s, as[i].string, strlen(s)) == 0) { count++; @@ -208,16 +665,6 @@ } } - /* Look for abbreviations */ - for (i=0; user_command[i].s != NULL; i++) - { - if (strncasecmp(s, user_command[i].s, strlen(s)) == 0) - { - count++; - k = i; - } - } - if (count > 1) { printf("Ambiguous command. Type \"!help\" or \"!variables\"\n"); @@ -225,84 +672,57 @@ } else if (count == 1) { - /* flip boolean value */ - if (j >= 0) + /* Flip Boolean value. */ + if (Bool == as[j].param_type) { setival(as[j], (0 == ival(as[j]))); - printf("%s turned %s.\n", as[j].description, (ival(as[j]))? "on" : "off"); - return 0; + int undoc = !!(UNDOC[0] == as[j].description[0]); + printf("%s turned %s.\n", + as[j].description+undoc, (ival(as[j]))? "on" : "off"); + return 'c'; } - /* Found an abbreviated command, but it wasn't a boolean. + /* Found an abbreviated, but it wasn't a Boolean. * It means it is a user command, to be handled below. 
*/ + return ((int (*)(const Switch*, int)) (as[j].ptr))(as, j); + } - if (strcmp(user_command[k].s, "variables") == 0) + if (s[0] == '!') + { + Parse_Options opts = copts->popts; + char *out; + + out = dict_display_word_info(dict, s+1, opts); + if (NULL != out) { - printf(" Variable Controls Value\n"); - printf(" -------- -------- -----\n"); - for (i = 0; as[i].string != NULL; i++) + printf("%s\n", out); + free(out); + out = dict_display_word_expr(dict, s+1, opts); + if (NULL != out) { - printf(" "); - left_print_string(stdout, as[i].string, 13); - left_print_string(stdout, as[i].description, 46); - if (Float == as[i].param_type) - { - /* Float point print! */ - printf("%5.2f", *((double *)as[i].ptr)); - } - else - if ((Bool == as[i].param_type) || Int == as[i].param_type) - { - printf("%5d", ival(as[i])); - } - else - if (String == as[i].param_type) - { - printf("%s", *(char **)as[i].ptr); - } - if (Bool == as[i].param_type) - { - if (ival(as[i])) printf(" (On)"); else printf(" (Off)"); - } - printf("\n"); + printf("%s", out); + free(out); } - printf("\n"); - printf("Toggle a boolean variable as in \"!batch\"; "); - printf("set a variable as in \"!width=100\".\n"); - return 0; - } - - if (strcmp(user_command[k].s, "help") == 0) - { - printf("Special commands always begin with \"!\". Command and variable names\n"); - printf("can be abbreviated. Here is a list of the commands:\n\n"); - for (i=0; user_command[i].s != NULL; i++) { - printf(" !"); - left_print_string(stdout, user_command[i].s, 15); - left_print_string(stdout, user_command[i].str, 52); - printf("\n"); + else + { + prt_error("Error: '%s': Internal Error: Missing expression.\n", s+1); } - printf(" !! Print all the dictionary words that matches .\n"); - printf(" A wildcard * may be used to find multiple matches.\n"); - printf("\n"); - printf(" ! 
Toggle the specified boolean variable.\n"); - printf(" != Assign that value to that variable.\n"); - return 0; } - } + else + { + printf("Token \"%s\" matches nothing in the dictionary.\n", s+1); + } - if (s[0] == '!') - { - dict_display_word_info(dict, s+1, opts); - dict_display_word_expr(dict, s+1, opts); - return 0; + return 'c'; } #ifdef USE_REGEX_TOKENIZER if (s[0] == '/') { + /* experimental code -- ignore nested extern warning for now */ + extern int regex_tokenizer_test(Dictionary, const char *); int rc = regex_tokenizer_test(dict, s+1); if (0 != rc) printf("regex_tokenizer_test: rc %d\n", rc); - return 0; + return 'c'; } #endif @@ -318,8 +738,9 @@ /* Figure out which command it is .. it'll be the j'th one */ j = -1; - for (i=0; as[i].string != NULL; i++) + for (int i = 0; as[i].string != NULL; i++) { + if (Cmd == as[i].param_type) continue; if (strncasecmp(x, as[i].string, strlen(x)) == 0) { j = i; @@ -355,14 +776,14 @@ setival(as[j], val); printf("%s set to %d\n", as[j].string, val); - return 0; + return 'c'; } else if (as[j].param_type == Float) { - double val = -1.0; - val = atof(y); - if (val < 0.0) + char *err; + double val = strtod(y, &err); + if ('\0' != *err) { printf("Invalid value %s for variable %s Type \"!help\" or \"!variables\"\n", y, as[j].string); return -1; @@ -370,14 +791,14 @@ *((double *) as[j].ptr) = val; printf("%s set to %5.2f\n", as[j].string, val); - return 0; + return 'c'; } else if (as[j].param_type == String) { *((char **) as[j].ptr) = y; printf("%s set to %s\n", (char *)as[j].string, y); - return 0; + return 'c'; } else { @@ -390,9 +811,9 @@ /* Look for valid commands, but ones that needed an argument */ j = -1; count = 0; - for (i = 0; as[i].string != NULL; i++) + for (int i = 0; as[i].string != NULL; i++) { - if ((Bool != as[i].param_type) && + if ((Bool != as[i].param_type) && (Cmd != as[i].param_type) && strncasecmp(s, as[i].string, strlen(s)) == 0) { j = i; @@ -429,7 +850,7 @@ local.use_sat_solver = 
parse_options_get_use_sat_parser(opts); local.use_viterbi = parse_options_get_use_viterbi(opts); - local.screen_width = copts->screen_width; + local.screen_width = (int)copts->screen_width; local.echo_on = copts->echo_on; local.batch_mode = copts->batch_mode; local.panic_mode = copts->panic_mode; @@ -439,6 +860,7 @@ local.display_postscript = copts->display_postscript; local.display_ps_header = copts->display_ps_header; local.display_constituents = copts->display_constituents; + local.display_wordgraph = copts->display_wordgraph; local.display_bad = copts->display_bad; local.display_disjuncts = copts->display_disjuncts; @@ -470,7 +892,7 @@ parse_options_set_use_viterbi(opts, local.use_viterbi); parse_options_set_display_morphology(opts, local.display_morphology); - copts->screen_width = local.screen_width; + copts->screen_width = (size_t)local.screen_width; copts->echo_on = local.echo_on; copts->batch_mode = local.batch_mode; copts->panic_mode = local.panic_mode; @@ -480,6 +902,7 @@ copts->display_postscript = local.display_postscript; copts->display_ps_header = local.display_ps_header; copts->display_constituents = local.display_constituents; + copts->display_wordgraph = local.display_wordgraph; copts->display_bad = local.display_bad; copts->display_disjuncts = local.display_disjuncts; @@ -503,6 +926,10 @@ char *cline = strdup(line); rc = x_issue_special_command(cline, opts, dict); put_local_vars_in_opts(opts); + /* Read back: + * - So we can see if the option has actually got changed. + * - We need non-stale addresses for the test and debug variables. */ + put_opts_in_local_vars(opts); free(cline); if (save) opts->popts = save; @@ -517,7 +944,7 @@ co->panic_opts = parse_options_create(); /* "Unlimited" screen width when writing to a file, auto-updated - * later, wen writing to a tty. */ + * later, when writing to a tty. 
*/ co->screen_width = 16381; co->allow_null = true; co->batch_mode = false; @@ -533,6 +960,7 @@ co->display_disjuncts = false; co->display_links = false; co->display_senses = false; + co->display_wordgraph = 0; return co; } diff -Nru link-grammar-5.3.16/link-parser/command-line.h link-grammar-5.5.0/link-parser/command-line.h --- link-grammar-5.3.16/link-parser/command-line.h 2016-09-14 18:37:36.000000000 +0000 +++ link-grammar-5.5.0/link-parser/command-line.h 2018-04-26 02:38:14.000000000 +0000 @@ -11,10 +11,23 @@ /* */ /*************************************************************************/ -#include -#include - -LINK_BEGIN_DECLS /* Needed to keep MSVC6 happy */ +#include "link-grammar/link-includes.h" +/* If you compile your own program out of the LG source-code tree, + * include this file as follows in order to pick it up from the + * LG-package system installation. */ +/* #include */ + +#define COMMENT_CHAR '%' /* input lines beginning with this are ignored */ +#define WHITESPACE " \t\v\r\n" /* ASCII-only is sufficient here */ +#define FIELD_WIDTH(str, width) (int)((width)+strlen(str)-utf8_strwidth(str)) + +#if !defined(MIN) +#define MIN(X,Y) (((X) < (Y)) ? (X) : (Y)) +#endif + +#if !defined(MAX) +#define MAX(X,Y) (((X) > (Y)) ? 
(X) : (Y)) +#endif typedef struct { Parse_Options popts; @@ -35,12 +48,30 @@ bool display_disjuncts; /* if true, print disjuncts that were used */ bool display_links; /* if true, a list o' links is printed out */ bool display_senses; /* if true, sense candidates are printed out */ + int display_wordgraph; /* if true, the word-graph is displayed */ } Command_Options; -LINK_END_DECLS +typedef enum +{ + Int, + Bool, + Float, + String, + Cmd, +} ParamType; + +typedef struct +{ + const char *string; + ParamType param_type; + const char *description; + void *ptr; +} Switch; +void save_default_opts(Command_Options*); int issue_special_command(const char*, Command_Options*, Dictionary); Command_Options* command_options_create(void); void command_options_delete(Command_Options*); +void display_1line_help(const Switch *, bool); - +#define UNDOC "\1" /* undocumented command */ diff -Nru link-grammar-5.3.16/link-parser/lg_readline.c link-grammar-5.5.0/link-parser/lg_readline.c --- link-grammar-5.3.16/link-parser/lg_readline.c 2017-02-17 21:27:03.000000000 +0000 +++ link-grammar-5.5.0/link-parser/lg_readline.c 2018-04-26 02:38:14.000000000 +0000 @@ -32,12 +32,273 @@ #ifdef HAVE_WIDECHAR_EDITLINE #include +#include "command-line.h" + +extern Switch default_switches[]; +static const Switch **sorted_names; /* sorted command names */ static wchar_t * wc_prompt = NULL; static wchar_t * prompt(EditLine *el) { return wc_prompt; } +/** + * Try to complete the wide string in \p input, whose length is \p len. + * + * @param input Non-NUL-terminated wide string with length \p len. + * @param len Number of wide characters in \p input. + * @param is_help \p input is the argument of an !help command. + * @return 3 types of values: + * "" There are no completions. + * NULL Choices help have been printed. + * A NUL-terminated byte string (to be used as a completion). 
+ */ +static char *complete_command(const wchar_t *input, size_t len, bool is_help) +{ + const Switch **start = NULL; + const Switch **end; + const Switch **match; + const char *prev; + size_t addlen; + bool is_assignment = false; /* marking for the help facility */ + + if ((1 < len) && L'=' == input[len-1] && !is_help) + { + /* This is a variable assignment without a value. + * Arrange for displaying an help line (only - no completion). */ + is_assignment = true; + len--; /* disregard the ending '=' to enable command search. */ + } + + /* Our commands are ASCII strings. + * So for simplicity, convert the input to an ASCII string. */ + char *astr = malloc(len+1); + for (size_t i = 0; i < len; i++) + { + if (input[i] < 0 || input[i] > 127) + { + free(astr); + return NULL; /* unsupported input */ + } + astr[i] = (char)input[i]; + } + astr[len] = '\0'; + + /* Find the possible completions. */ + for (match = sorted_names; NULL != *match; match++) + { + if (UNDOC[0] == (*match)->description[0]) continue; + if (0 == strncmp(astr, (*match)->string, len)) + { + if (NULL == start) + { + start = match; + addlen = strlen((*match)->string) - len; + } + else + { + /* There is more than one match. Find the maximal common + * additional substring that we can append. */ + for (; addlen > 0; addlen--) + { + if (0 == strncmp((*match)->string+len, prev+len, addlen)) + break; + } + } + prev = (*match)->string; + } + else if (NULL != start) break; + } + free(astr); + end = match; + + if (NULL == start) return strdup(""); /* nothing found - no completions */ + int cnum = end - start; + + /* Show a possible completion list in a form of 1-line command help, + * for these cases: + * 1. Multiple completions with no common substring to add. + * 2. An assignment command. 
+ */ + if (((cnum > 1) && (0 == addlen)) || is_assignment) + { + bool all_commands = true; + + printf("\n"); + if (is_assignment) + { + for (size_t i = 0; &start[i] < end; i++) + if ((Cmd != start[i]->param_type)) all_commands = false; + } + do + { + if (UNDOC[0] == (*start)->description[0]) continue; + if (is_assignment && all_commands) + { + printf("\"%s\" is not a user variable\n", (*start)->string); + } + else + { + if (!is_assignment || (Cmd != (*start)->param_type)) + display_1line_help(*start, /*is_completion*/true); + } + } + while (++start < end); + + return NULL; + } + + /* Here addlen>0, so we have 1 or more possible completions. */ + char *addstr = malloc(addlen + 2); /* + '=' + '\0' */ + strncpy(addstr, (*start)->string + len, addlen); + + if ((1 == cnum) && !is_help) + { + /* A single completion. Indicate a success: + * - For a non-Boolean user variable appending '='. + * - For a command or a Boolean append ' '. */ + addstr[addlen++] = + ((Cmd != (*start)->param_type) && (Bool != (*start)->param_type)) + ? '=' : ' '; + } + else + { + /* Multiple completions. The maximal common substring will be returned. */ + } + + addstr[addlen] = '\0'; + return addstr; +} + +static int by_byteorder(const void *a, const void *b) +{ + const Switch * const *sa = a; + const Switch * const *sb = b; + + return strcmp((*sa)->string, (*sb)->string); +} + +static void build_command_list(const Switch ds[]) +{ + size_t cl_num = 0; + for (size_t i = 0; NULL != ds[i].string; i++) + { + if (UNDOC[0] == ds[i].description[0]) continue; + cl_num++; + } + sorted_names = malloc((cl_num+1) * sizeof(*sorted_names)); + + int j = 0; + for (size_t i = 0; NULL != ds[i].string; i++) + { + if (UNDOC[0] == ds[i].description[0]) continue; + sorted_names[j++] = &ds[i]; + } + sorted_names[cl_num] = NULL; + qsort(sorted_names, cl_num, sizeof(*sorted_names), by_byteorder); +} + +/** + * Complete variables / commands and also file name after "!file ". 
+ * Filenames with blanks are not supported well. + * This is partially a problem of editline, so there is no point to + * fix that. + * + * There is a special treatment of the !help command since it can be + * abbreviated up to one character and its argument position also + * needs a command completion (which is slightly different - no '=' is + * appended to variables). + * + * FIXME: The file completion knows about ~ and ~user. However, I + * don't know how to force their expanding, so they are not useful + * for now. + */ +static unsigned char lg_complete(EditLine *el, int ch) +{ + const LineInfoW *li = el_wline(el); + const wchar_t *word_start; /* the word to be completed */ + const wchar_t *word_end; + size_t word_len; + const wchar_t *ctemp; + unsigned char rc; + bool is_file_command = false; + bool is_help_command = false; + + /* A command must start with '!' on column 0. */ + if (L'!' != li->buffer[0]) + return CC_ERROR; + + /* Allow for whitespace after the initial "!" */ + for (ctemp = li->buffer+1; ctemp < li->lastchar; ctemp++) + if (!iswspace(*ctemp)) break; + + word_start = ctemp; + + /* Find the word (if any) end. */ + for (ctemp = word_start; ctemp < li->lastchar; ctemp++) + if (iswspace(*ctemp)) break; + + word_end = ctemp; + word_len = word_end - word_start; + + if (0 < word_len) + { + /* Don't try to complete in a middle of word. */ + if ((li->cursor >= word_start) && ((li->cursor < word_end))) + return CC_ERROR; + + /* Now check if it is !help or !file (or their abbreviation), as + * these commands need an argument completion. Whitespace is + * needed after these commands as else this is a command completion. */ + if (iswspace(*word_end)) + { + if (0 == wcsncmp(word_start, L"help", word_len)) + is_help_command = true; + else if (0 == wcsncmp(word_start, L"file", word_len)) + is_file_command = true; + + if (is_file_command || is_help_command) + { + /* Check if the command has an argument. 
*/ + for (ctemp = word_end + 1; ctemp < li->lastchar; ctemp++) + if (!iswspace(*ctemp)) break; + word_start = ctemp; + for (ctemp = word_start; ctemp < li->lastchar; ctemp++) + if (iswspace(*ctemp)) break; + word_end = ctemp; + word_len = word_end - word_start; + } + } + } + + /* Cannot complete if the cursor is not directly after the word. */ + if ((0 < word_len) && li->cursor != word_end) return CC_ERROR; + + /* Cannot complete if there is non-whitespace after the cursor. */ + for (ctemp = word_end; ctemp < li->lastchar; ctemp++) + if (!iswspace(*ctemp)) return CC_ERROR; + + if (is_file_command) + { + rc = _el_fn_complete(el, ch); + /* CC_NORM is returned if there is no possible completion. */ + return (rc == CC_NORM) ? CC_ERROR : rc; + } + + char *completion = complete_command(word_start, word_len, is_help_command); + if (NULL == completion) + rc = CC_REDISPLAY; /* completions got printed - redraw the input line */ + else if ('\0' == completion[0]) + rc = CC_NORM; /* nothing to complete */ + else if (el_insertstr(el, completion) == -1) + rc = CC_ERROR; /* no more space in the line buffer */ + else + rc = CC_REFRESH; /* there is a completion */ + + free(completion); + return (rc == CC_NORM) ? CC_ERROR : rc; +} + char *lg_readline(const char *mb_prompt) { static bool is_init = false; @@ -63,9 +324,9 @@ hist = history_winit(); /* Init built-in history */ el = el_init("link-parser", stdin, stdout, stderr); - history_w(hist, &ev, H_SETSIZE, 20); /* Remember 20 events */ + history_w(hist, &ev, H_SETSIZE, 100); + history_w(hist, &ev, H_SETUNIQUE, 1); el_wset(el, EL_HIST, history_w, hist); - el_source(el, NULL); /* Source the user's defaults file. */ history_w(hist, &ev, H_LOAD, HFILE); el_set(el, EL_SIGNAL, 1); /* Restore tty setting on returning to shell */ @@ -73,8 +334,15 @@ /* By default, it comes up in vi mode, with the editor not in * insert mode; and even when in insert mode, it drops back to * command mode at the drop of a hat. Totally confusing/lame. 
*/ - el_wset(el, EL_EDITOR, L"emacs"); - el_wset(el, EL_PROMPT_ESC, prompt, '\1'); /* Set the prompt function */ + el_set(el, EL_EDITOR, "emacs"); + el_wset(el, EL_PROMPT, prompt); /* Set the prompt function */ + + el_set(el, EL_ADDFN, "lg_complete", "command completion", lg_complete); + el_set(el, EL_BIND, "^I", "lg_complete", NULL); + + build_command_list(default_switches); + + el_source(el, NULL); /* Source the user's defaults file. */ } wc_line = el_wgets(el, &numc); diff -Nru link-grammar-5.3.16/link-parser/link-parser.c link-grammar-5.5.0/link-parser/link-parser.c --- link-grammar-5.3.16/link-parser/link-parser.c 2017-02-12 13:13:20.000000000 +0000 +++ link-grammar-5.5.0/link-parser/link-parser.c 2018-04-26 02:40:30.000000000 +0000 @@ -37,6 +37,7 @@ #include #include #include +#include /* Used for terminal resizing */ #ifndef _WIN32 @@ -68,8 +69,6 @@ #endif /* USE_VITERBI */ #define DISPLAY_MAX 1024 -#define COMMENT_CHAR '%' /* input lines beginning with this are ignored */ -#define WHITESPACE " \t\v\r\n" /* ASCII-only is sufficient here. */ static int batch_errors = 0; static int verbosity = 0; @@ -128,7 +127,7 @@ input_string[MAX_INPUT-2] = '\0'; - if ((in != stdin) || !isatty_stdin) + if (((in != stdin) && !check_return) || !isatty_stdin) { /* Get input from a file. */ pline = fgets(input_string, MAX_INPUT, in); @@ -139,7 +138,7 @@ pline = get_terminal_line(input_string, in, out); } - if (NULL == pline) return NULL; /* EOF */ + if (NULL == pline) return NULL; /* EOF or error */ if (('\0' != input_string[MAX_INPUT-2]) && ('\n' != input_string[MAX_INPUT-2])) @@ -152,7 +151,8 @@ { if (('\0' == pline[0]) || ('\r' == pline[0]) || ('\n' == pline[0])) return (char *)"\n"; /* Continue linkage display */ - input_pending = true; + if ((in == stdin) || ('!' 
== pline[0])) + input_pending = true; /* In !file mode allow commands */ return (char *)"x"; /* Stop linkage display */ } @@ -266,10 +266,10 @@ * the following can be used: * link-parser -limit=30000 -test=auto-next-linkage:20000 < file.batch */ -static int auto_next_linkage_test(const char *test) +static int auto_next_linkage_test(const char *test_opt) { char auto_next_linkage_str[] = ",auto-next-linkage"; - char *auto_next_linkage_pos = strstr(test, auto_next_linkage_str); + char *auto_next_linkage_pos = strstr(test_opt, auto_next_linkage_str); int max_display; if (auto_next_linkage_pos == NULL) return 0; @@ -343,16 +343,16 @@ if ((sentence_num_valid_linkages(sent) == 1) && !copts->display_bad) { - fprintf(stdout, " Unique linkage, "); + fprintf(stdout, "\tUnique linkage, "); } else if (copts->display_bad && (sentence_num_violations(sent, i) > 0)) { - fprintf(stdout, " Linkage %d (bad), ", num_displayed+1); + fprintf(stdout, "\tLinkage %d (bad), ", num_displayed+1); } else { - fprintf(stdout, " Linkage %d, ", num_displayed+1); + fprintf(stdout, "\tLinkage %d, ", num_displayed+1); } corpus_cost = linkage_corpus_cost(linkage); @@ -380,11 +380,11 @@ { if (!auto_next_linkage) { - if ((verbosity > 0) && (in == stdin) && isatty_stdin && isatty_stdout) + if ((verbosity > 0) && (!copts->batch_mode) && isatty_stdin && isatty_stdout) { fprintf(stdout, "Press RETURN for the next linkage.\n"); } - char *rc = fget_input_string(in, stdout, /*check_return*/true); + char *rc = fget_input_string(stdin, stdout, /*check_return*/true); if ((NULL == rc) || (*rc != '\n')) return rc; } } @@ -451,14 +451,16 @@ } } -static bool special_command(char *input_string, Command_Options* copts, Dictionary dict) +/** + * If input_string is !command, try to issue it. 
+ */ + +static char special_command(char *input_string, Command_Options* copts, Dictionary dict) { - if (input_string[0] == COMMENT_CHAR) return true; - if (input_string[0] == '!') { - issue_special_command(input_string+1, copts, dict); - return true; - } - return false; + if (input_string[0] == COMMENT_CHAR) return 'c'; + if (input_string[0] == '!') + return issue_special_command(input_string+1, copts, dict); + return 'n'; } static Label strip_off_label(char * input_string) @@ -485,12 +487,12 @@ parse_options_set_max_parse_time(opts, 60); parse_options_set_islands_ok(opts, false); parse_options_set_short_length(opts, 12); - parse_options_set_all_short_connectors(opts, 1); + parse_options_set_all_short_connectors(opts, true); parse_options_set_linkage_limit(opts, 100); parse_options_set_spell_guess(opts, 0); } -static void print_usage(char *str) +static void print_usage(char *str, int exit_value) { Command_Options *copts; fprintf(stderr, @@ -501,13 +503,12 @@ fprintf(stderr, "\nSpecial commands are:\n"); copts = command_options_create(); issue_special_command("var", copts, NULL); - exit(-1); + exit(exit_value); } /** * On Unix, this checks for the current window size, * and sets the output screen width accordingly. - * Not sure how MS Windows does this. */ static void check_winsize(Command_Options* copts) { @@ -524,11 +525,11 @@ /* Calculate the size of the console window. 
*/ if (GetConsoleScreenBufferInfo(console, &info) == 0) goto fail; - copts->screen_width = info.srWindow.Right - info.srWindow.Left; + copts->screen_width = (size_t)(info.srWindow.Right - info.srWindow.Left + 1); return; fail: - copts->screen_width = 79; + copts->screen_width = 80; return; #else struct winsize ws; @@ -552,7 +553,7 @@ */ if ((10 < ws.ws_col) && (16123 > ws.ws_col)) { - copts->screen_width = ws.ws_col - 1; + copts->screen_width = ws.ws_col; } #endif /* _WIN32 */ } @@ -572,7 +573,7 @@ isatty_stdout = isatty(fileno(stdout)); #ifdef _WIN32 - /* If compiled with MSVC/MSYS, we still support running under Cygwin. + /* If compiled with MSVC/MinGW, we still support running under Cygwin. * This is done by checking running_under_cygwin to resolve * incompatibilities. */ const char *ostype = getenv("OSTYPE"); @@ -597,22 +598,28 @@ i++; } + if ((i < argc) && strcmp("--help", argv[i]) == 0) + { + print_usage(argv[0], 0); + } + for (; ipopts; - if (copts == NULL || opts == NULL || copts->panic_opts == NULL) + if (copts == NULL || copts->panic_opts == NULL) { prt_error("Fatal error: unable to create parse options\n"); exit(-1); } + opts = copts->popts; setup_panic_parse_options(copts->panic_opts); copts->panic_mode = true; @@ -624,14 +631,21 @@ parse_options_set_short_length(opts, 16); parse_options_set_islands_ok(opts, false); + save_default_opts(copts); /* Options so far are the defaults */ + /* Process command line variable-setting commands (only) */ for (i = 1; i < argc; i++) { if (argv[i][0] == '-') { const char *var = argv[i] + ((argv[i][1] != '-') ? 
1 : 2); - if ((var[0] != '!') && issue_special_command(var, copts, NULL)) - print_usage(argv[0]); + if ((var[0] != '!') && (0 > issue_special_command(var, copts, NULL))) + print_usage(argv[0], -1); + } + else if (i != 1) + { + prt_error("Fatal error: Unknown argument '%s'.\n", argv[i]); + print_usage(argv[0], -1); } } @@ -664,7 +678,7 @@ if ((argv[i][0] == '-') && (argv[i][1] == '!')) { if (issue_special_command(argv[i]+1, copts, dict)) - print_usage(argv[0]); + print_usage(argv[0], -1); } } @@ -682,6 +696,10 @@ char *input_string; Sentence sent = NULL; + /* Make sure stderr is shown even when MSVC binary runs under + * Cygwin/MSYS pty (in that case it is fully buffered(!)). */ + fflush(stderr); + verbosity = parse_options_get_verbosity(opts); debug = parse_options_get_debug(opts); test = parse_options_get_test(opts); @@ -691,6 +709,9 @@ if (NULL == input_string) { + if (ferror(input_fh)) + prt_error("Error: Read: %s\n", strerror(errno)); + if (input_fh == stdin) break; fclose (input_fh); input_fh = stdin; @@ -704,35 +725,50 @@ *p = '\0'; } - if ((strcmp(input_string, "!quit") == 0) || - (strcmp(input_string, "!exit") == 0)) break; + /* If the input string is just whitespace, then ignore it. */ + if (strspn(input_string, WHITESPACE) == strlen(input_string)) + continue; + + char command = special_command(input_string, copts, dict); + if ('e' == command) break; /* It was an exit command */ + if ('c' == command) continue; /* It was another command */ + if (-1 == command) continue; /* It was a bad command */ /* We have to handle the !file command inline; its too hairy * otherwise ... 
*/ - if (strncmp(input_string, "!file", 5) == 0) + if ('f' == command) { - char * filename = &input_string[6]; + char * filename = &input_string[strcspn(input_string, WHITESPACE)] + 1; int fnlen = strlen(filename); + if (0 == fnlen) + { + prt_error("Error: Missing file name argument\n"); + continue; + } + if ('\n' == filename[fnlen-1]) filename[fnlen-1] = '\0'; + struct stat statbuf; + if ((0 == stat(filename, &statbuf)) && statbuf.st_mode & S_IFDIR) + { + fprintf(stderr, "Error: Cannot open %s: %s\n", + filename, strerror(EISDIR)); + continue; + } + input_fh = fopen(filename, "r"); + if (NULL == input_fh) { - int perr = errno; - fprintf(stderr, "Error: %s (%d) %s\n", - filename, perr, strerror(perr)); + fprintf(stderr, "Error: Cannot open %s: %s\n", + filename, strerror(errno)); input_fh = stdin; continue; } continue; } - /* If the input string is just whitespace, then ignore it. */ - if (strspn(input_string, WHITESPACE) == strlen(input_string)) - continue; - - if (special_command(input_string, copts, dict)) continue; if (!copts->batch_mode) batch_in_progress = false; if ('\0' != test[0]) @@ -793,6 +829,32 @@ sent = NULL; continue; } + + if (0 != copts->display_wordgraph) + { + const char *wg_display_flags = ""; /* default flags */ + switch (copts->display_wordgraph) + { + case 1: /* default flags */ + break; + case 2: /* subgraphs with a legend */ + wg_display_flags = "sl"; + break; + case 3: + { + /* Use esoteric flags from the test user variable. */ + const char wg[] = ",wg"; + const char *s = strstr(test, wg); + if (NULL != s) wg_display_flags = s+2; + } + break; + default: + prt_error("Warning: wordgraph=%d: Unknown value, using 1\n", + copts->display_wordgraph); + copts->display_wordgraph = 1; + } + sentence_display_wordgraph(sent, wg_display_flags); + } #if 0 /* Try again, this time omitting the requirement for * definite articles, etc. 
This should allow for the parsing @@ -881,7 +943,7 @@ parse_options_get_use_sat_parser(opts)); parse_options_reset_resources(copts->panic_opts); parse_options_set_verbosity(copts->panic_opts, verbosity); - num_linkages = sentence_parse(sent, copts->panic_opts); + (void)sentence_parse(sent, copts->panic_opts); if (verbosity > 0) { if (parse_options_timer_expired(copts->panic_opts)) @@ -891,24 +953,21 @@ /* print_total_time(opts); */ + const char *rc = ""; if (copts->batch_mode) { batch_process_some_linkages(label, sent, copts); } else { - const char *rc = process_some_linkages(input_fh, sent, copts); - if (NULL == rc) - { - sentence_delete(sent); - sent = NULL; - break; - } + rc = process_some_linkages(input_fh, sent, copts); } - fflush(stdout); + fflush(stdout); sentence_delete(sent); sent = NULL; + + if ((NULL == rc) && (input_fh == stdin)) break; } } diff -Nru link-grammar-5.3.16/link-parser/Makefile.am link-grammar-5.5.0/link-parser/Makefile.am --- link-grammar-5.3.16/link-parser/Makefile.am 2016-11-17 18:36:07.000000000 +0000 +++ link-grammar-5.5.0/link-parser/Makefile.am 2018-03-04 14:56:46.000000000 +0000 @@ -26,12 +26,6 @@ link_parser_LDADD += $(SQLITE3_LIBS) endif -if WITH_VITERBI -link_parser_LDADD += $(top_builddir)/viterbi/libvitacog.la -link_parser_LDADD += $(LIBGC_LIBS) -endif - - # Installation checks, to be manually done after "make install". # link-parser checks: # 1. Show the location of its binary. 
diff -Nru link-grammar-5.3.16/link-parser/Makefile.in link-grammar-5.5.0/link-parser/Makefile.in --- link-grammar-5.3.16/link-parser/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/link-parser/Makefile.in 2018-04-29 21:40:04.000000000 +0000 @@ -94,9 +94,6 @@ host_triplet = @host@ bin_PROGRAMS = link-parser$(EXEEXT) @HAVE_SQLITE_TRUE@am__append_1 = $(SQLITE3_LIBS) -@WITH_VITERBI_TRUE@am__append_2 = \ -@WITH_VITERBI_TRUE@ $(top_builddir)/viterbi/libvitacog.la \ -@WITH_VITERBI_TRUE@ $(LIBGC_LIBS) subdir = link-parser ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/m4/libtool.m4 \ @@ -116,13 +113,9 @@ link_parser_OBJECTS = $(am_link_parser_OBJECTS) am__DEPENDENCIES_1 = @HAVE_SQLITE_TRUE@am__DEPENDENCIES_2 = $(am__DEPENDENCIES_1) -@WITH_VITERBI_TRUE@am__DEPENDENCIES_3 = \ -@WITH_VITERBI_TRUE@ $(top_builddir)/viterbi/libvitacog.la \ -@WITH_VITERBI_TRUE@ $(am__DEPENDENCIES_1) link_parser_DEPENDENCIES = \ $(top_builddir)/link-grammar/liblink-grammar.la \ - $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2) \ - $(am__DEPENDENCIES_3) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) am__v_lt_0 = --silent @@ -198,6 +191,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -226,6 +220,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -234,13 +229,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS 
= @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -307,12 +305,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -388,7 +386,7 @@ link_parser_LDFLAGS = $(LINK_CFLAGS) link_parser_LDADD = $(top_builddir)/link-grammar/liblink-grammar.la \ - $(LIBEDIT_LIBS) $(am__append_1) $(am__append_2) + $(LIBEDIT_LIBS) $(am__append_1) all: all-am .SUFFIXES: diff -Nru link-grammar-5.3.16/link-parser/parser-utilities.c link-grammar-5.5.0/link-parser/parser-utilities.c --- link-grammar-5.3.16/link-parser/parser-utilities.c 2017-01-19 18:40:29.000000000 +0000 +++ link-grammar-5.5.0/link-parser/parser-utilities.c 2018-03-11 23:13:19.000000000 +0000 @@ -20,6 +20,7 @@ #ifdef __MINGW32__ #include #endif /* __MINGW32__ */ +#include #include "parser-utilities.h" @@ -64,9 +65,9 @@ return NULL; } - /* Make sure we don't have conversion problems, by searching for '�'. */ - const char *invlid_char = strstr(utf8inbuf, "\xEF\xBF\xBD"); - if (NULL != invlid_char) + /* Make sure we don't have conversion problems, by searching for U+FFFD. */ + const char *invalid_char = strstr(utf8inbuf, "\xEF\xBF\xBD"); + if (NULL != invalid_char) prt_error("Warning: Invalid input character encountered.\n"); /* ^Z is read as a character. Convert it to an EOF indication. */ @@ -183,16 +184,17 @@ /* Now check the name pattern. The filename of a Cygwin pseudo tty pipe looks like this: - \cygwin-%16llx-pty%d-{to,from}-master + \{cygwin,msys}-%16llx-pty%d-{to,from}-master %16llx is the hash of the Cygwin installation, (to support multiple parallel installations), %d id the pseudo tty number, "to" or "from" differs the pipe direction. "from" is a stdin, "to" a stdout-like pipe. 
*/ cp = pfni->FileName; - if (!wcsncmp(cp, L"\\cygwin-", 8) && !wcsncmp(cp + 24, L"-pty", 4)) + if ((!wcsncmp(cp, L"\\cygwin-", 8) && !wcsncmp(cp + 24, L"-pty", 4)) || + (!wcsncmp(cp, L"\\msys-", 6) && !wcsncmp(cp + 22, L"-pty", 4))) { - cp = wcschr(cp + 28, '-'); + cp = wcschr(cp + 26, '-'); if (!cp) goto no_tty; if (!wcscmp(cp, L"-from-master") || !wcscmp(cp, L"-to-master")) diff -Nru link-grammar-5.3.16/link-parser/parser-utilities.h link-grammar-5.5.0/link-parser/parser-utilities.h --- link-grammar-5.3.16/link-parser/parser-utilities.h 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/link-parser/parser-utilities.h 2018-04-22 14:57:33.000000000 +0000 @@ -17,11 +17,13 @@ #define MAX_INPUT 1024 -#if !defined(MIN) -#define MIN(X,Y) ( ((X) < (Y)) ? (X) : (Y)) +#ifdef _WIN32 +#ifndef __MINGW32__ +/* There is no ssize_t definition in native Windows. */ +#include +typedef SSIZE_T ssize_t; #endif -#ifdef _WIN32 #define strcasecmp _stricmp #ifndef strncasecmp #define strncasecmp _strnicmp diff -Nru link-grammar-5.3.16/Makefile.am link-grammar-5.5.0/Makefile.am --- link-grammar-5.3.16/Makefile.am 2017-02-10 20:16:17.000000000 +0000 +++ link-grammar-5.5.0/Makefile.am 2018-04-29 21:29:01.000000000 +0000 @@ -3,7 +3,6 @@ # SUBDIRS = \ link-grammar \ - viterbi \ bindings \ link-parser \ tests \ @@ -33,7 +32,7 @@ docker/docker-parser/Dockerfile \ docker/docker-python/Dockerfile \ docker/docker-server/Dockerfile \ - m4/varcheckpoint.m4 + m4/varcheckpoint.m4 \ msvc14/LGlib-features.props \ msvc14/LinkGrammarExe.vcxproj \ msvc14/LinkGrammarExe.vcxproj.filters \ @@ -51,6 +50,7 @@ msvc14/Python3.vcxproj.filters \ msvc14/README.md \ msvc14/make-check.py \ - mingw/README.Cygwin \ - mingw/README.MSYS \ + mingw/README-Cygwin.md \ + mingw/README-MSYS.md \ + mingw/README-MSYS2.md \ TODO diff -Nru link-grammar-5.3.16/Makefile.in link-grammar-5.5.0/Makefile.in --- link-grammar-5.3.16/Makefile.in 2017-04-15 21:49:48.000000000 +0000 +++ link-grammar-5.5.0/Makefile.in 2018-04-29 
21:40:02.000000000 +0000 @@ -191,7 +191,7 @@ am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/link-grammar.pc.in \ $(srcdir)/link-grammar.spec.in AUTHORS ChangeLog NEWS TODO \ compile config.guess config.sub install-sh ltmain.sh missing \ - py-compile + py-compile ylwrap DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) distdir = $(PACKAGE)-$(VERSION) top_distdir = $(distdir) @@ -240,6 +240,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -268,6 +269,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -276,13 +278,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -349,12 +354,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -421,7 +426,6 @@ # SUBDIRS = \ link-grammar \ - viterbi \ bindings \ link-parser \ tests \ @@ -449,7 +453,28 @@ docker/docker-parser/Dockerfile \ docker/docker-python/Dockerfile \ docker/docker-server/Dockerfile \ - m4/varcheckpoint.m4 + m4/varcheckpoint.m4 \ + msvc14/LGlib-features.props \ + msvc14/LinkGrammarExe.vcxproj \ + msvc14/LinkGrammarExe.vcxproj.filters \ + 
msvc14/LinkGrammarJava.vcxproj \ + msvc14/LinkGrammarJava.vcxproj.filters \ + msvc14/LinkGrammar.sln \ + msvc14/LinkGrammar.vcxproj \ + msvc14/LinkGrammar.vcxproj.filters \ + msvc14/Local.props \ + msvc14/confvar.bat \ + msvc14/MSVC-common.props \ + msvc14/post-build.bat \ + msvc14/Python2.vcxproj \ + msvc14/Python2.vcxproj.filters \ + msvc14/Python3.vcxproj.filters \ + msvc14/README.md \ + msvc14/make-check.py \ + mingw/README-Cygwin.md \ + mingw/README-MSYS.md \ + mingw/README-MSYS2.md \ + TODO all: all-recursive @@ -945,26 +970,6 @@ .PRECIOUS: Makefile - msvc14/LGlib-features.props \ - msvc14/LinkGrammarExe.vcxproj \ - msvc14/LinkGrammarExe.vcxproj.filters \ - msvc14/LinkGrammarJava.vcxproj \ - msvc14/LinkGrammarJava.vcxproj.filters \ - msvc14/LinkGrammar.sln \ - msvc14/LinkGrammar.vcxproj \ - msvc14/LinkGrammar.vcxproj.filters \ - msvc14/Local.props \ - msvc14/confvar.bat \ - msvc14/MSVC-common.props \ - msvc14/post-build.bat \ - msvc14/Python2.vcxproj \ - msvc14/Python2.vcxproj.filters \ - msvc14/Python3.vcxproj.filters \ - msvc14/README.md \ - msvc14/make-check.py \ - mingw/README.Cygwin \ - mingw/README.MSYS \ - TODO # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. diff -Nru link-grammar-5.3.16/man/link-parser.man1 link-grammar-5.5.0/man/link-parser.man1 --- link-grammar-5.3.16/man/link-parser.man1 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/man/link-parser.man1 2018-04-22 14:57:33.000000000 +0000 @@ -48,10 +48,12 @@ .. .\" define .EX/.EE (for multiline user-command examples; normal Courier font) .de EX +.Vb .nf .ft CW .. .de EE +.Ve .ft P .fi .. 
@@ -260,7 +262,7 @@ .IP 1 Treebank-style constituent tree .PP -.IP 1 +.IP 2 Flat, bracketed tree [A like [B this B] A] .IP 3 Flat, treebank-style tree (A like (B this)) diff -Nru link-grammar-5.3.16/man/Makefile.am link-grammar-5.5.0/man/Makefile.am --- link-grammar-5.3.16/man/Makefile.am 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/man/Makefile.am 2018-01-04 07:06:21.000000000 +0000 @@ -1,9 +1,13 @@ ## Process this file with automake to produce Makefile.in. +man_MANS = link-parser.1 dist_man_MANS = link-parser.man1 # Change the system dictionary location according the configuration options. # This tracks --datadir. link-parser.1: link-parser.man1 - sed 's/%DATA_DIR%/$(datadir)/' $< >$@ + sed 's,%DATA_DIR%,$(datadir),' $< >$@ + +clean-local: + -rm -f link-parser.1 diff -Nru link-grammar-5.3.16/man/Makefile.in link-grammar-5.5.0/man/Makefile.in --- link-grammar-5.3.16/man/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/man/Makefile.in 2018-04-29 21:40:04.000000000 +0000 @@ -118,6 +118,37 @@ n|no|NO) false;; \ *) (install-info --version) >/dev/null 2>&1;; \ esac +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; +am__install_max = 40 +am__nobase_strip_setup = \ + srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` +am__nobase_strip = \ + for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" +am__nobase_list = $(am__nobase_strip_setup); \ + for p in $$list; do echo "$$p $$p"; done | \ + sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ + $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ + if (++n[$$2] == $(am__install_max)) \ + { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ + END { for (dir in files) print dir, files[dir] }' +am__base_list = \ + sed 
'$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ + sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' +am__uninstall_files_from_dir = { \ + test -z "$$files" \ + || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ + || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ + $(am__cd) "$$dir" && rm -f $$files; }; \ + } +man1dir = $(mandir)/man1 +am__installdirs = "$(DESTDIR)$(man1dir)" +NROFF = nroff +MANS = $(dist_man_MANS) $(man_MANS) am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) am__DIST_COMMON = $(dist_man_MANS) $(srcdir)/Makefile.in DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) @@ -127,6 +158,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -155,6 +187,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -163,13 +196,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -236,12 +272,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -302,6 +338,7 @@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ +man_MANS = link-parser.1 dist_man_MANS = 
link-parser.man1 all: all-am @@ -341,6 +378,49 @@ clean-libtool: -rm -rf .libs _libs +install-man1: $(dist_man_MANS) $(man_MANS) + @$(NORMAL_INSTALL) + @list1=''; \ + list2='$(dist_man_MANS) $(man_MANS)'; \ + test -n "$(man1dir)" \ + && test -n "`echo $$list1$$list2`" \ + || exit 0; \ + echo " $(MKDIR_P) '$(DESTDIR)$(man1dir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(man1dir)" || exit 1; \ + { for i in $$list1; do echo "$$i"; done; \ + if test -n "$$list2"; then \ + for i in $$list2; do echo "$$i"; done \ + | sed -n '/\.1[a-z]*$$/p'; \ + fi; \ + } | while read p; do \ + if test -f $$p; then d=; else d="$(srcdir)/"; fi; \ + echo "$$d$$p"; echo "$$p"; \ + done | \ + sed -e 'n;s,.*/,,;p;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \ + -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,' | \ + sed 'N;N;s,\n, ,g' | { \ + list=; while read file base inst; do \ + if test "$$base" = "$$inst"; then list="$$list $$file"; else \ + echo " $(INSTALL_DATA) '$$file' '$(DESTDIR)$(man1dir)/$$inst'"; \ + $(INSTALL_DATA) "$$file" "$(DESTDIR)$(man1dir)/$$inst" || exit $$?; \ + fi; \ + done; \ + for i in $$list; do echo "$$i"; done | $(am__base_list) | \ + while read files; do \ + test -z "$$files" || { \ + echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(man1dir)'"; \ + $(INSTALL_DATA) $$files "$(DESTDIR)$(man1dir)" || exit $$?; }; \ + done; } + +uninstall-man1: + @$(NORMAL_UNINSTALL) + @list=''; test -n "$(man1dir)" || exit 0; \ + files=`{ for i in $$list; do echo "$$i"; done; \ + l2='$(dist_man_MANS) $(man_MANS)'; for i in $$l2; do echo "$$i"; done | \ + sed -n '/\.1[a-z]*$$/p'; \ + } | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \ + -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \ + dir='$(DESTDIR)$(man1dir)'; $(am__uninstall_files_from_dir) tags TAGS: ctags CTAGS: @@ -380,8 +460,11 @@ done check-am: all-am check: check-am -all-am: Makefile +all-am: Makefile $(MANS) installdirs: + for dir in "$(DESTDIR)$(man1dir)"; do \ + test -z "$$dir" || $(MKDIR_P) "$$dir"; \ + done install: install-am install-exec: 
install-exec-am install-data: install-data-am @@ -414,7 +497,7 @@ @echo "it deletes files that may require special tools to rebuild." clean: clean-am -clean-am: clean-generic clean-libtool mostlyclean-am +clean-am: clean-generic clean-libtool clean-local mostlyclean-am distclean: distclean-am -rm -f Makefile @@ -432,7 +515,7 @@ info-am: -install-data-am: +install-data-am: install-man install-dvi: install-dvi-am @@ -448,7 +531,7 @@ install-info-am: -install-man: +install-man: install-man1 install-pdf: install-pdf-am @@ -476,21 +559,24 @@ ps-am: -uninstall-am: +uninstall-am: uninstall-man + +uninstall-man: uninstall-man1 .MAKE: install-am install-strip .PHONY: all all-am check check-am clean clean-generic clean-libtool \ - cscopelist-am ctags-am distclean distclean-generic \ + clean-local cscopelist-am ctags-am distclean distclean-generic \ distclean-libtool distdir dvi dvi-am html html-am info info-am \ install install-am install-data install-data-am install-dvi \ install-dvi-am install-exec install-exec-am install-html \ install-html-am install-info install-info-am install-man \ - install-pdf install-pdf-am install-ps install-ps-am \ - install-strip installcheck installcheck-am installdirs \ - maintainer-clean maintainer-clean-generic mostlyclean \ - mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ - tags-am uninstall uninstall-am + install-man1 install-pdf install-pdf-am install-ps \ + install-ps-am install-strip installcheck installcheck-am \ + installdirs maintainer-clean maintainer-clean-generic \ + mostlyclean mostlyclean-generic mostlyclean-libtool pdf pdf-am \ + ps ps-am tags-am uninstall uninstall-am uninstall-man \ + uninstall-man1 .PRECIOUS: Makefile @@ -498,7 +584,10 @@ # Change the system dictionary location according the configuration options. # This tracks --datadir. 
link-parser.1: link-parser.man1 - sed 's/%DATA_DIR%/$(datadir)/' $< >$@ + sed 's,%DATA_DIR%,$(datadir),' $< >$@ + +clean-local: + -rm -f link-parser.1 # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. diff -Nru link-grammar-5.3.16/mingw/README-Cygwin.md link-grammar-5.5.0/mingw/README-Cygwin.md --- link-grammar-5.3.16/mingw/README-Cygwin.md 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/mingw/README-Cygwin.md 2018-03-04 14:56:46.000000000 +0000 @@ -0,0 +1,109 @@ +Building on Windows using MinGW under Cygwin +-------------------------------------------- + + Tested on an up-to-date Cygwin 2.5.2 under Windows 10, + with link-grammar version 5.3.8. + + +Supported target versions +------------------------- + The intention is to support versions from Vista on (some WIN32 + functions which are used are not supported in earlier versions.) + The resulted link-parser executable is able to run under Cygwin too. + + The system compatibility definitions: + In configure.ac: + AC_DEFINE(NTDDI_VERSION, NTDDI_VISTA) + AC_DEFINE(_WIN32_WINNT, _WIN32_WINNT_VISTA) + + +Configuring +----------- + + $ LDFLAGS=-L$gnuregex/lib CPPFLAGS=-I$gnuregex/include \ + configure -host=x86_64-w64-mingw32 \ + --enable-wordgraph-display \ + --disable-sat-solver \ + --disable-python-bindings \ + --disable-java-bindings + + In the configure command above, $gnuregex points to the directory of the + POSIX regex for Windows. The library basename must b:e "libregex". The + -host value is for compiling for 64-bits. + + The SAT solver cannot be enabled for now due to a missing definition in the + build system. Python bindings fail because it currently tries to use the + Cygwin Python system. More development work is needed on these. + + (The Java bindings has not been tested in this version. Most probably the + way described in [README-MSYS.md](README-MSYS.md) can be used.) 
+ + $ make + + $ make install + + The dictionaries are installed by default under + /usr/local/share/link-grammar. + + +Running +------- + + * From the sources directory, using cmd.exe Windows console: + + Note: ^Z/^D/^C are not supported when running under Cygwin! + In particular, don't press ^Z - it may crash or stuck the window. + To exit, use the !exit command. + + > PATH-TO-LG-CONF-DIRECTORY\link-parser\link-parser [arguments] + + * Form the Cygwin shell: + + Before installation: + $ PATH-TO-LG-CONF-DIRECTORY/link-parser/link-parser [args] + + After "make install" (supposing /usr/local/bin is in the PATH): + $ link-parser [arguments] + + To run the executable from other location, liblink-grammar-5.dll needs to be + in a directory in PATH (e.g. in the directory of the executable). + + For more details, see "RUNNING the program" in the main + [README](/README.md#running-the-program). + +Limitations +----------- + + Since MinGW currently doesn't support locale_t and the isw*() functions + that use it, the library doesn't support per-dictionary locale setting, + which just means that if several dictionaries are used, all of them share + the same global locale, so if they are used from different threads the + must use languages with the same codeset. If the program is not + multi-threaded, dictionaries of several different languages can be created + and then used one by one, provided that the global locale is switched + (using setlocale()) to the locale of each dictionary just before using + this dictionary. + + (To get a complete per-dictionary locale support, the library should be + compiled using MSVC.) + + +Implementation Notes +-------------------- + + MinGW uses by default a Windows STDIO from an unsupported Windows library + that just happens to be included even in Windows 10. This STDIO is not C99 + compliant, and in particular doesn't support the %z formats (it crashes + when it encounters them). 
+ + Hence __USE_MINGW_ANSI_STDIO=1 is defined, so MinGW uses its own C99 + compatible STDIO. However, the *printf() functions of these implementation + cannot print UTF-8 to the console (to files/pipe they print UTF-8 just + fine), because they use Windows' putchar(), which cannot write UTF-8 to + the console. A workaround is implemented in the LG library and in + link-parser. + + If you write a C/C++ program (to be compiled with MinGW) that uses the + library and needs to print to the console, and this problem is not fixed + by then (in Windows or MinGW), then you need to copy this workaround + implementation. See utilities.c and/or parser_utilities.c. diff -Nru link-grammar-5.3.16/mingw/README-MSYS2.md link-grammar-5.5.0/mingw/README-MSYS2.md --- link-grammar-5.3.16/mingw/README-MSYS2.md 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/mingw/README-MSYS2.md 2018-03-04 14:56:46.000000000 +0000 @@ -0,0 +1,120 @@ +BUILDING on Windows (MinGW/MSYS2) +================================= + +General +------- +In this document we will suppose you use a 64-bit machine. + +MinGW/MSYS2 uses the Gnu toolset to compile Windows programs for +Windows. This is probably the easiest way to obtain workable Java +bindings for Windows. + +In order to prepare this document, MSYS2 version 20161025 was installed. + +A note for new MSYS2 users +-------------------------- +Download and install MinGW/MSYS2 from http://msys2.org. + +MSYS2 uses the `pacman`package management. If you are not familiar +with it, consult the +[Pacman Rosetta](https://wiki.archlinux.org/index.php/Pacman/Rosetta). + +Also note that MSYS2 have two working modes (shells): MSYS and MINGW64. +The difference between them is the value of environment variables, e.g. +PATH, PKG_CONFIG_PATH and MANPATH. For native Windows software +development, the MINGW64 shell must be used (the MSYS shell is for +MinGW/MSYS2 development- applications compiled from it recognize the MSYS +virtual filesystem). 
+ +First install `mingw-w64-x86_64-toolchain`. Also install the rest of the +prerequisite tools from the list in the main +[README](/README.md#building-from-the-github-repository). + +You may find that the system is extremely slow. In that case, consult the +Web for how to make tweaks that considerably speed it up. In addition, to +avoid I/O trashing, don't use a too high `make` parallelism (maybe even +only `-j 2`). + +Packages that are used by the link-grammar library +-------------------------------------------------- + +mingw-w64-x86_64-sqlite3
+mingw-w64-x86_64-libtre-git
+mingw-w64-x86_64-gettext
+mingw-w64-x86_64-hunspell, mingw-w64-x86_64-hunspell-en (optional)
+libedit-devel (optional but recommended)
+zlib-devel (optional - for the SAT parser)
+ +Java bindings +------------- +Install [Apache Ant](ant.apache.org/manual/install.html) and +Java JDK & JRE (both under Windows). Make sure you have +the environment variable JAVA_HOME set as needed (under Windows, +the MINGW64 shell will inherit it). + +Then build and install link-grammar with + + mkdir build + cd build + ../configure + make + make install + +In MINGW64, the default install prefix is `/mingw64` which is mapped to +`C:\msys64\mingw64`, so after 'make install', the libraries and executable +will be found at `C:\msys64\mingw64\bin` and the dictionary files at +`C:\msys64\mingw64\share\link-grammar`. + +In case you would like to build with hunspell, invoke `configure` as follows:
+`--with-hunspell-dictdir=`cygpath -m /mingw64/share/myspell/dicts` + + +Python bindings +--------------- +The bindings for Python2 (package `python2 2.7.13-1` for MSYS) work fine.
+All the tests pass (when configured with `hunspell` and the SAT parser). + +Here is a way to work with it from Windows: +``` +C:\>cd msys64\mingw64\bin +C:\msys64\mingw64\bin>C:\msys64\usr\bin\python2.exe +Python 2.7.13 (default, Feb 14 2017, 14:46:01) +[GCC 6.3.0] on msys +Type "help", "copyright", "credits" or "license" for more information. +>>> import sys +>>> sys.path.insert(0, 'C:\msys64\mingw64\lib\python2.7\site-packages') +>>> import linkgrammar +>>> +``` +(Alternatively, you can add `C:\>cd msys64\mingw64;C:\msys64\usr\bin` to the PATH +and set `PYTHONPATH=C:\msys64\mingw64\lib\python2.7\site-packages`). + +However the bindings for the MINGW64 Python2 and Python3 don't work. For the MINGW64 version +`mingw-w64-x86_64-python3-3.6.4-2` (and similarly for mingw-w64-x86_64-python2-2.7.14-5`) +it even doesn't compile due to problems in its `pyconfig.h`: +``` +/mingw64/include/python3.6m/pyconfig.h:1546:15: error: two or more data types in declaration specifiers + #define uid_t int +/mingw64/include/python3.6m/pyport.h:705:2: error: #error "LONG_BIT definition appears wrong for platform (bad gcc/glibc config?)." + #error "LONG_BIT definition appears wrong for platform (bad gcc/glibc config?)." +``` + +The binding for the MSYS Python3 (named just `python`) compile, but +importing `link-grammar` causes the infamous problem of +`Fatal Python error: PyThreadState_Get: no current thread`. +(This version also insists that the _clinkparser module name will end with +`.pyd` and not `dll`.) + +Test results +------------ +All the `tests.py` tests pass, along with all the tests in the `tests` +directory (including the `multi-java` test) and `make installcheck`. + +Here is how to run the `java-multi` directly:
+`cd tests; make TEST_LOGS=multi-java check-TESTS` + +Running +------- +On MINGW64, just invoke `link-parser`.
+In Windows, put `C:\msys64\mingw64\bin` in your PATH (or cd to it), then invoke `link-parser`. +For more details see [RUNNING the program](/README.md#running-the-program). diff -Nru link-grammar-5.3.16/mingw/README-MSYS.md link-grammar-5.5.0/mingw/README-MSYS.md --- link-grammar-5.3.16/mingw/README-MSYS.md 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/mingw/README-MSYS.md 2018-03-04 14:56:46.000000000 +0000 @@ -0,0 +1,60 @@ +Note: This document is obsolete. +Instead, see [BUILDING on Windows (MinGW/MSYS2)](README-MSYS2.md). + +BUILDING on Windows (MinGW/MSYS) +-------------------------------- + MinGW/MSYS uses the Gnu toolset to compile Windows programs for + Windows. This is probably the easiest way to obtain workable Java + bindings for Windows. Download and install MinGW, MSYS and MSYS-DTK + from http://mingw.org. + + Then build and install link-grammar with + + ./configure + make + make install + + If you used the standard installation paths, the directory /usr/ is + mapped to C:\msys\1.0, so after 'make install', the libraries and + executable will be found at C:\msys\1.0\local\bin and the dictionary + files at C:\msys\1.0\local\share\link-grammar. + +Running +------- + See "RUNNING the program" in the main README. 
+ + +Java bindings +------------- + + In order to use the Java bindings you'll need to build two extra + DLLs, by running the following commands from the link-grammar base + directory: + + cd link-grammar + + gcc -g -shared -Wall -D_JNI_IMPLEMENTATION_ -Wl,--kill-at \ + .libs/analyze-linkage.o .libs/and.o .libs/api.o \ + .libs/build-disjuncts.o .libs/constituents.o \ + .libs/count.o .libs/disjuncts.o .libs/disjunct-utils.o \ + .libs/error.o .libs/expand.o .libs/extract-links.o \ + .libs/fast-match.o .libs/idiom.o .libs/massage.o \ + .libs/post-process.o .libs/pp_knowledge.o .libs/pp_lexer.o \ + .libs/pp_linkset.o .libs/prefix.o .libs/preparation.o \ + .libs/print-util.o .libs/print.o .libs/prune.o \ + .libs/read-dict.o .libs/read-regex.o .libs/regex-morph.o \ + .libs/resources.o .libs/spellcheck-aspell.o \ + .libs/spellcheck-hun.o .libs/string-set.o .libs/tokenize.o \ + .libs/utilities.o .libs/word-file.o .libs/word-utils.o \ + -o /usr/local/bin/link-grammar.dll + + gcc -g -shared -Wall -D_JNI_IMPLEMENTATION_ -Wl,--kill-at \ + .libs/jni-client.o /usr/local/bin/link-grammar.dll \ + -o /usr/local/bin/link-grammar-java.dll + + This will create link-grammar.dll and link-grammar-java.dll in the + directory c:\msys\1.0\local\bin . These files, together with + link-grammar-*.jar, will be used by Java programs. + + Make sure that this directory is in the %PATH setting, as otherwise, + the DLL's will not be found. 
diff -Nru link-grammar-5.3.16/morphology/Makefile.in link-grammar-5.5.0/morphology/Makefile.in --- link-grammar-5.3.16/morphology/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/morphology/Makefile.in 2018-04-29 21:40:04.000000000 +0000 @@ -127,6 +127,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -155,6 +156,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -163,13 +165,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -236,12 +241,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ diff -Nru link-grammar-5.3.16/msvc14/confvar.bat link-grammar-5.5.0/msvc14/confvar.bat --- link-grammar-5.3.16/msvc14/confvar.bat 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/confvar.bat 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,58 @@ +@echo off +%= Filter - replace the *VERSION atoconf variables according to the ones =% +%= set in configure.ac. 
=% + +setlocal +set configuration=%1 +if "%Configuration%"=="Debug" ( set Debug=DEBUG; ) else ( set Debug= ) + +setlocal DisableDelayedExpansion +mkdir Temp 2>nul + +set conf=..\configure.ac +set tmp=Temp\version.txt + +echo 1>&2 %~f0: Info: Replacing configuration variables + +findstr "^LINK_.*_VERSION=" %conf% > %tmp% +findstr "^VERSION=" %conf% >> %tmp% +for /F "tokens=*" %%i in (%tmp%) do set %%i +del %tmp% + + if "%Configuration%"=="" GOTO skipos + %= Get OS name =% + set "osreg=HKLM\SOFTWARE\Microsoft\Windows NT\CurrentVersion" + for /f "tokens=3*" %%i in ('reg query "%osreg%" /v ProductName') do ( + set "win_os= %%i %%j" + ) + %= Get OS version =% + for /f "tokens=3*" %%i in ('ver') do ( + set "win_ver= %%i %%j" + ) +:skipos + +%= Process the ".in" file, preserving blank lines and exlamation marks =% +for /f "tokens=1* delims=]" %%i in ('find /v /n ""') do ( + set "line=%%j" + if "%%j"=="" (echo.) else ( + setlocal EnableDelayedExpansion + set "line=!line:@LINK_MAJOR_VERSION@=%LINK_MAJOR_VERSION%!" + set "line=!line:@LINK_MINOR_VERSION@=%LINK_MINOR_VERSION%!" + set "line=!line:@LINK_MICRO_VERSION@=%LINK_MICRO_VERSION%!" + set "line=!line:@VERSION@=%VERSION%!" + + set "line=!line:$LINK_MAJOR_VERSION=%LINK_MAJOR_VERSION%!" + set "line=!line:$LINK_MINOR_VERSION=%LINK_MINOR_VERSION%!" + set "line=!line:$LINK_MICRO_VERSION=%LINK_MICRO_VERSION%!" + + if NOT "%Configuration%"=="" ( + set "line=!line:@HOST_OS@=%win_os% %win_ver%!" + set "line=!line:@LG_DEFS@=%DEFS%!" + set "line=!line:@CPPFLAGS@=%Debug%!" + set "line=!line:@CFLAGS@=%CFLAGS%!" + ) + + echo(!line! 
+ endlocal + ) +) diff -Nru link-grammar-5.3.16/msvc14/LGlib-features.props link-grammar-5.5.0/msvc14/LGlib-features.props --- link-grammar-5.3.16/msvc14/LGlib-features.props 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/LGlib-features.props 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,21 @@ + + + + + USE_WORDGRAPH_DISPLAY; + + + <_PropertySheetDisplayName>LGlib-features + + + + $(DEFS);%(PreprocessorDefinitions) + + + + + $(DEFS) + true + + + \ No newline at end of file diff -Nru link-grammar-5.3.16/msvc14/LinkGrammarExe.vcxproj link-grammar-5.5.0/msvc14/LinkGrammarExe.vcxproj --- link-grammar-5.3.16/msvc14/LinkGrammarExe.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/LinkGrammarExe.vcxproj 2016-11-17 18:36:07.000000000 +0000 @@ -0,0 +1,222 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + + + + + + + + + + + {532EFF4D-758A-4705-91EE-9A4AC72C017B} + LinkGrammarExe + Win32Proj + 8.1 + + + + Application + v140 + true + + + Application + v140 + true + + + Application + v140 + + + Application + v140 + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>12.0.21005.1 + + + true + Temp\$(ProjectName)$(Platform)$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + + + true + Temp\$(ProjectName)$(Platform)$(Configuration)\ + + + false + Temp\$(ProjectName)$(Platform)$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + + + false + Temp\$(ProjectName)$(Platform)$(Configuration)\ + + + + Disabled + _DEBUG;DEBUG;_CONSOLE;%(PreprocessorDefinitions) + false + EnableFastChecks + MultiThreadedDebugDLL + + Level3 + CompileAsC + ..;%(AdditionalIncludeDirectories) + true + EditAndContinue + + + true + Console + MachineX86 + $(SolutionDir)$(Platform)\$(Configuration)\link-grammar-x86.lib;%(AdditionalDependencies) + + + + + + + post-build.bat $(SolutionDir)$(Platform)\$(Configuration)\$(TargetName)$(TargetExt) + Performing post-build commands + + 
+ + + Disabled + _DEBUG;DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level3 + CompileAsC + true + true + ..;%(AdditionalIncludeDirectories) + false + EditAndContinue + + + true + Console + $(SolutionDir)$(Platform)\$(Configuration)\link-grammar-x64.lib;%(AdditionalDependencies) + + + post-build.bat $(SolutionDir)$(Platform)\$(Configuration)\$(TargetName)$(TargetExt) + + + Performing post-build commands + + + + + MaxSpeed + true + _CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + Level3 + ..;%(AdditionalIncludeDirectories) + true + true + Speed + + + true + Console + true + true + MachineX86 + $(SolutionDir)$(Platform)\$(Configuration)\link-grammar-x86.lib;%(AdditionalDependencies) + + + + + + + post-build.bat $(SolutionDir)$(Platform)\$(Configuration)\$(TargetName)$(TargetExt) + Performing post-build commands + + + + + MaxSpeed + true + _CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + + Level3 + ..;%(AdditionalIncludeDirectories) + true + true + Speed + + + true + Console + true + true + $(SolutionDir)$(Platform)\$(Configuration)\link-grammar-x64.lib;%(AdditionalDependencies) + + + post-build.bat $(SolutionDir)$(Platform)\$(Configuration)\$(TargetName)$(TargetExt) + + + Performing post-build commands + + + + + + \ No newline at end of file diff -Nru link-grammar-5.3.16/msvc14/LinkGrammarExe.vcxproj.filters link-grammar-5.5.0/msvc14/LinkGrammarExe.vcxproj.filters --- link-grammar-5.3.16/msvc14/LinkGrammarExe.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/LinkGrammarExe.vcxproj.filters 2016-09-13 18:44:33.000000000 +0000 @@ -0,0 +1,42 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav + + + + + Source Files + + + Source 
Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru link-grammar-5.3.16/msvc14/LinkGrammarJava.vcxproj link-grammar-5.5.0/msvc14/LinkGrammarJava.vcxproj --- link-grammar-5.3.16/msvc14/LinkGrammarJava.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/LinkGrammarJava.vcxproj 2016-09-13 18:44:33.000000000 +0000 @@ -0,0 +1,190 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {D74DF531-C18E-4988-8A8C-4F23556DEC1B} + Win32Proj + 8.1 + + + + DynamicLibrary + v140 + + + DynamicLibrary + v140 + + + DynamicLibrary + v140 + + + DynamicLibrary + v140 + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>12.0.21005.1 + + + $(SolutionDir)$(Platform)\$(Configuration)\ + Temp\$(ProjectName)$(Platform)$(Configuration)\ + true + + + true + Temp\$(ProjectName)$(Platform)$(Configuration)\ + + + $(SolutionDir)$(Platform)\$(Configuration)\ + Temp\$(ProjectName)$(Platform)$(Configuration)\ + false + + + false + Temp\$(ProjectName)$(Platform)$(Configuration)\ + + + + Disabled + ..\.;$(JAVA_HOME)\include\win32;$(JAVA_HOME)\include;%(AdditionalIncludeDirectories) + _DEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) + false + EnableFastChecks + MultiThreadedDebugDLL + + Level3 + EditAndContinue + CompileAsC + + + $(OutDir)link-grammar.lib;%(AdditionalDependencies) + $(OutDir)link-grammar-java.dll + true + Windows + MachineX86 + + + + + Disabled + G:\DEV\Java\SourceControlled\RelEx\link-grammar\trunk\tre\lib;G:\DEV\Java\SourceControlled\RelEx\link-grammar\trunk\link-grammar;..\.;$(JAVA_HOME)\include\win32;$(JAVA_HOME)\include;%(AdditionalIncludeDirectories) + _DEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level3 + ProgramDatabase + CompileAsC + + + tre.lib;%(AdditionalDependencies) + $(OutDir)link-grammar-java.dll + true + Windows + 
G:\DEV\Java\SourceControlled\RelEx\link-grammar\trunk\tre\vcbuild\x64\Release;%(AdditionalLibraryDirectories) + + + + + ..\.;$(JAVA_HOME)\include\win32;$(JAVA_HOME)\include;%(AdditionalIncludeDirectories) + NDEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) + MultiThreadedDLL + + Level3 + EditAndContinue + CompileAsC + Speed + true + + + .\Release\link-grammar.lib;%(AdditionalDependencies) + $(OutDir)link-grammar-java.dll + ${OutDir};%(AdditionalLibraryDirectories) + true + Windows + true + true + MachineX86 + + + + + ..\.;$(JAVA_HOME)\include\win32;$(JAVA_HOME)\include;%(AdditionalIncludeDirectories) + NDEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) + MultiThreadedDLL + + + Level3 + EditAndContinue + CompileAsC + Speed + true + + + .\Release\link-grammar.lib;%(AdditionalDependencies) + $(OutDir)link-grammar-java.dll + ${OutDir};%(AdditionalLibraryDirectories) + true + Windows + true + true + + + + + {0a6c539a-3140-48bd-865c-05f45637b93b} + + + + + + + + + + + + \ No newline at end of file diff -Nru link-grammar-5.3.16/msvc14/LinkGrammarJava.vcxproj.filters link-grammar-5.5.0/msvc14/LinkGrammarJava.vcxproj.filters --- link-grammar-5.3.16/msvc14/LinkGrammarJava.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/LinkGrammarJava.vcxproj.filters 2016-09-13 18:44:33.000000000 +0000 @@ -0,0 +1,27 @@ + + + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + + + Source Files + + + + + Header Files + + + diff -Nru link-grammar-5.3.16/msvc14/LinkGrammar.sln link-grammar-5.5.0/msvc14/LinkGrammar.sln --- link-grammar-5.3.16/msvc14/LinkGrammar.sln 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/LinkGrammar.sln 2016-09-13 18:44:33.000000000 +0000 @@ -0,0 +1,73 @@ + +Microsoft Visual Studio Solution File, Format 
Version 12.00 +# Visual Studio 14 +VisualStudioVersion = 14.0.25123.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LinkGrammarJava", "LinkGrammarJava.vcxproj", "{D74DF531-C18E-4988-8A8C-4F23556DEC1B}" + ProjectSection(ProjectDependencies) = postProject + {0A6C539A-3140-48BD-865C-05F45637B93B} = {0A6C539A-3140-48BD-865C-05F45637B93B} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LinkGrammar", "LinkGrammar.vcxproj", "{0A6C539A-3140-48BD-865C-05F45637B93B}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LinkGrammarExe", "LinkGrammarExe.vcxproj", "{532EFF4D-758A-4705-91EE-9A4AC72C017B}" + ProjectSection(ProjectDependencies) = postProject + {0A6C539A-3140-48BD-865C-05F45637B93B} = {0A6C539A-3140-48BD-865C-05F45637B93B} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Python2", "Python2.vcxproj", "{4DBD8AF9-67E2-4F55-B8BD-99903151A98C}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Python3", "Python3.vcxproj", "{B3AF571B-F80D-4A9C-A8C4-29026316A000}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {D74DF531-C18E-4988-8A8C-4F23556DEC1B}.Debug|Win32.ActiveCfg = Debug|Win32 + {D74DF531-C18E-4988-8A8C-4F23556DEC1B}.Debug|x64.ActiveCfg = Release|x64 + {D74DF531-C18E-4988-8A8C-4F23556DEC1B}.Release|Win32.ActiveCfg = Release|Win32 + {D74DF531-C18E-4988-8A8C-4F23556DEC1B}.Release|Win32.Build.0 = Release|Win32 + {D74DF531-C18E-4988-8A8C-4F23556DEC1B}.Release|x64.ActiveCfg = Release|x64 + {0A6C539A-3140-48BD-865C-05F45637B93B}.Debug|Win32.ActiveCfg = Debug|Win32 + {0A6C539A-3140-48BD-865C-05F45637B93B}.Debug|Win32.Build.0 = Debug|Win32 + 
{0A6C539A-3140-48BD-865C-05F45637B93B}.Debug|x64.ActiveCfg = Debug|x64 + {0A6C539A-3140-48BD-865C-05F45637B93B}.Debug|x64.Build.0 = Debug|x64 + {0A6C539A-3140-48BD-865C-05F45637B93B}.Debug|x64.Deploy.0 = Debug|x64 + {0A6C539A-3140-48BD-865C-05F45637B93B}.Release|Win32.ActiveCfg = Release|Win32 + {0A6C539A-3140-48BD-865C-05F45637B93B}.Release|Win32.Build.0 = Release|Win32 + {0A6C539A-3140-48BD-865C-05F45637B93B}.Release|x64.ActiveCfg = Release|x64 + {0A6C539A-3140-48BD-865C-05F45637B93B}.Release|x64.Build.0 = Release|x64 + {532EFF4D-758A-4705-91EE-9A4AC72C017B}.Debug|Win32.ActiveCfg = Debug|Win32 + {532EFF4D-758A-4705-91EE-9A4AC72C017B}.Debug|Win32.Build.0 = Debug|Win32 + {532EFF4D-758A-4705-91EE-9A4AC72C017B}.Debug|x64.ActiveCfg = Debug|x64 + {532EFF4D-758A-4705-91EE-9A4AC72C017B}.Debug|x64.Build.0 = Debug|x64 + {532EFF4D-758A-4705-91EE-9A4AC72C017B}.Debug|x64.Deploy.0 = Debug|x64 + {532EFF4D-758A-4705-91EE-9A4AC72C017B}.Release|Win32.ActiveCfg = Release|Win32 + {532EFF4D-758A-4705-91EE-9A4AC72C017B}.Release|Win32.Build.0 = Release|Win32 + {532EFF4D-758A-4705-91EE-9A4AC72C017B}.Release|x64.ActiveCfg = Release|x64 + {532EFF4D-758A-4705-91EE-9A4AC72C017B}.Release|x64.Build.0 = Release|x64 + {4DBD8AF9-67E2-4F55-B8BD-99903151A98C}.Debug|Win32.ActiveCfg = Debug|Win32 + {4DBD8AF9-67E2-4F55-B8BD-99903151A98C}.Debug|Win32.Build.0 = Debug|Win32 + {4DBD8AF9-67E2-4F55-B8BD-99903151A98C}.Debug|x64.ActiveCfg = Debug|x64 + {4DBD8AF9-67E2-4F55-B8BD-99903151A98C}.Debug|x64.Build.0 = Debug|x64 + {4DBD8AF9-67E2-4F55-B8BD-99903151A98C}.Release|Win32.ActiveCfg = Release|Win32 + {4DBD8AF9-67E2-4F55-B8BD-99903151A98C}.Release|Win32.Build.0 = Release|Win32 + {4DBD8AF9-67E2-4F55-B8BD-99903151A98C}.Release|x64.ActiveCfg = Release|x64 + {4DBD8AF9-67E2-4F55-B8BD-99903151A98C}.Release|x64.Build.0 = Release|x64 + {B3AF571B-F80D-4A9C-A8C4-29026316A000}.Debug|Win32.ActiveCfg = Debug|Win32 + {B3AF571B-F80D-4A9C-A8C4-29026316A000}.Debug|Win32.Build.0 = Debug|Win32 + 
{B3AF571B-F80D-4A9C-A8C4-29026316A000}.Debug|x64.ActiveCfg = Debug|x64 + {B3AF571B-F80D-4A9C-A8C4-29026316A000}.Debug|x64.Build.0 = Debug|x64 + {B3AF571B-F80D-4A9C-A8C4-29026316A000}.Release|Win32.ActiveCfg = Release|Win32 + {B3AF571B-F80D-4A9C-A8C4-29026316A000}.Release|Win32.Build.0 = Release|Win32 + {B3AF571B-F80D-4A9C-A8C4-29026316A000}.Release|x64.ActiveCfg = Release|x64 + {B3AF571B-F80D-4A9C-A8C4-29026316A000}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff -Nru link-grammar-5.3.16/msvc14/LinkGrammar.vcxproj link-grammar-5.5.0/msvc14/LinkGrammar.vcxproj --- link-grammar-5.3.16/msvc14/LinkGrammar.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/LinkGrammar.vcxproj 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,384 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {0A6C539A-3140-48BD-865C-05F45637B93B} + LinkGrammar + Win32Proj + 8.1 + + + + DynamicLibrary + v140 + + + DynamicLibrary + v140 + + + DynamicLibrary + v140 + + + DynamicLibrary + v140 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + link-grammar-x86 + Temp\$(ProjectName)$(Platform)$(Configuration)\ + ClCompile + $(SolutionDir)$(Platform)\$(Configuration)\ + $(WINFLEXBISON);$(VC_ExecutablePath_x86);$(WindowsSDK_ExecutablePath);$(VS_ExecutablePath);$(MSBuild_ExecutablePath);$(SystemRoot)\SysWow64;$(FxCopDir);$(PATH); + + + true + link-grammar-x64 + Temp\$(ProjectName)$(Platform)$(Configuration)\ + ClCompile + $(WINFLEXBISON);$(VC_ExecutablePath_x64);$(WindowsSDK_ExecutablePath);$(VS_ExecutablePath);$(MSBuild_ExecutablePath);$(FxCopDir);$(PATH); + + + link-grammar-x86 + Temp\$(ProjectName)$(Platform)$(Configuration)\ + ClCompile + $(SolutionDir)$(Platform)\$(Configuration)\ + 
$(WINFLEXBISON);$(VC_ExecutablePath_x86);$(WindowsSDK_ExecutablePath);$(VS_ExecutablePath);$(MSBuild_ExecutablePath);$(SystemRoot)\SysWow64;$(FxCopDir);$(PATH); + + + link-grammar-x64 + Temp\$(ProjectName)$(Platform)$(Configuration)\ + ClCompile + $(WINFLEXBISON);$(VC_ExecutablePath_x64);$(WindowsSDK_ExecutablePath);$(VS_ExecutablePath);$(MSBuild_ExecutablePath);$(FxCopDir);$(PATH); + + + + $(GNUREGEX_DIR)\include;..;..\link-grammar;%(AdditionalIncludeDirectories) + _DEBUG;DEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) + false + EnableFastChecks + + CompileAsC + Disabled + MultiThreadedDebugDLL + Level3 + true + + + $(GNUREGEX_DIR)\lib\regex.lib;Shlwapi.lib;%(AdditionalDependencies) + true + Windows + MachineX86 + + + + + + + + + + + + + %(Filename).c + + + + + Disabled + $(GNUREGEX_DIR)\include;..;..\link-grammar;%(AdditionalIncludeDirectories) + _DEBUG;DEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + CompileAsC + true + Level3 + false + false + + + true + Windows + $(GNUREGEX_DIR)\lib\regex.lib;Shlwapi.lib;%(AdditionalDependencies) + + + + + + + + + + + + + %(Filename).c + + + + + $(GNUREGEX_DIR)\include;..;..\link-grammar;%(AdditionalIncludeDirectories) + _WINDOWS;_USRDLL;%(PreprocessorDefinitions) + + CompileAsC + Level3 + MultiThreadedDLL + true + Speed + true + + + $(GNUREGEX_DIR)\lib\regex.lib;Shlwapi.lib;%(AdditionalDependencies) + true + Windows + true + true + MachineX86 + + + + + + + + + + + + + %(Filename).c + + + + + $(GNUREGEX_DIR)\include;..;..\link-grammar;%(AdditionalIncludeDirectories) + _WINDOWS;_USRDLL;%(PreprocessorDefinitions) + MultiThreadedDLL + + + CompileAsC + true + Level3 + Speed + true + + + Windows + true + true + true + $(GNUREGEX_DIR)\lib\regex.lib;Shlwapi.lib;%(AdditionalDependencies) + + + + + + + + + + + + + %(Filename).c + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Document + Generating ..\link-grammar\link-features.h + ..\link-grammar\link-features.h + %(Identity);..\configure.ac;confvar.bat + false + Generating ..\link-grammar\link-features.h + ..\link-grammar\link-features.h + %(Identity);..\configure.ac;confvar.bat + false + Generating ..\link-grammar\link-features.h + ..\link-grammar\link-features.h + %(Identity);..\configure.ac;confvar.bat + false + Generating ..\link-grammar\link-features.h + ..\link-grammar\link-features.h + %(Identity);..\configure.ac;confvar.bat + false + confvar $(Configuration) <%(Identity) >%(Outputs) + confvar $(Configuration) <%(Identity) >%(Outputs) + confvar $(Configuration) <%(Identity) >%(Outputs) + confvar $(Configuration) <%(Identity) >%(Outputs) + true + true + true + true + + + + + Document + + + + + + + \ No newline at end of file diff -Nru link-grammar-5.3.16/msvc14/LinkGrammar.vcxproj.filters link-grammar-5.5.0/msvc14/LinkGrammar.vcxproj.filters --- link-grammar-5.3.16/msvc14/LinkGrammar.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/LinkGrammar.vcxproj.filters 2018-03-06 02:44:41.000000000 +0000 @@ -0,0 +1,353 @@ + + + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + 
Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + + + + + + Source Files + + + diff -Nru link-grammar-5.3.16/msvc14/Local.props link-grammar-5.5.0/msvc14/Local.props --- link-grammar-5.3.16/msvc14/Local.props 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/Local.props 2017-06-17 01:29:32.000000000 +0000 @@ -0,0 +1,59 @@ + + + + + $(HOMEDRIVE)$(HOMEPATH)\Libraries\gnuregex + $(GNUREGEX_DIR)\lib + C:\Python27 + $(PYTHON2)\include + $(PYTHON2)\libs\python27.lib + 
$(PYTHON2)\python.exe + C:\Python34 + $(PYTHON3)\include + $(PYTHON3)\libs\python34.lib + $(PYTHON3)\python.exe + C:\win_flex_bison + + + + + %(PreprocessorDefinitions) + + + + + $(GNUREGEX_DIR) + + + $(LG_DLLPATH) + true + + + $(PYTHON2) + + + $(PYTHON2_INCLUDE) + + + $(PYTHON2_LIB) + + + $(PYTHON2_EXE) + + + $(PYTHON3) + + + $(PYTHON3_INCLUDE) + + + $(PYTHON3_LIB) + + + $(PYTHON3_EXE) + + + $(WINFLEXBISON) + + + \ No newline at end of file diff -Nru link-grammar-5.3.16/msvc14/make-check.py link-grammar-5.5.0/msvc14/make-check.py --- link-grammar-5.3.16/msvc14/make-check.py 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/make-check.py 2016-11-17 18:36:07.000000000 +0000 @@ -0,0 +1,136 @@ +r""" +Run Link Grammar Python scripts using the build target locations. + +This program sets PYTHONPATH and PATH, and uses Python2 or Python3 +to run the script, as needed. + +This program is designed to reside in this directory. +It reads Local.props in order to find Python's EXE location. +It also generates relative PYTHONPATH and PATH. +In case it is desired to move it to another directory, there is a need to +change the related variables. + +The default script directory is binding\python-examples and the default script +to run is tests.py. In order to run the file example.py there, the following +can be used: +console-prompt>make-check.py x64\Debug\Python2 example.py +The following starts an interactive python program: +console-prompt>make-check.py x64\Debug\Python2 "" +""" +from __future__ import print_function +import os +import sys +import re + +local_prop_file = 'Local.props' # In this directory +scriptdir = r'..\bindings\python-examples' +pyscript = 'tests.py' +os.environ["LINK_GRAMMAR_DATA"] = r'..' 
# "data" in the parent directory + +def error(msg): + if msg: + print(msg) + prog = os.path.basename(sys.argv[0]) + print("Usage: ", prog, '[python_flag] PYTHON_OUTDIR [script.py] [script_args]') + print(r' OUTDIR is in the format of "x64\Debug\Python2"') + sys.exit(1) + +local_prop = {} +def read_props(vsfile): + """ Read all the User Macros from the local properties file. """ + vs_f = open(vsfile, 'r') + macdef_re = re.compile(r'<(\w+)>([^<]*)<') + for line in vs_f: + read_m = re.search(macdef_re, line) + if None == read_m: + continue + if 2 != len(read_m.groups()): + error('Bad line in "{}": {}'.format(vsfile, line)) + local_prop[read_m.group(1)] = read_m.group(2) + if not local_prop: + error('No properties found in {}.'.format(vsfile)) + +NODEFAULT = object() +prop_re = re.compile(r'\$\((\w+)') +def get_prop(prop, default=NODEFAULT): + """ + Resolve a macro definition. + """ + prop_val = local_prop.get(prop, None) + if None == prop_val: + if default is NODEFAULT: + error('Property "{}" not found in {}' .format(prop, local_prop_file)) + return default + + while True: + prop_m = re.search(prop_re, prop_val) + if None == prop_m: + break + prop_rep = prop_m.group(1) + prop_repval = local_prop.get(prop_rep, None) + if None == prop_repval: + prop_repval = os.getenv(prop_rep) + if None == prop_repval: + error('Property "{}" not found in "{}" and also not in the environment'. 
\ + format(prop_rep, local_prop_file)) + prop_val = str.replace(prop_val, '$('+prop_rep+')', prop_repval) + + return prop_val + + +#--- +#print('Running by:', sys.executable) + +rundir = os.path.dirname(sys.argv[0]) +local_prop_file = rundir + '\\' + local_prop_file +read_props(local_prop_file) + +if len(sys.argv) < 2: + error('Missing argument') + +pyargs = '' +if len(sys.argv[1]) > 0 and sys.argv[1][0] == '-': + pyargs = sys.argv.pop(1) + +if len(sys.argv) < 2: + error('Missing argument') + +outdir = rundir + '\\' + sys.argv.pop(1) +if not os.path.isdir(outdir): + error('Directory "{}" doesn\'t exist'.format(outdir)) + +m = re.search(r'(.*)\\(.*)$', outdir) +if not m or len(m.groups()) != 2: + error('Invalid output directory "{}"'.format(outdir)) +config = m.group(1) +pydir = m.group(2).upper() + +pyexe = get_prop(pydir+'_EXE') + +if len(sys.argv) == 2: + if sys.argv[1] == '' or sys.argv[1][0] != '-': + pyscript = sys.argv.pop(1) + +if pyscript != '': + if '\\' not in pyscript: + pyscript = rundir + '\\' + scriptdir + '\\' + pyscript + +args = '' +if len(sys.argv) >= 2: + args = ' '.join(sys.argv[1:]) + +path = os.environ["PATH"] +dllpath = get_prop('LG_DLLPATH') +# For DLLs - linkgrammar-*.dll and regex.dll +os.environ["PATH"] = ('{};{};{}').format(config, dllpath, path) +#print("PATH=" + os.environ["PATH"]) + +# For linkgrammar.py, clinkgrammar.py and _clinkgrammar.pyd +os.environ["PYTHONPATH"] = \ + rundir + '\\' + r'..\bindings\python;{}'.format(outdir) +print("PYTHONPATH=" + os.environ["PYTHONPATH"]) +#print("Searching modules in:\n" + '\n'.join(sys.path)) + +cmd = ' '.join((pyexe, pyargs, pyscript, args)) +print('Issuing command:', cmd) +os.system(cmd) diff -Nru link-grammar-5.3.16/msvc14/MSVC-common.props link-grammar-5.5.0/msvc14/MSVC-common.props --- link-grammar-5.3.16/msvc14/MSVC-common.props 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/MSVC-common.props 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,20 @@ + + + + + 
NTDDI_VERSION=NTDDI_VISTA;_WIN32_WINNT=_WIN32_WINNT_VISTA; + + + + + $(CFLAGS);WIN32_LEAN_AND_MEAN;_CRT_NONSTDC_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) + 4068 + + + + + $(CFLAGS) + true + + + \ No newline at end of file diff -Nru link-grammar-5.3.16/msvc14/post-build.bat link-grammar-5.5.0/msvc14/post-build.bat --- link-grammar-5.3.16/msvc14/post-build.bat 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/post-build.bat 2016-11-17 18:36:07.000000000 +0000 @@ -0,0 +1,47 @@ +@echo off +%= This batch file is invoked in the post-build step of the post-build =% +%= event of the LinkGrammar project build. =% +%= Generate a batch file which executes the compilation result. =% +%= To be invoked as a Post-Build event. =% +%= Argument is project target. =% + +if "%1"=="" (echo "%~f0: Missing argument" 1>&2 & exit /b) + +setlocal +if defined ProgramW6432 set ProgramFiles=%ProgramW6432% + +%= Command name to create. =% +set lgcmd=link-parser + +echo %~f0: Info: Creating %lgcmd%.bat in %cd% + +( + echo @echo off + echo setlocal + echo. + + echo REM This file was auto-generated by %~f0 + echo REM at the LinkGrammar Solution directory %cd% + echo REM in the post-build event of the LinkGrammar project. + echo. + echo REM !!! Don't change it manually in that directory, + echo REM since your changes will get overwritten !!! + echo. + echo REM Copy it to a directory in your PATH and modify it if needed. + echo. + + echo REM The following prepends LG_DDLPATH from msvc14\Local.props + echo set "PATH=%LG_DLLPATH%;%%PATH%%" + echo. + echo REM For USE_WORDGRAPH_DISPLAY + echo REM Path for "dot.exe" + echo REM set "PATH=%%PATH%%;C:\cygwin64\bin" + echo set "PATH=%%PATH%%;%ProgramFiles(x86)%\Graphviz2.38\bin" + echo REM Path for "PhotoViewer.dll" + echo set "PATH=%%PATH%%;%ProgramFiles%\Windows Photo Viewer" + echo. + + echo REM Chdir to the link-grammar source directory so the data directory is found. + echo cd /D %cd%\.. 
+ echo %1 %%* +) > %lgcmd%.bat diff -Nru link-grammar-5.3.16/msvc14/Python2.vcxproj link-grammar-5.5.0/msvc14/Python2.vcxproj --- link-grammar-5.3.16/msvc14/Python2.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/Python2.vcxproj 2016-09-15 18:52:17.000000000 +0000 @@ -0,0 +1,273 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + + Document + <nul set/p x="Invoking "& where.exe swig.exe +echo on +cd $(IntDir) +swig.exe -python -outdir $(OutDir) -module clinkgrammar -I..\..\..\link-grammar -o lg_python_wrap.cpp "%(FullPath)" +%40echo off + + <nul set/p x="Invoking "& where.exe swig.exe +echo on +cd $(IntDir) +swig.exe -python -outdir $(OutDir) -module clinkgrammar -I..\..\..\link-grammar -o lg_python_wrap.cpp "%(FullPath)" +%40echo off + + <nul set/p x="Invoking "& where.exe swig.exe +echo on +cd $(IntDir) +swig.exe -python -outdir $(OutDir) -module clinkgrammar -I..\..\..\link-grammar -o lg_python_wrap.cpp "%(FullPath)" +%40echo off + + <nul set/p x="Invoking "& where.exe swig.exe +echo on +cd $(IntDir) +swig.exe -python -outdir $(OutDir) -module clinkgrammar -I..\..\..\link-grammar -o lg_python_wrap.cpp "%(FullPath)" +%40echo off + + Generating Python2 wrapper ^& interface + $(IntDir)\lg_python_wrap.cpp;$(OutDir)\clinkgrammar.py + Generating Python2 wrapper ^& interface + $(IntDir)\lg_python_wrap.cpp;$(OutDir)\clinkgrammar.py + Generating Python2 wrapper ^& interface + $(IntDir)\lg_python_wrap.cpp;$(OutDir)\clinkgrammar.py + Generating Python2 wrapper ^& interface + $(IntDir)\lg_python_wrap.cpp;$(OutDir)\clinkgrammar.py + ..\link-grammar\link-includes.h + ..\link-grammar\link-includes.h + ..\link-grammar\link-includes.h + ..\link-grammar\link-includes.h + false + false + false + false + + + + + + + + {0a6c539a-3140-48bd-865c-05f45637b93b} + + + + + + + + Document + echo on & cd $(ProjectDir) & call confvar <%(Identity) >$(OutDir)__init__.py + echo on & cd $(ProjectDir) & call confvar <%(Identity) 
>$(OutDir)__init__.py + echo on & cd $(ProjectDir) & call confvar <%(Identity) >$(OutDir)__init__.py + echo on & cd $(ProjectDir) & call confvar <%(Identity) >$(OutDir)__init__.py + Generating $(OutDir)__init__.py + Generating $(OutDir)__init__.py + Generating $(OutDir)__init__.py + Generating $(OutDir)__init__.py + $(OutDir)__init__.py + ..\configure.ac;confvar.bat + false + true + $(OutDir)__init__.py + ..\configure.ac;confvar.bat + false + true + $(OutDir)__init__.py + ..\configure.ac;confvar.bat + false + true + $(OutDir)__init__.py + ..\configure.ac;confvar.bat + false + true + false + false + false + false + + + + {4DBD8AF9-67E2-4F55-B8BD-99903151A98C} + Win32Proj + Python2 + 8.1 + + + + DynamicLibrary + true + v140 + + + DynamicLibrary + false + v140 + true + + + DynamicLibrary + true + v140 + + + DynamicLibrary + false + v140 + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + _clinkgrammar + .pyd + Temp\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + + + true + _clinkgrammar + .pyd + Temp\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + + + false + _clinkgrammar + .pyd + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + Temp\$(ProjectName)\ + + + false + _clinkgrammar + .pyd + Temp\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + + + + + + Level3 + Disabled + SWIG_PYTHON_INTERPRETER_NO_DEBUG;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..;$(PYTHON2_INCLUDE);%(AdditionalIncludeDirectories) + + + Console + true + $(SolutionDir)$(Platform)\$(Configuration)\link-grammar-x86.lib;$(PYTHON2_LIB);%(AdditionalDependencies) + UseFastLinkTimeCodeGeneration + + + + + + + Level3 + Disabled + SWIG_PYTHON_INTERPRETER_NO_DEBUG;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..;$(PYTHON2_INCLUDE);%(AdditionalIncludeDirectories) + false + + + Console + true + 
$(SolutionDir)$(Platform)\$(Configuration)\link-grammar-x64.lib;$(PYTHON2_LIB);%(AdditionalDependencies) + Default + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..;$(PYTHON2_INCLUDE);%(AdditionalIncludeDirectories) + Speed + true + + + Console + true + true + true + $(SolutionDir)$(Platform)\$(Configuration)\link-grammar-x86.lib;$(PYTHON2_LIB);%(AdditionalDependencies) + UseLinkTimeCodeGeneration + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..;$(PYTHON2_INCLUDE);%(AdditionalIncludeDirectories) + Speed + true + + + Console + true + true + true + $(SolutionDir)$(Platform)\$(Configuration)\link-grammar-x64.lib;$(PYTHON2_LIB);%(AdditionalDependencies) + UseLinkTimeCodeGeneration + + + + + + \ No newline at end of file diff -Nru link-grammar-5.3.16/msvc14/Python2.vcxproj.filters link-grammar-5.5.0/msvc14/Python2.vcxproj.filters --- link-grammar-5.3.16/msvc14/Python2.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/Python2.vcxproj.filters 2016-09-13 18:44:33.000000000 +0000 @@ -0,0 +1,33 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + + Header Files + + + + + Source Files + + + \ No newline at end of file diff -Nru link-grammar-5.3.16/msvc14/Python3.vcxproj.filters link-grammar-5.5.0/msvc14/Python3.vcxproj.filters --- link-grammar-5.3.16/msvc14/Python3.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/Python3.vcxproj.filters 2016-09-13 18:44:33.000000000 +0000 @@ -0,0 +1,33 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + 
h;hh;hpp;hxx;hm;inl;inc;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + + Header Files + + + + + Source Files + + + diff -Nru link-grammar-5.3.16/msvc14/README.md link-grammar-5.5.0/msvc14/README.md --- link-grammar-5.3.16/msvc14/README.md 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/msvc14/README.md 2018-04-22 14:57:33.000000000 +0000 @@ -0,0 +1,305 @@ +Building and Running on Windows +=============================== +Note: See also [BUILDING on Windows](/README.md#building-on-windows) +in the main README. + +This directory contains project files for building Link Grammar with the +Microsoft Visual Studio 2015 IDE (MSVC14). They were created and tested with +the Community Edition of that product. + +**!!!WARNING!!!** +In the examples below, "console-prompt>" at start of line means the console +prompt. Don't type it because the result could be destructive! + +Supported target versions +------------------------- +The intention is to support versions from Vista on (some WIN32 +functions which are used are not supported in earlier versions.) + +The system compatibility definitions: +In each project file - Target Platform version: 8.1 +In the `MSVC-common` property sheet - **Common properties->User Macros/CFLAGS**: + +`NTDDI_VERSION=NTDDI_VISTA;_WIN32_WINNT=_WIN32_WINNT_VISTA;` + +Dependencies +------------ +The regex package, which includes libraries and header files, must be +separately downloaded. Also see GNUREGEX_DIR below. + +For Python bindings, install the desired Python distributions from +[Python Releases for Windows](https://www.python.org/downloads/windows/). +You also have to install [SWIG](http://www.swig.org/download.html). + +The bindings were testes using swigwin-3.0.10 with Python 2.7.12 and 3.4.4. 
+ +Setup +----- +The build files make use of User Macros, defined in the property +sheet "Local", as follows: + +- GNUREGEX_DIR must be pointing to an unzipped POSIX regex distribution + (which has the subdirectories `include` and `lib`). Its default is + `%HOMEDRIVE%%HOMEPATH%\Libraries\gnuregex`. + If the environment variable GNUREGEX exists, its value is used instead. + The library file name is assumed to be "regex.lib" and needs to be + generated from the DLL if it is not included in the distribution (see + http://stackoverflow.com/questions/9946322). + + A tested library can be downloaded from + [Tre for Windows](http://gnuwin32.sourceforge.net/packages/tre.htm). + If your system is 64-bits, use the provided 64-bits library. + + The corresponding lib file is missing there. If you have Cygwin + installed, you can generate it as follows (console command): + +``` +dlltool -l regex.lib -d libtre/win32/tre.def -D regex.dll libtre/win32/bin/x64_release/libtre_dll.dll +``` + +- LG_DLLPATH should include the directory of the regex DLL, which is + normally under GNUREGEX_DIR (or it can be a central DLL directory etc.). + The default is `$(GNUREGEX_DIR)\lib`. + +- JAVA_HOME, if used, must be pointing to a locally installed SDK/JDK, + which has the subdirectories "include" and `include/win32` (defined in + the LinkGrammarJava project under **Common properties->C/C++->General-> + Additional Include Directories**. + If your JAVA SDK/JDK installation has defined the JAVA_HOME environment + variable (check it) then there is no need to define this User Macro. + +- WINFLEXBISON should be the directory of the "Win flex-bison" project, + as downloaded from its [Web site](https://winflexbison.sourceforge.io/). + Tested with version 2.5.9. + Leave it blank if would like to use a ready **pp_lexer.c** file. + The default is **C:\win_flex_bison**. 
+ +### Definitions for Python bindings + + Macro | Default value | +---|---| +PYTHON2 | C:\Python27 | +PYTHON2_INCLUDE | $(PYTHON2)\include | +PYTHON2_LIB | $(PYTHON2)\lib | +PYTHON2_EXE | $(PYTHON2)\python.exe | +PYTHON3 | C:\Python34 | +PYTHON3_INCLUDE | $(PYTHON3)\include | +PYTHON3_LIB | $(PYTHON3)\lib | +PYTHON3_EXE | $(PYTHON3)\python.exe | + +If you want to build any of the bindings, make sure it is marked for build +in the configuration manager, or select "build" for the desired bindings in +Solution Explorer. + +Compiling +--------- +- Compiling and running got checked on Windows 10, but is intended to be + compatible to Vista and on (XP is not supported). + +- To compile LinkGrammar, open the solution file LinkGrammar.sln, change + the solution platform to x64 if desired, and build. + +- The solution configuration is configured to create a debug version by + default. It's probably a good idea to switch a "Release" configuration. + You can do this at **Build Menu->Configuration Manager**. + +- The wordgraph-display feature is enabled when compiled with + USE_WORDGRAPH_DISPLAY (already defined in the `LGlib-features` property sheet + - **Common properties->C/User Macros/DEFS**). + +- By default, the library is configured to create a DLL. If you want + to instead build a static library, the macro LINK_GRAMMAR_STATIC must + be defined before the inclusion of any header files for both the compiling + of the link-grammar library and for the application that uses it. Other + compiler settings will also have to be changed to create a static library + of course. + +Running +------- +The last step of the "LinkGrammarExe" project creates `link-parser.bat`. Copy +it to somewhere in your PATH and then customize it if needed (don't customize +it in-place, since rebuilding would overwrite it). Note that it prepends PATH +with the User Macro LG_DLLPATH (see Setup above). 
If you would like to display +the wordgraph (see below) set also the PATH for `dot.exe` if it is not already +in your PATH. + +If USE_WORDGRAPH_DISPLAY has been used when compiling (the default), then +typing `!test=w`g at the `linkparser>` prompt can be used in order to display +the wordgraph of the next sentences to be parsed. See +[Word-graph display](/linkgrammar/tokenize/README.md#word-graph-display) +in `/linkgrammar/tokenize/README.md`. +for how to use optional display flags. By default, `PhotoViewer.dll` is +invoked to display the graph. If X11 is available and your `dot.exe` command +has the "xlib" driver, it can be used to display the wordgraph when the x flag +is set (i.e. `!test=wg:x`), for example when running under Cygwin/X. In any +case it uses the `dot` command of Graphviz. Graphviz can be installed as part +of Cygwin (in which case it included the "xlib" driver"), or separately from +[Graphviz](http://www.graphviz.org/Download_windows.php). +Both `PhotoViewer.dll` (if used) and `dot` must be in your PATH (if needed, +you can customized that in a copy of `link-parser.bat`, as described above). + +BTW, when running and MSVC-compiled binary under Cygwin, don't exit +link-parser by using `^Z` - the shell may get stuck because the program +somehow may continue to run in the background. Instead, exit using `!exit` . + +NOTE: The created DLLs need the MSVC14 runtime environment to run. This is +normally already installed in your machine with the installation of the IDE. +But to be able to run Link Grammar on other computer you need to install +[Visual C++ Redistributable for Visual Studio 2015](https://www.microsoft.com/en-us/download/details.aspx?id=48145). +This redistributable does not contain debug version of the MSVC runtime, so +only "Release" Link Grammar will work with it. 
+ +Running Python programs +----------------------- +Since the Link Grammar library has no installation script yet, +running Python programs that use the bindings needs a careful setup +of PYTHONPATH and PATH. A program named make-check.py (named after +"make check" that runs tests.py in POSIX systems) is provided to +set them automatically. + +Also see "Permanent installation". + +### Using make-check.py +The **make-check.py** program is designed to reside in the MSVC +configuration directory. However, it can run from any directory +using a full or a relative path to invoke it. + +Usage: +``` +console-prompt>make-check [PYTHON_FLAG] PYTHON_OUTDIR [script.py] [ARGUMENTS] +``` + +- PYTHON_FLAG: Optional flag for the Python program, e.g. `-vv` to debug +imports. +- PYTHON_OUTDIR: The directory to which the Python bindings got written. +For example, `x64\Release\Python3`. +- script.py: Path leading to the script. If only a filename is specified +(i.e. no `\` in the path) the specified script file is taken from the +`bindings\python-examples\` directory. In order to run tests.py in +its default location you can leave it empty. +- ARGUMENTS: Optional script arguments, for example `-v` for `tests.py`. 
+ +So in order to run `tests.py` with Python2 for a Debug compilation on x64 +platform, enter: +``` +console-prompt>make-check x64\Debug\Python2 +``` +To debug a Python3 script "mylgtest.py" that resides in +`bindings\python-examples`: +``` +console-prompt>make-check -mpdb x64\Debug\Python3 mylgtest.py +``` +To run a script in your home directory: +``` +console-prompt>make-check x64\Debug\Python3 \Users\username\mylgtest.py +``` +The following starts an interactive Python with the correct PYTHONPATH +and PATH: +``` +console-prompt>make-check.py x64\Debug\Python2 "" +``` +Locale and code page settings +----------------------------- +In this version, the language dictionaries under the data directory define +the locale that they need in order to process input sentences and the +library automatically uses this locale, in a manner that is intended to be +compatible with creating several different dictionaries per thread. + +If you use a dictionary which doesn't have this definition, or would like +to set the default language when link-parser is invoked without a language +argument, you can set the locale using the environment variable LANG, +using a 2-letter language code and a 2-letter country code: +``` +console-prompt>set LANG=ll-CC +``` +For example: +``` +console-prompt>set LANG=en-US +``` + +If you open a dictionary which doesn't have a locale definition, from a +program (C program or a language with LG bindings) you have to set the +correct locale (for the whole program or the particular thread)for the +dictionary before creating the dictionary. Alternatively, the LANG +environment variable can be set before creating the dictionary. + +The code page of the console should not be changed, unless it is desired +to pipe the output to another program. In that case, this other program +may read garbage due to a limitation in the way cmd.exe implements pipes. 
+This can be solved as follows ("more" is the example program)
+```
+link-parser | (chcp 65001 & more)
+```
+In that case `more` will be able to read UTF-8 input. However, the
+display may still not be perfect due to additional cmd.exe limitations.
+The code page can be changed manually to 65001, see below in the "Note for
+Python bindings".
+
+### A note for the Python bindings
+In order to produce UTF-8 output to the console, it needs to use the CP_UTF8
+code page (65001):
+
+```
+console-prompt>chcp
+Active code page: NNNN
+console-prompt>chcp 65001
+Active code page: 65001
+```
+
+Other programs may malfunction with this code page, so it can be restored
+when needed (or this console window can be used only for working with
+link-grammar). The link-parser command also changes the code page to
+CP_UTF8, but it restores the original one on exit.
+
+Console fonts
+-------------
+Courier New may be appropriate for all the languages in the data
+directory. If you don't have it in the console menu, it can be added
+through the registry (Google it).
+
+Permanent installation
+----------------------
+For using the library independently of the build directory:
+
+1) If Python bindings were generated, copy the following modules to a
+   directory `linkgrammar` in a fixed location: `linkgrammar.py`,
+   `clinkgrammar.py`, `__init__.py`, `_clinkgrammar.pyd`.
+   Set the PYTHONPATH environment variable to point to the said
+   "linkgrammar" directory.
+
+2) Copy the link-grammar DLL to a fixed location.
+   Add the DLL location permanently to PATH.
+
+3) Copy the `data` directory to the location of the DLL so it will get found.
+
+
+Implementation notes:
+---------------------
+
+- The file `link-grammar/link-features.h.in` has a Custom Build Tool definition
+  which invokes `mk-link-features-h` to generate
+  `link-grammar/link-features.h`, with LINK_*_VERSION variable replacement as
+  defined in `configure.ac`.
+
+- The project file `LinkGrammarExe` has a Post-Build-Event definition for
+  generating `link-parser.bat`.
+
+
+Using a remote network share
+----------------------------
+In order to use a link-grammar source repository from a network share (e.g. via
+Samba), and still be able to use the custom build steps in the Project files,
+there is a need to "convince" Windows it is a local filesystem. Else you will
+get "UNC paths are not supported." on the batch runs, with bad results. This
+method will also allow the "link-parser.bat" file to run. (For other solutions
+see stackoverflow.com/questions/9013941). You will need to find out by yourself
+if this creates a security or other problem in your case.
+
+Here is what worked for me:
+Suppose you use host:/usr/local/src remotely as share "src":
+```
+mklink /J src-j \\host\src
+mklink /D src src-j
+```
+The second one needs administrator privileges.
+Then use the repository through "src".
diff -Nru link-grammar-5.3.16/NEWS link-grammar-5.5.0/NEWS
--- link-grammar-5.3.16/NEWS	2016-09-13 18:44:33.000000000 +0000
+++ link-grammar-5.5.0/NEWS	2018-03-12 05:27:41.000000000 +0000
@@ -1,188 +1,54 @@
+[ANNOUNCE] Link-Grammar Version 5.4.4 is now available.
+I'm pleased to announce that version 5.4.4 is now available. I don't
+normally announce minor versions, but this one was almost named 5.5.0.
+Which suggests that there were some important changes. Dictionary
+loading is now thread safe. Security vulnerabilities are fixed. Parsing
+of Russian is now 2x faster than before. Connectors can be individually
+given length limits - handy for morphology and phonetic agreement - and
+the root reason for the Russian speedup. An assortment of fixes to the
+English dictionary, including a reversal of some back-sliding in the
+test corpus.
-
-Corpus Statistics:
-------------------
-  Version 4.4.2 (January 2009) introduced a parse-ranking system based
-  on corpus statistics. 
This allows the most likely parse to be - identified in terms of the probabilities of word disjuncts observed - on actual text. The system also includes a way to assign WordNet - word senses to a word, based on the grammatical usage of tha word. - An overview of the idea is given on the OpenCog blog, here: - - http://brainwave.opencog.org/2009/01/12/determining-word-senses-from-grammatical-usage/ - - As of 2012, this parse-ranking system is obsolescent. The primary - issue is that the data files need to be rebuilt, to reflect the new - dictionary structure; the version skew between the old databases and - the current dictionaries will invalidate results. If you are - interested, contact the mailing list, and take a look at - https://github.com/opencog/link-grammar/issues/292 - - To enable the corpus statistics, specify - - ./configure --enable-corpus-stats +You can download link-grammar from +http://www.abisource.com/downloads/link-grammar/current/ - prior to compiling. - - There are no currently-maintained databases for this ranking system. - Older databases can be downloaded from - - http://www.abisource.com/downloads/link-grammar/sense-dictionary/ - or - http://gnucash.org/linas/nlp/data/linkgrammar-wsd/ - - These older databases are not very accurate, since the English - language dictionaries have seen significant changes since these - were first created. To be usable, the databases should be recreated - for eht curent dictionaries. - - The data is contained in an sqlite3 database file, - - disjuncts.20090430.db.bz2 - - Unzip this file (using bunzip2) rename it to "disjuncts.db", and - place it in the subdirectory "sql", in the same directory that - contains the "en" directory. 
For default unix installations, the - final location would be - - /usr/local/share/link-grammar/sql/disjuncts.db - - where, by comparison, the usual dictionary would be at +The website is here: +https://www.abisource.com/projects/link-grammar/ - /usr/local/share/link-grammar/en/4.0.dict - - After this is installed, parse ranking scores should be printed - automatically, as floating-point numbers: for example: - - Unique linkage, cost vector = (CORP=4.4257 UNUSED=0 DIS=1 AND=0 LEN=5) - - Lower numbers are better. The scores can be interpreted as -log_2 - of a certain probability, so the lower the number, the higher the - probability. - - The display of disjunct scores can be enabled with the !disjuncts - flag, and senses with the !senses flag, at the link-parser prompt. - Entering !var and !help will show all flags. Multiple parses are - sorted and displayed in order from lowest to highest cost; the sort - of can be set by saying !cost=1 for the traditional sort, and - !cost=2 for corpus-based cost. Output similar to the below should - be printed: - - linkparser> !disjunct - Showing of disjunct used turned on. - linkparser> !cost=2 - cost set to 2 - linkparser> !sense - Showing of word senses turned on. - linkparser> this is a test - Found 1 linkage (1 had no P.P. violations) - Unique linkage, cost vector = (CORP=4.4257 UNUSED=0 DIS=1 AND=0 LEN=5) - - +--Ost--+ - +-Ss*b+ +-Ds-+ - | | | | - this.p is.v a test.n - - 2 is.v dj=Ss*b- Ost+ sense=be%2:42:02:: score=2.351568 - 2 is.v dj=Ss*b- Ost+ sense=be%2:42:05:: score=2.143989 - 2 is.v dj=Ss*b- Ost+ sense=be%2:42:03:: score=1.699292 - 4 test.n dj=Ost- Ds- sense=test%1:04:00:: score=0.000000 - this.p 0.0 0.695 Wd- Ss*b+ - is.v 0.0 7.355 Ss*b- Ost+ - a 0.0 0.502 Ds+ - test.n 1.0 9.151 Ost- Ds- - - - Note that the sense labels are not terribly accurate; the verb "to be" - is particularly hard to tag correctly. 
- - -BioLG merger: -------------- - As of version 4.5.0 (April 2009), the most important parts of the - BioLG project have been merged. The current version of link-grammar - has superior parse coverage to BioLG on all texts, including - biomedical texts. The original BioLG test suite can be found in - data/en/4.0.biolg.batch. - - The following changes in BioLG have NOT been merged: - -- Part of speech hinting. The BioLG code can accept part-of-speech - hints for unknown words. - -- XML I/O. The BioLG code can output parsed text in a certain - idiosyncratic XML format. - -- "term support". Experiments from the 2007-2009 time-frame - indicate these were useless. - -- The link type CH. This was a large, intrusive, incompatible change - to the dictionary, and it is not strictly required -- there is a - better, alternative way of handling adj-noun-adj-noun chains commonly - seen in biomedical text, and this has been implemented. - - All other BioLG changes, and in particular, extensive dictionary fixes, - as well as regex morphology handling, have been incorporated. - - -Medical Terms Merger --------------------- - Many, but not all, of the "medical terms" from Peter Szolovits have - been merged into version 4.3.1 (January 2008) of link-grammar. 
The - original project page was at: - - http://groups.csail.mit.edu/medg/projects/text/lexicon.html - - The following "extra" files were either merged directly, renamed, or - skipped (omitted): - - /extra.1: -- merged - /extra.2: -- skip, too big - /extra.3: -- skip, too big - /extra.4: -- /en/words/words-medical.v.4.2: - /extra.5: -- /en/words/words-medical.v.4.1: - /extra.6: -- /en/words/words-medical.adj.2: - /extra.7: -- /en/words/words-medical.n.p - /extra.8: -- skip, too big - /extra.9: -- skip, random names - /extra.10: -- /en/words/words-medical.adv.1: - /extra.11: -- /en/words/words-medical.v.4.5: - /extra.12: -- skip, too big - /extra.13: -- /en/words/words-medical.v.4.3: - /extra.14: -- /en/words/words-medical.prep.1 - /extra.15: -- /en/words/words-medical.adj.3: - /extra.16: -- /en/words/words-medical.v.2.1: - /extra.17: -- skip, too big +WHAT IS LINK GRAMMER? +The Link Grammar Parser is a syntactic parser of English (and other +languages as well), based on Link Grammar, an original theory of English +syntax. Given a sentence, the system assigns to it a syntactic structure, +which consists of a set of labelled links connecting pairs of words. - To make use of the "skipped" files, download the original extension, - gut the contents of "extra.dict" except for the parts referring to the - skipped files above, and then append to 4.0.dict (as per original - instructions). - Its not at all clear that the "skipped" files improve parse accuracy - in any way; they may, in fact, damage accuracy. +================================================================= +================================================================= +================================================================= +[ANNOUNCE] Link-Grammar Version 5.4.0 is now available. -Fat Links: ----------- - As of version 4.7.0 (September 2010), parsing using "fat links" has - been disabled by default, and is now deprecated. 
The function is
-  still there, and can be turned on by specifying the !use-fat=1 command,
-  or by calling parse_options_use_fat_links(TRUE) from programs.
+I'm pleased to announce that version 5.4.0 is now available. Besides
+including various bug fixes, this release is notable for completely
+restructuring the organization of the source code, grouping files into
+directories according to the processing stage that they implement. See
+below for the full ChangeLog.
-
-  As of version 4.7.12 (May 2013), the "fat link" code is no longer
-  compiled by default. To obtain the fat-link version, ./configure
-  must be run with the --enable-fat-links --disable-sat-solver flag.
-  Enabling this will generate a lot of warning messages during
-  compilation.
+
+You can download link-grammar from
+http://www.abisource.com/downloads/link-grammar/current/
-
-  As of version 5.2.0 (December 2014) the "fat link" code has been
-  removed. The fat-link code consisted of about 5 KLOC or about 1/6th
-  of the total code. About 23 KLOC of the core parser code remains.
-  Users of the Russian dicts must use versions prior to this to get
-  Russian sentences with conjunctions in them to parse.
+
+The website is here:
+https://www.abisource.com/projects/link-grammar/
-
-  Older versions of the link-grammar parser used "fat links" to
-  support conjunctions (and, or, but, ...). However, this leads
-  to a number of complications, including poor performance due to
-  a combinatorial explosion of linkage possibilities, as well as
-  an excessively complex parse algorithm.
+
+WHAT IS LINK GRAMMAR?
+The Link Grammar Parser is a syntactic parser of English (and other
+languages as well), based on link grammar, an original theory of English
+syntax. Given a sentence, the system assigns to it a syntactic
+structure, which consists of a set of labelled links connecting pairs of
+words. 
The parser also produces a "constituent" (Penn tree-bank style +phrase tree) representation of a sentence (showing noun phrases, verb +phrases, etc.). ================================================================= ================================================================= @@ -770,3 +636,186 @@ ================================================================= ================================================================= ================================================================= + +Fat Links: +---------- + As of version 4.7.0 (September 2010), parsing using "fat links" has + been disabled by default, and is now deprecated. The function is + still there, and can be turned on by specifying the !use-fat=1 command, + or by calling parse_options_use_fat_links(TRUE) from programs. + + As of version 4.7.12 (May 2013), the "fat link" code is no longer + compiled by default. To obtain the fat-link version, ./configure + must be run with the --enable-fat-links --disable-sat-solver flag. + Enabling this will generate a lot of warning messages during + compilation. + + As of version 5.2.0 (December 2014) the "fat link" code has been + removed. The fat-link code consisted of about 5 KLOC or about 1/6th + of the total code. About 23 KLOC of the core parser code remains. + Users of the Russian dicts must use versions prior to this to get + Russian sentences with conjunctions in them to parse. + + Older versions of the link-grammar parser used "fat links" to + support conjunctions (and, or, but, ...). However, this leads + to a number of complications, including poor performance due to + a combinatorial explosion of linkage possibilities, as well as + an excessively complex parse algorithm. + +Corpus Statistics: +------------------ + Version 4.4.2 (January 2009) introduced a parse-ranking system based + on corpus statistics. This allows the most likely parse to be + identified in terms of the probabilities of word disjuncts observed + on actual text. 
The system also includes a way to assign WordNet
+  word senses to a word, based on the grammatical usage of that word.
+  An overview of the idea is given on the OpenCog blog, here:
+
+  http://brainwave.opencog.org/2009/01/12/determining-word-senses-from-grammatical-usage/
+
+  As of 2012, this parse-ranking system is obsolescent. The primary
+  issue is that the data files need to be rebuilt, to reflect the new
+  dictionary structure; the version skew between the old databases and
+  the current dictionaries will invalidate results. If you are
+  interested, contact the mailing list, and take a look at
+  https://github.com/opencog/link-grammar/issues/292
+
+  To enable the corpus statistics, specify
+
+  ./configure --enable-corpus-stats
+
+  prior to compiling.
+
+  There are no currently-maintained databases for this ranking system.
+  Older databases can be downloaded from
+
+  http://www.abisource.com/downloads/link-grammar/sense-dictionary/
+  or
+  http://gnucash.org/linas/nlp/data/linkgrammar-wsd/
+
+  These older databases are not very accurate, since the English
+  language dictionaries have seen significant changes since these
+  were first created. To be usable, the databases should be recreated
+  for the current dictionaries.
+
+  The data is contained in an sqlite3 database file,
+
+  disjuncts.20090430.db.bz2
+
+  Unzip this file (using bunzip2), rename it to "disjuncts.db", and
+  place it in the subdirectory "sql", in the same directory that
+  contains the "en" directory. For default unix installations, the
+  final location would be
+
+  /usr/local/share/link-grammar/sql/disjuncts.db
+
+  where, by comparison, the usual dictionary would be at
+
+  /usr/local/share/link-grammar/en/4.0.dict
+
+  After this is installed, parse ranking scores should be printed
+  automatically, as floating-point numbers: for example:
+
+  Unique linkage, cost vector = (CORP=4.4257 UNUSED=0 DIS=1 AND=0 LEN=5)
+
+  Lower numbers are better. 
The scores can be interpreted as -log_2 + of a certain probability, so the lower the number, the higher the + probability. + + The display of disjunct scores can be enabled with the !disjuncts + flag, and senses with the !senses flag, at the link-parser prompt. + Entering !var and !help will show all flags. Multiple parses are + sorted and displayed in order from lowest to highest cost; the sort + of can be set by saying !cost=1 for the traditional sort, and + !cost=2 for corpus-based cost. Output similar to the below should + be printed: + + linkparser> !disjunct + Showing of disjunct used turned on. + linkparser> !cost=2 + cost set to 2 + linkparser> !sense + Showing of word senses turned on. + linkparser> this is a test + Found 1 linkage (1 had no P.P. violations) + Unique linkage, cost vector = (CORP=4.4257 UNUSED=0 DIS=1 AND=0 LEN=5) + + +--Ost--+ + +-Ss*b+ +-Ds-+ + | | | | + this.p is.v a test.n + + 2 is.v dj=Ss*b- Ost+ sense=be%2:42:02:: score=2.351568 + 2 is.v dj=Ss*b- Ost+ sense=be%2:42:05:: score=2.143989 + 2 is.v dj=Ss*b- Ost+ sense=be%2:42:03:: score=1.699292 + 4 test.n dj=Ost- Ds- sense=test%1:04:00:: score=0.000000 + this.p 0.0 0.695 Wd- Ss*b+ + is.v 0.0 7.355 Ss*b- Ost+ + a 0.0 0.502 Ds+ + test.n 1.0 9.151 Ost- Ds- + + + Note that the sense labels are not terribly accurate; the verb "to be" + is particularly hard to tag correctly. + + +BioLG merger: +------------- + As of version 4.5.0 (April 2009), the most important parts of the + BioLG project have been merged. The current version of link-grammar + has superior parse coverage to BioLG on all texts, including + biomedical texts. The original BioLG test suite can be found in + data/en/4.0.biolg.batch. + + The following changes in BioLG have NOT been merged: + -- Part of speech hinting. The BioLG code can accept part-of-speech + hints for unknown words. + -- XML I/O. The BioLG code can output parsed text in a certain + idiosyncratic XML format. + -- "term support". 
Experiments from the 2007-2009 time-frame + indicate these were useless. + -- The link type CH. This was a large, intrusive, incompatible change + to the dictionary, and it is not strictly required -- there is a + better, alternative way of handling adj-noun-adj-noun chains commonly + seen in biomedical text, and this has been implemented. + + All other BioLG changes, and in particular, extensive dictionary fixes, + as well as regex morphology handling, have been incorporated. + + +Medical Terms Merger +-------------------- + Many, but not all, of the "medical terms" from Peter Szolovits have + been merged into version 4.3.1 (January 2008) of link-grammar. The + original project page was at: + + http://groups.csail.mit.edu/medg/projects/text/lexicon.html + + The following "extra" files were either merged directly, renamed, or + skipped (omitted): + + /extra.1: -- merged + /extra.2: -- skip, too big + /extra.3: -- skip, too big + /extra.4: -- /en/words/words-medical.v.4.2: + /extra.5: -- /en/words/words-medical.v.4.1: + /extra.6: -- /en/words/words-medical.adj.2: + /extra.7: -- /en/words/words-medical.n.p + /extra.8: -- skip, too big + /extra.9: -- skip, random names + /extra.10: -- /en/words/words-medical.adv.1: + /extra.11: -- /en/words/words-medical.v.4.5: + /extra.12: -- skip, too big + /extra.13: -- /en/words/words-medical.v.4.3: + /extra.14: -- /en/words/words-medical.prep.1 + /extra.15: -- /en/words/words-medical.adj.3: + /extra.16: -- /en/words/words-medical.v.2.1: + /extra.17: -- skip, too big + + To make use of the "skipped" files, download the original extension, + gut the contents of "extra.dict" except for the parts referring to the + skipped files above, and then append to 4.0.dict (as per original + instructions). + + Its not at all clear that the "skipped" files improve parse accuracy + in any way; they may, in fact, damage accuracy. 
diff -Nru link-grammar-5.3.16/README.md link-grammar-5.5.0/README.md --- link-grammar-5.3.16/README.md 2017-03-14 05:43:05.000000000 +0000 +++ link-grammar-5.5.0/README.md 2018-04-27 00:21:47.000000000 +0000 @@ -1,6 +1,6 @@ Link Grammar Parser =================== -***Version 5.3.16*** +***Version 5.5.0*** The Link Grammar Parser implements the Sleator/Temperley/Lafferty theory of natural language parsing. This version of the parser is @@ -59,7 +59,7 @@ The bottom of the display is a listing of the "disjuncts" used for each word. The disjuncts are simply a list of the connectors that -werre employed to form the links. They are particularly intersting +were employed to form the links. They are particularly interesting because they serve as an extremely fine-grained form of a "part of speech". This, for example: the disjunct `S- O+` indicates a transitive verb: its a verb that takes both a subject and an object. @@ -101,7 +101,7 @@ ["Parsing English with a Link Grammar"](http://www.cs.cmu.edu/afs/cs.cmu.edu/project/link/pub/www/papers/ps/tr91-196.pdf) October 1991 *CMU-CS-91-196*. * Daniel D. Sleator, Davy Temperley, - ["Parsing English with a Link Grammar"] (http://www.cs.cmu.edu/afs/cs.cmu.edu/project/link/pub/www/papers/ps/LG-IWPT93.pdf), + ["Parsing English with a Link Grammar"](http://www.cs.cmu.edu/afs/cs.cmu.edu/project/link/pub/www/papers/ps/LG-IWPT93.pdf), *Third International Workshop on Parsing Technologies* (1993). * Dennis Grinberg, John Lafferty, Daniel Sleator, ["A Robust Parsing Algorithm for Link Grammars"](http://www.cs.cmu.edu/afs/cs.cmu.edu/project/link/pub/www/papers/ps/tr95-125.pdf), @@ -186,7 +186,7 @@ no malicious changes were made to the code internals by third parties. 
The signatures can be checked with the gpg command: -`gpg --verify link-grammar-5.3.16.tar.gz.asc` +`gpg --verify link-grammar-5.5.0.tar.gz.asc` which should generate output identical to (except for the date): ``` @@ -198,6 +198,12 @@ cryptographic security, but they can detect simple corruption. To verify the check-sums, issue `md5sum -c MD5SUM` at the command line. +Tags in `git` can be verified by performing the following: +``` +gpg --recv-keys --keyserver keyserver.ubuntu.com EB6AA534E0C0651C +git tag -v link-grammar-5.5.0 +``` + CREATING the system ------------------- @@ -249,9 +255,6 @@ ``` ./configure --disable-editline ``` -**Note**: utf8 support for libedit is still missing in Ubuntu 1404 and -Mint 17 Qiana See https://bugs.launchpad.net/linuxmint/+bug/1389438 -https://bugs.launchpad.net/ubuntu/+source/libedit/+bug/1375921 Java Bindings ------------- @@ -263,13 +266,31 @@ ./configure --disable-java-bindings ``` -If JAVA_HOME isn't set, if jni.h isn't found, or if ant isn't found, +If `jni.h` isn't found, or if `ant` isn't found, then the java bindings will not be built. +Notes about finding `jni.h`:
+Some common java JVM distributions (most notably, the ones from Sun)
+place this file in unusual locations, where it cannot be
+automatically found. To remedy this, make sure that the environment variable
+JAVA_HOME is set. The configure script looks for jni.h in `$JAVA_HOME/Headers`
+and in `$JAVA_HOME/include`; it also examines corresponding locations
+for $JDK_HOME. If `jni.h` still cannot be found, specify the location
+with the CPPFLAGS variable: so, for example,
+```
+export CPPFLAGS="-I/opt/jdk1.5/include/ -I/opt/jdk1.5/include/linux"
+```
+or
+```
+export CPPFLAGS="-I/c/java/jdk1.6.0/include/ -I/c/java/jdk1.6.0/include/win32/"
+```
+Please note that the use of `/opt` is non-standard, and most system
+tools will fail to find packages installed there.
+
 Python2 and Python3 Bindings
 ----------------------------
 The Python2 and Python3 bindings are built by default, providing that
-the corresponding Python developement packages are installed.
+the corresponding Python development packages are installed.
 These packages are:
 - Linux:
@@ -277,16 +298,21 @@
    * Systems using 'deb' packages:
      Python2: python-dev; Python3: python3-dev
 - Windows:
   * Install Python2 and Python3 from
     https://www.python.org/downloads/windows/ .
-    You also have to install SWIG from http://www.swig.org/download .
+    You also have to install SWIG from http://www.swig.org/download.html .
 - MacOS:
   * Install the python and python3 packages using
     [HomeBrew](http://brew.sh/).
+    Alternatively, install
+[Anaconda](https://conda.io/docs/user-guide/install/download.html).
+
+NOTE: Before issuing `configure` (see below) you have to validate that
+the required python versions can be invoked using your `PATH`.
 
 The use of the Python bindings is *OPTIONAL*; you do not need these
 if you do not plan to use link-grammar with python.
If you like to disable these bindings, use one of: ``` -./configure --diable-python-bindings +./configure --disable-python-bindings ./configure --enable-python-bindings=2 ./configure --enable-python-bindings=3 ``` @@ -325,9 +351,9 @@ [UNPACKING and signature verification](#unpacking-and-signature-verification)). The current GitHub version is intended for developers (including anyone who -is willing to provide a fix, a new feauture or an improvment). The tip of +is willing to provide a fix, a new feature or an improvement). The tip of the master branch is often unstable, and can sometimes have bad code in it -as it is under developementi. It also needs installing of development tools +as it is under development. It also needs installing of development tools that are not installed by default. Due to these reason the use of the GitHub version is discouraged for regular end users. @@ -340,12 +366,16 @@ Tools that may need installation before you can compile the system: make
+m4
gcc
gcc-c++ (for the SAT solver)
autoconf
autoconf-archive
+pkg-config
swig (for language bindings)
-graphpviz (if you like to ue the word-graph display feature) +flex
+Apache Ant (for Java bindings)
+graphviz (if you like to use the word-graph display feature) The GitHub version doesn't include a `configure` script. To generate it, use: @@ -371,7 +401,7 @@ A feature that may be useful for debugging is the word-graph display. Use the `configure` option `--enable-wordgraph-display` to enable it. For more details on this feature, see -[Word-graph display](link-grammar/README.md#word-graph-display). +[Word-graph display](link-grammar/tokenize/README.md#word-graph-display). BUILDING on MacOS ----------------- @@ -379,16 +409,6 @@ just fine, as described above. At this time, there are no reported issues. -The language bindings for python and java may require additional -packages to be installed. A working editline is nice, since it -allows you to use the arrow keys in the command-line client. -See http://www.macports.org/ to find these. - -You almost surely do not need a Mac portfile; but you can still -find one here: -http://trac.macports.org/browser/trunk/dports/textproc/link-grammar/Portfile .
-It does not currently specify any additional steps to perform. - If you do NOT need the java bindings, you should almost surely configure with: ``` @@ -403,6 +423,10 @@ to the location of the java compiler. Make sure you have ant installed. +If you would like to build from GitHub +(see [BUILDING from the GitHub repository](#building-from-the-github-repository)) +you can install the tools that are listed there using +[HomeBrew](http://brew.sh/). BUILDING on Windows ------------------- @@ -416,25 +440,25 @@ Link-grammar requires a working version of POSIX-standard regex libraries. Since these are not provided by Microsoft, a copy must -be obtained elsewhere. One popular choice is TRE, available at: -http://gnuwin32.sourceforge.net/packages/tre.htm +be obtained elsewhere. One popular choice is +[TRE](http://gnuwin32.sourceforge.net/packages/tre.htm). Another popular choice is PCRE, 'Perl-Compatible Regular Expressions', -available at: -http://www.pcre.org/
-Recent 32 and 64-bit binaries can be found at: -http://www.airesoft.co.uk/pcre
+available at: http://www.pcre.org/ .
+For building on Windows: https://github.com/rivy/PCRE .
+Another popular choice is +[PCRE, 'Perl-Compatible Regular Expressions'](http://www.pcre.org/).
Older 32-bit binaries are at: -http://gnuwin32.sourceforge.net/packages/regex.htm
+http://gnuwin32.sourceforge.net/packages/regex.htm .
See also: -http://ftp.gnome.org/pub/gnome/binaries/win32/dependencies/regex.README +http://ftp.gnome.org/pub/gnome/binaries/win32/dependencies/regex.README . BUILDING on Windows (Cygwin) ---------------------------- The easiest way to have link-grammar working on MS Windows is to use Cygwin, a Linux-like environment for Windows making it possible to port software running on POSIX systems to Windows. Download and -install Cygwin from http://www.cygwin.com/ . +install [Cygwin](http://www.cygwin.com/). Unfortunately, the Cygwin system is not compatible with Java, so if you need the Java bindings, you must use MSVC or MinGW, below. @@ -442,13 +466,13 @@ BUILDING on Windows (MinGW) --------------------------- Another way to build link-grammar is to use MinGW, which uses the GNU -toolset to compile Windows programs for Windows. Using MinGW/MSYS is +toolset to compile POSIX-compliant programs for Windows. Using MinGW/MSYS2 is probably the easiest way to obtain workable Java bindings for Windows. -Download and install MinGW, MSYS and MSYS-DTK from http://mingw.org. +Download and install MinGW/MSYS2 from [msys2.org](msys2.org). -For more details see [mingw/README.MSYS](mingw/README.MSYS). +For more details see [mingw/README-MSYS2.md](mingw/README-MSYS2.md). You can also build with MinGW under Cygwin. -See [mingw/README.Cygwin](mingw/README.Cygwin). +See [mingw/README-Cygwin.md](mingw/README-Cygwin.md). BUILDING and RUNNING on Windows (MSVC) @@ -590,36 +614,6 @@ cc -g -o $@ $^ `pkg-config --libs link-grammar` ``` -JAVA bindings -------------- -This release includes Java bindings. Their use is optional. - -The bindings will be built automatically if `jni.h` can be found. -Some common java JVM distributions (most notably, the ones from Sun) -place this file in unusual locations, where it cannot be -automatically found. To remedy this, make sure that JAVA_HOME is -set. 
The configure script looks for jni.h in `$JAVA_HOME/Headers` -and in `$JAVA_HOME/include`; it also examines corresponding locations -for $JDK_HOME. If `jni.h `still cannot be found, specify the location -with the CPPFLAGS variable: so, for example, -``` -export CPPFLAGS="-I/opt/jdk1.5/include/:/opt/jdk1.5/include/linux" -``` -or -``` -export CPPFLAGS="-I/c/java/jdk1.6.0/include/ -I/c/java/jdk1.6.0/include/win32/" -``` - -Please note that the use of /opt is non-standard, and most system -tools will fail to find packages installed there. - -The building of the Java bindings can be disabled by configuring as -below: -``` -./configure --disable-java-bindings -``` - - Using JAVA ---------- This release provides java files that offer three ways of accessing @@ -678,43 +672,13 @@ with the `!spell=0` flag. Enter `!help` for more details. -MULTI-THREADED USE ------------------- -It is safe to use link-grammar for parsing in multiple threads, once -the dictionaries have been loaded. The dictionary loading itself is -not thread-safe; it is not protected in any way. Thus, link-grammar -should not be used from multiple threads until the dictionary has -been loaded. Different threads may use different dictionaries. -Parse options can be set on a per-thread basis, with the exception -of verbosity, which is a global, shared by all threads. It is the -only global, outside of the Java bindings. - -For multi-threaded Java use, a per-thread variable is needed. This -must be enabled during the configure stage: -``` -./configure --enable-pthreads -``` - -The following exceptions and special notes apply: - -*utilities.c* − -> has global "verbosity". Memory usage code (disabled -> by default) also has a global, and so requires -> pthreads for tracking memory usage. - -*jni-client.c* − -> uses per-thread struct. This should somehow be -> attached to JNIEnv somehow. A Java JNI expert is needed. - -*malloc-dbg.c* − -> not thread safe, not normally used; -> only for debugging. 
- -*pp_lexer.c* − -> autogened code, original lex sources lost. -> This is only used when reading dictionaries, -> during initialization, and so doesn't need -> to be thread safe. +Multi-threading +--------------- +It is safe to use link-grammar for parsing in multiple threads. +Different threads may use different dictionaries, or the same dictionary. +Parse options can be set on a per-thread basis, with the exception of +verbosity, which is a global, shared by all threads. It is the only +global. SAT solver @@ -818,8 +782,8 @@ * Directional LG graphs resemble strict partial orders, except that the LG arrows are usually not transitive. - * Directional LG graphs resemble catena - http://en.wikipedia.org/wiki/Catena_(linguistics) + * Directional LG graphs resemble + [catena](http://en.wikipedia.org/wiki/Catena_(linguistics)) except that catena are strictly anti-transitive -- the path to any word is unique, in a catena. @@ -920,6 +884,13 @@ "He is either here or he is there." "He either is here or he is there." ``` + +A different example involves a crossing to the left wall. That is, the +links *LEFT-WALL--remains* crosses over *here--found*: +```text + "Here the remains can be found." +``` + Other examples, per And Rosta: The *allowed--by* link crosses *cake--that*: @@ -980,9 +951,9 @@ | | | | | | | | | | LEFT-WALL I.p want.v to.r look.v at and.j-v listen.v to.r everything ``` -The above rally wants to have a `Js` link from 'at' to 'everything', +The above really wants to have a `Js` link from 'at' to 'everything', but this `Js` link crosses (clashes with - marked by xxx) the link -to the conjunction. These two cases suggest that one sould/should +to the conjunction. These two cases suggest that one should allow most links to cross over the down-links to conjunctions. @@ -991,7 +962,7 @@ ----------- Link Grammar can be understood in the context of type theory. 
A simple introduction to type theory can be found in chapter 1 -of the HoTT book: https://homotopytypetheory.org/book/ .
+of the [HoTT book](https://homotopytypetheory.org/book/).
This book is freely available online and strongly recommended if you are interested in types. @@ -1003,7 +974,7 @@ easier-to-use. The foundational LG papers make comments to this effect; however, see -also work by Bob Coeke on category theory and grammar. Coecke's +also work by Bob Coecke on category theory and grammar. Coecke's diagramatic approach is essentially identical to the diagrams given in the foundational LG papers; it becomes abundantly clear that the category theoretic approach is equivalent to Link Grammar. See, for @@ -1054,10 +1025,10 @@ Some working notes. Easy to fix: provide a more uniform API to the constituent tree. -i.e provide word index. Also .. provide a clear word API, -showing word extent, suffix, etc. +i.e provide word index. Also, provide a better word API, +showing word extent, subscript, etc. -Capitalized first words: +### Capitalized first words: There are subtle technical issues for handling capitalized first words. This needs to be fixed. In addition, for now these words are shown uncapitalized in the result linkages. This can be fixed. @@ -1066,7 +1037,9 @@ could be handled! After all, it's essentially a nearest-neighbor phenomenon! -Capitalization-mark tokens: +See also [issue 690](https://github.com/opencog/link-grammar/issues/690) + +#### Capitalization-mark tokens: The proximal issue is to add a cost, so that Bill gets a lower cost than bill.n when parsing "Bill went on a walk". The best solution would be to add a 'capitalization-mark token' during @@ -1077,7 +1050,7 @@ where it can be handled like any other language feature. The tokenizer includes experimental code for that. -Corpus-statistics-based parse ranking: +### Corpus-statistics-based parse ranking: The old for parse ranking via corpus statistics needs to be revived. The issue can be illustrated with these example sentences: ```text @@ -1091,7 +1064,46 @@ object, and then turn off the lights. (Perhaps one is pleasing by turning off the lights?) 
-Punctuation, zero-copula, zero-that: +### Bad grammar: +When a sentence fails to parse, look for: + * confused words: its/it's, there/their/they're, to/too, your/you're ... + These could be added at high cost to the dicts. + * missing apostrophes in possessives: "the peoples desires" + * determiner agreement errors: "a books" + * aux verb agreement errors: "to be hooks up" + +Poor agreement might be handled by giving a cost to mismatched +lower-case connector letters. + +### Zero/phantom words: +A common phenomenon in English is that some words that one might +expect to "properly" be present can disappear under various conditions. +Below is a sampling of these. Some possible solutions are given below. + +Expressions such as "Looks good" have an implicit "it" (also called +a zero-it or phantom-it) in them; that is, the sentence should really +parse as "(it) looks good". The dictionary could be simplified by +admitting such phantom words explicitly, rather than modifying the +grammar rules to allow such constructions. Other examples, with the +phantom word in parenthesis, include: + * I ate all (of) the cookies. + * I taught him (how) to swim. + * I told him (that) it was gone. + * It stopped me (from) flying off the cliff. + * (It) looks good. + * (You) go home! + * (Are) you all right? + * He opened the door and (he) went in. + +Some complex phantom constructions: + * They play billiards but (they do) not (play) snooker. + * I know Ringo, but (I do) not (know) his brother. + * She likes Indian food, but (she does) not (like) Chinese (food). + * If this is true, then (you should) do it. + +See also [github issue #224](https://github.com/opencog/link-grammar/issues/224). + +#### Punctuation, zero-copula, zero-that: Poorly punctuated sentences cause problems: for example: ```text "Mike was not first, nor was he last." ``` @@ -1101,26 +1113,29 @@ deal with this in a simple, fast, elegant way? Similar questions for zero-copula and zero-that sentences. 
-Zero/phantom words: Expressions such as "Looks good" have an implicit -"it" (also called a zero-it or phantom-it) in them; that is, the -sentence should really parse as "(it) looks good". The dictionary -could be simplified by admitting such phantom words explicitly, -rather than modifying the grammar rules to allow such constructions. -Other examples, with the phantom word in parenthesis, include: - * I ate all (of) the cookies. - * I taught him (how) to swim. - * I told him (that) it was gone. - * (It) looks good. - * (You) go home! - * (Are) you all right? +#### Context-dependent zero phrases. +Consider an argument between a professor and a dean, and the dean +wants the professor to write a brilliant review. At the end of the +argument, the dean exclaims: "I want the review brilliant!" This +is a predicative adjective; clearly it means "I want the review +[that you write to be] brilliant." However, taken out of context, +such a construction is ungrammatical, as the predicativeness is not +at all apparent, and it reads just as incorrectly as would +"*Hey Joe, can you hand me that review brilliant?" -See [this issue on GitHub](https://github.com/opencog/link-grammar/issues/224). +#### Imperatives as phantoms: +```text +"Push button" +"Push button firmly" +``` +The subject is a phantom; the subject is "you". +#### Handling zero/phantom words by explicitly inserting them: One possible solution is to perform a one-point compactification. -The dictionary contains the phantom words, and thier connectors. +The dictionary contains the phantom words, and their connectors. Ordinary disjuncts can link to these, but should do so using a special initial lower-case letter (say, 'z', in addition to -'h' and 't' as is currently implemented). The parser, as it +'h' and 'd' as is currently implemented). 
The parser, as it works, examines the initial letter of each connector: if it is 'z', then the usual pruning rules no longer apply, and one or more phantom words are selected out of the bucket of phantom words. @@ -1133,17 +1148,42 @@ be inserted into the sentence, with the location deduced from link lengths. -Bad grammar: When a sentence fails to parse, look for: - * confused words: its/it's, there/their/they're, to/too, your/you're ... - These could be added at high cost to the dicts. - * missing apostrophes in possessives: "the peoples desires" - * determiner agreement errors: "a books" - * aux verb agreement errors: "to be hooks up" - -Poor agreement might be handled by giving a cost to mismatched -lower-case connector letters. +#### Handling zero/phantom words as re-write rules. +A more principled approach to fixing the phantom-word issue is to +borrow the idea of re-writing from the theory of +[operator grammar](https://en.wikipedia.org/wiki/Operator_grammar). +That is, certain phrases and constructions can be (should be) +re-written into their "proper form", prior to parsing. The re-writing +step would insert the missing words, then the parsing proceeds. One +appeal of such an approach is that re-writing can also handle other +"annoying" phenomena, such as typos (missing apostrophes, e.g. "lets" +vs. "let's", "its" vs. "it's") as well as multi-word rewrites (e.g. +"let's" vs. "let us", or "it's" vs. "it is"). + +Exactly how to implement this is unclear. However, it seems to open +the door to more abstract, semantic analysis. Thus, for example, in +Meaning-Text Theory (MTT), one must move between SSynt to DSynt +structures. Such changes require a graph re-write from the surface +syntax parse (e.g. provided by link-grammar) to the deep-syntactic +structure. By contrast, handling phantom words by graph re-writing +prior to parsing inverts the order of processing. 
This suggests that +a more holistic approach is needed to graph rewriting: it must somehow +be performed "during" parsing, so that parsing can both guide the +insertion of the phantom words, and, simultaneously guide the deep +syntactic rewrites. + +Another interesting possibility arises with regards to tokenization. +The current tokenizer is clever, in that it splits not only on +whitespace, but can also strip off prefixes, suffixes, and perform +certain limited kinds of morphological splitting. That is, it currently +has the ability to re-write single-words into sequences of words. It +currently does so in a conservative manner; the letters that compose +a word are preserved, with a few exceptions, such as making spelling +correction suggestions. The above considerations suggest that the +boundary between tokenization and parsing needs to become both more +fluid, and more tightly coupled. -Poor linkage choices: +### Poor linkage choices: Compare "she will be happier than before" to "she will be more happy than before." Current parser makes "happy" the head word, and "more" a modifier w/EA link. I believe the correct solution would be to @@ -1154,7 +1194,7 @@ However, this idea needs to be double-checked against, e.g. Hudson's word grammar. I'm confused on this issue ... -Stretchy links: +### Stretchy links: Currently, some links can act at "unlimited" length, while others can only be finite-length. e.g. determiners should be near the noun that they apply to. A better solution might be to employ @@ -1162,7 +1202,8 @@ higher the cost. (This eliminates the "unlimited_connector_set" in the dictionary). -Repulsive parses: Sometimes, the existence of one parse should suggest +### Opposing (repulsing) parses: +Sometimes, the existence of one parse should suggest that another parse must surely be wrong: if one parse is possible, then the other parses must surely be unlikely. 
For example: the conjunction and.j-g allows the "The Great Southern and Western @@ -1183,27 +1224,41 @@ one, or one should be eliminated. -WordNet hinting: +### WordNet hinting: Use WordNet to reduce the number for parses for sentences containing compound verb phrases, such as "give up", "give off", etc. -Incremental parsing: to avoid a combinatorial explosion of parses, -it would be nice to have an incremental parsing, phrase by phrase, -using a Viterbi-like algorithm to obtain the parse. Thus, for example, -the parse of the last half of a long, run-on sentence should not be -sensitive to the parse of the beginning of the sentence. +### Sliding-window (Incremental) parsing: +To avoid a combinatorial explosion of parses, it would be nice to +have an incremental parsing, phrase by phrase, using a sliding window +algorithm to obtain the parse. Thus, for example, the parse of the +last half of a long, run-on sentence should not be sensitive to the +parse of the beginning of the sentence. Doing so would help with combinatorial explosion. So, for example, if the first half of a sentence has 4 plausible parses, and the -last half has 4 more, then link-grammar reports 16 parses total. -It would be much, much more useful to instead be given the -factored results: i.e. the four plausible parses for the -first half, and the four plausible parses for the last half. -The lower combinatoric stress would ease the burden on -downstream users of link-grammar. - -(This somewhat resembles the application of construction grammar -ideas to the link-grammar dictionary). +last half has 4 more, then currently, the parser reports 16 parses +total. It would be much more useful if it could instead report the +factored results: i.e. the four plausible parses for the first half, +and the four plausible parses for the last half. This would ease +the burden on downstream users of link-grammar. + +This approach has psychological support. 
Humans take long sentences +and split them into smaller chunks that "hang together" as phrase- +structures, viz compounded sentences. The most likely parse is the +one where each of the quasi sub-sentences is parsed correctly. + +This could be implemented by saving dangling right-going connectors +into a parse context, and then, when another sentence fragment +arrives, use that context in place of the left-wall. + +This somewhat resembles the application of construction grammar +ideas to the link-grammar dictionary. It also somewhat resembles +Viterbi parsing to some fixed depth. Viz. do a full backward-forward +parse for a phrase, and then, once this is done, take a Viterbi-step. +That is, once the phrase is done, keep only the dangling connectors +to the phrase, place a wall, and then step to the next part of the +sentence. Caution: watch out for garden-path sentences: ```text @@ -1217,8 +1272,10 @@ * Less sensitive to sentence boundaries: this would allow longer, run-on sentences to be parsed far more quickly. * Could do better with slang, hip-speak. -* Would enable co-reference resolution across sentences (resolve - pronouns, etc.) +* Support for real-time dialog (parsing of half-uttered sentences). +* Parsing of multiple streams, e.g. from play/movie scripts. +* Would enable (or simplify) co-reference resolution across sentences + (resolve referents of pronouns, etc.) * Would allow richer state to be passed up to higher layers: specifically, alternate parses for fractions of a sentence, alternate reference resolutions. @@ -1240,7 +1297,7 @@ per Morten Christiansen, Cornell professor of psychology. -Registers, sociolects, dialects (cost vectors): +### Registers, sociolects, dialects (cost vectors): Consider the sentence "Thieves rob bank" -- a typical newspaper headline. 
LG currently fails to parse this, because the determiner is missing ("bank" is a count noun, not a mass noun, and thus @@ -1278,23 +1335,16 @@ thus causing the costs for that dialect to be employed during parse ranking. -Imperatives: -```text -"Push button" -"Push button firmly" -``` -The zero/phantom-word solution, described above, should help with this. - -Hand-refining verb patterns:
- A good reference for refining verb usage patterns is:
- COBUILD GRAMMAR PATTERNS 1: VERBS
- from THE COBUILD SERIES /from/ THE BANK OF ENGLISH
- HARPER COLLINS
- online at https://arts-ccr-002.bham.ac.uk/ccr/patgram/
- http://www.corpus.bham.ac.uk/publications/index.shtml +### Hand-refining verb patterns: +A good reference for refining verb usage patterns is: +"COBUILD GRAMMAR PATTERNS 1: VERBS from THE COBUILD SERIES", +from THE BANK OF ENGLISH, HARPER COLLINS. Online at +https://arts-ccr-002.bham.ac.uk/ccr/patgram/ and +http://www.corpus.bham.ac.uk/publications/index.shtml -*Quotations*: tokenize.c tokenizes Double-quotes and some UTF8 quotes +### Quotations: + Currently tokenize.c tokenizes double-quotes and some UTF8 quotes (see the RPUNC/LPUNC class in en/4.0.affix - the QUOTES class is not used for that, but for capitalization support), with some very basic support in the English dictionary (see "% Quotation marks." @@ -1302,6 +1352,7 @@ quotes, such as ‘these’ and “these”. This results is some ugly parsing for sentences containing such quotes. (Note that these are in 4.0.affix). + A mechanism is needed to disentangle the quoting from the quoted text, so that each can be parsed appropriately. It's somewhat unclear how to handle this within link-grammar. This is somewhat @@ -1311,6 +1362,9 @@ only... but also ...) which have a long-range structure similar to quoted text (he said ...). + See also [github issue #42](https://github.com/opencog/link-grammar/issues/42). + +### Semantification of the dictionary: "to be fishing": Link grammar offers four parses of "I was fishing for evidence", two of which are given low scores, and two are given high scores. Of the two with high scores, one parse is clearly bad. @@ -1353,7 +1407,7 @@ edited by Elena Tognini-Bonelli, volume 4), 2000
[Book review](http://www.aclweb.org/anthology/J01-2013). - "holes" in collocations (aka "set phrases" of "phrasemes"): +### "holes" in collocations (aka "set phrases" of "phrasemes"): The link-grammar provides several mechanisms to support circumpositions or even more complicated multi-word structures. One mechanism is by ordinary links; see the V, XJ and RJ links. @@ -1363,8 +1417,8 @@ The general problem is of supporting structures that have "holes" in the middle, that require "lacing" to tie them together. - For a general theory, see: - [](http://en.wikipedia.org/wiki/Catena_(linguistics)) + For a general theory, see + [catena](http://en.wikipedia.org/wiki/Catena_(linguistics)). For example, the adposition: ```text @@ -1480,6 +1534,7 @@ mutual information content, they can dominate the syntactic structure of a sentence. +### Lexical functions: MTT suggests that perhaps the correct way to understand the contents of the post-processing rules is as an implementation of 'lexical functions' projected onto syntax. That is, the post-processing @@ -1491,7 +1546,7 @@ of possible parses of a given sentence. It would seem that lexical functions could be used to rule out many of these parses. On the other hand, the results are likely to be similar to that of - statistical pare ranking (which presumably captures such + statistical parse ranking (which presumably captures such quasi-idiomatic collocations at least weakly). Ref. I. Mel'cuk: "Collocations and Lexical Functions", in ''Phraseology: @@ -1501,8 +1556,9 @@ More generally, all of link-grammar could benefit from a MTT-izing of infrastructure. - Compare the above problem to Hebrew morphological analysis. To quote - Wikipedia: +### Morphology: +Compare the above commentary on lexical functions to Hebrew morphological +analysis. 
To quote Wikipedia: > This distinction between the word as a unit of speech and the > root as a unit of meaning is even more important in the case of @@ -1517,138 +1573,29 @@ > along with many other words such as godel "size" and migdal > "tower". -- Dealing with long, ambiguous sentences: - - These are busted up by humans into smaller sentences that "hang - together" as phrase-structures, viz compounded sentences. The most - likely parse is then when each of the quasi sub-sentences is - parsed correctly. - -- Alternatives: - - A partial solution to the morphology problem and the idiom problem - in link-grammar is to elevate the use of "alternatives" in the - Word struct. Originally, these were morphological split alternatives - for the Russian dicts, but really, they are a way of hierarchically - arranging choices for words... - - Status: DONE! Implemented from version 5.3.0. See the section - titled "Introduction of a word-graph for tokenizing" in - [link-grammar/README](link-grammar/README). - -- Morphology printing: +### Morphology printing: Instead of hard-coding LL, declare which links are morpho links in the dict. -- Word-order flexibility (For Lithuanian, the following are desperately needed): - * connectors with * direction, i.e. either left or right. - * symmetric (commuting) version of &. - * DONE! The new symbols are ^ for commuting-& and $ to meaneither + or -. - - This still needs to be documented. - -- Incremental sentence parsing. - There are multiple reasons to support incremental parsing: - * Real-time dialog - * Parsing of multiple streams, e.g. from play/movie scripts - * segmentation of exceptionally long sentences. - - This could be implemented by saving dangling right-going - connectors into a parse context, and then, when another sentence - fragment arrives, use that context in place of the left-wall. 
- -- UTF-8 cleanup: +### UTF-8 cleanup: Replace the mbrtowc code with proper language support; it seems - that the correct solution is to use ICU http://site.icu-project.org/ - * ICU pros: runs on windows - * ICU cons: big, complex - - Another alternative is libunistring (which seems to be LGPL!?) - http://www.gnu.org/software/libunistring/ - * pros: smaller, simpler than ICU - * cons: might have problems with MS Windows. + that the correct solution is to use [ICU](http://site.icu-project.org/) + * ICU pros: runs on windows. + * ICU cons: big, complex. + + Another alternative is [libunistring](http://www.gnu.org/software/libunistring/) + (which seems to be LGPL!?) + * Pros: smaller, simpler than ICU. + * Cons: might have problems with MS Windows. -- Assorted minor cleanup: +### Assorted minor cleanup: * Should provide a query that returns compile-time consts, e.g. the max number of characters in a word, or max words - in a sentence + in a sentence. * Should remove compile-time constants, e.g. max words, max length etc. -- Misc TODO: - * finish sqlite3 work - -Version 6.0 TODO list: -Version 6.0 will change `Sentence` to `Sentence*,` `Linkage` to `Linkage*` in the API. Perhaps this is a bad idea... - - -A performance diary -------------------- -Time to parse some long sentences: -The original results below were for version 5.0.8 (April 2014) -The June 2014 results are for version 5.1.0 -The Feb 2017 results are on version 5.3.15 -Times are user-times, with dict loading subtracted. - -These are very highly dependent on the aggressiveness of the token -splitter, on the short length, on the cost-max and the spell checker. -Suggest using flags: -spell=0 -short=10 -cost-max=2.1 - -25 words + 2 punct, 0.2 seconds (0.7 seconds June 2014) -(0.2 secs SAT, June 2014): -```text -Hot runners usually make the mold more expensive to manufacture and run, -but allow savings by reducing plastic waste and by reducing the cycle time. 
-``` - -38 words + 4 punct: 2.4 seconds (2.6 secs, June 2014) -(0.32 secs, SAT, June 2014) (3.2 sec Feb 2017): -```text -The strongest rain ever recorded in India shut down the financial hub -of Mumbai, snapped communication lines, closed airports and forced -thousands of people to sleep in their offices or walk home during the -night, officials said today. -``` - -50 words + 9 punct: 14 seconds (3.9 secs June 2014) -(0.64 secs, SAT June 2014) (2.1 secs Feb 2017): -```text -In vivo studies of the activity of four of the kinases, KinA, KinC, -KinD (ykvD) and KinE (ykrQ), using abrB transcription as an indicator -of Spo0A~P level,revealed that KinC and KinD were responsible for -Spo0A~P production during the exponential phase of growth in the absence -of KinA and KinB. -``` - -56 words + 8 punct: 4.5 seconds (1.45 secs June 2014) -(0.38 secs, SAT June 2014) (broken, Feb 2017): -```text -New York Post: The new Mel Brooks/Susan Stroman musical extravaganza ... -is nearly very good indeed - but it is not the The Producers ... -this story ... does not lend itself to stage adaptation in the way of -the earlier movie ... Now for the good news ... Brooks and Stroman -pull out every stop. -``` - -57 words + 10 punct: 7.5 seconds (6.8 seconds June 2014) -(0.68 secs, SAT June 2014) (4.5 seconds 4.3.15 Feb 2017): -```text -However, the few tracts, the poetry, and the novels that embodied the -social vision of Young England were directed to a New Generation of -educated, religious, and socially conscious conservatives, who, like -Young Englanders, were appalled at the despiritualizing effects of -industrialization and the perceived amorality of Benthamite philosophy, -which they blamed equally for Victorian social injustices. 
-``` - -73 words + 8 punct: 145 seconds: -```text -Cortes in his various letters again and again claims the Emperor's -patronage of his bold defiance of the Emperor's officers on the ground -that the latter in their action were moved solely by considerations of -their personal gain, whereas he, Cortes, was striving to endow his -sovereign with a rich new empire and boundless treasure whilst carrying -into the dark pagan land, at the sword's point, the gentle creed of the -Christian God. -``` +### Version 6.0 TODO list: +Version 6.0 will change `Sentence` to `Sentence*,` `Linkage` to +`Linkage*` in the API. But perhaps this is a bad idea... diff -Nru link-grammar-5.3.16/tests/dict-reopen.cc link-grammar-5.5.0/tests/dict-reopen.cc --- link-grammar-5.3.16/tests/dict-reopen.cc 2017-02-12 12:47:43.000000000 +0000 +++ link-grammar-5.5.0/tests/dict-reopen.cc 2018-01-08 06:37:08.000000000 +0000 @@ -9,7 +9,7 @@ /* */ /***************************************************************************/ -// This implelements a simple check, opening and closing the dictionary +// This implements a simple check, opening and closing the dictionary // repeatedly. #include diff -Nru link-grammar-5.3.16/tests/Makefile.am link-grammar-5.5.0/tests/Makefile.am --- link-grammar-5.3.16/tests/Makefile.am 2017-02-12 12:45:29.000000000 +0000 +++ link-grammar-5.5.0/tests/Makefile.am 2018-01-11 19:29:21.000000000 +0000 @@ -12,7 +12,7 @@ # ----------------------------------------------------------- # TESTS declares the tests to actually run; # check_PROGRAMS are the binaries to build. 
-check_PROGRAMS = dict-reopen multi-thread mem-leak +check_PROGRAMS = dict-reopen multi-dict multi-thread mem-leak if HAVE_JAVA check_PROGRAMS += multi-java @@ -26,6 +26,7 @@ LDFLAGS += $(LINK_CXXFLAGS) dict_reopen_SOURCES = dict-reopen.cc +multi_dict_SOURCES = multi-dict.cc multi_thread_SOURCES = multi-thread.cc mem_leak_SOURCES = mem-leak.cc @@ -34,6 +35,7 @@ LDADD += $(SQLITE3_LIBS) endif +multi_dict_LDADD = -lpthread $(LDADD) multi_thread_LDADD = -lpthread $(LDADD) if WITH_SAT_SOLVER diff -Nru link-grammar-5.3.16/tests/Makefile.in link-grammar-5.5.0/tests/Makefile.in --- link-grammar-5.3.16/tests/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/tests/Makefile.in 2018-04-29 21:40:04.000000000 +0000 @@ -91,8 +91,8 @@ POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ -check_PROGRAMS = dict-reopen$(EXEEXT) multi-thread$(EXEEXT) \ - mem-leak$(EXEEXT) $(am__EXEEXT_1) +check_PROGRAMS = dict-reopen$(EXEEXT) multi-dict$(EXEEXT) \ + multi-thread$(EXEEXT) mem-leak$(EXEEXT) $(am__EXEEXT_1) @HAVE_JAVA_TRUE@am__append_1 = multi-java @HAVE_JAVA_TRUE@am__append_2 = $(JAVA_CPPFLAGS) @HAVE_SQLITE_TRUE@am__append_3 = $(SQLITE3_LIBS) @@ -128,11 +128,14 @@ mem_leak_LDADD = $(LDADD) mem_leak_DEPENDENCIES = $(am__DEPENDENCIES_2) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_3) +am_multi_dict_OBJECTS = multi-dict.$(OBJEXT) +multi_dict_OBJECTS = $(am_multi_dict_OBJECTS) +am__DEPENDENCIES_4 = $(am__DEPENDENCIES_2) $(am__DEPENDENCIES_1) \ + $(am__DEPENDENCIES_3) +multi_dict_DEPENDENCIES = $(am__DEPENDENCIES_4) am__multi_java_SOURCES_DIST = multi-java.cc @HAVE_JAVA_TRUE@am_multi_java_OBJECTS = multi-java.$(OBJEXT) multi_java_OBJECTS = $(am_multi_java_OBJECTS) -am__DEPENDENCIES_4 = $(am__DEPENDENCIES_2) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_3) @HAVE_JAVA_TRUE@multi_java_DEPENDENCIES = $(am__DEPENDENCIES_4) am_multi_thread_OBJECTS = multi-thread.$(OBJEXT) multi_thread_OBJECTS = $(am_multi_thread_OBJECTS) @@ -172,9 +175,11 @@ am__v_CXXLD_0 = @echo " 
CXXLD " $@; am__v_CXXLD_1 = SOURCES = $(dict_reopen_SOURCES) $(mem_leak_SOURCES) \ - $(multi_java_SOURCES) $(multi_thread_SOURCES) + $(multi_dict_SOURCES) $(multi_java_SOURCES) \ + $(multi_thread_SOURCES) DIST_SOURCES = $(dict_reopen_SOURCES) $(mem_leak_SOURCES) \ - $(am__multi_java_SOURCES_DIST) $(multi_thread_SOURCES) + $(multi_dict_SOURCES) $(am__multi_java_SOURCES_DIST) \ + $(multi_thread_SOURCES) am__can_run_installinfo = \ case $$AM_UPDATE_INFO_DIR in \ n|no|NO) false;; \ @@ -412,6 +417,7 @@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ANTfound = @ANTfound@ AR = @AR@ +AS = @AS@ ASPELL_CFLAGS = @ASPELL_CFLAGS@ ASPELL_LIBS = @ASPELL_LIBS@ AUTOCONF = @AUTOCONF@ @@ -442,6 +448,7 @@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ +HOST_OS = @HOST_OS@ HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ HUNSPELL_LIBS = @HUNSPELL_LIBS@ INSTALL = @INSTALL@ @@ -450,13 +457,16 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ +JAVA_SRCDIR = @JAVA_SRCDIR@ LD = @LD@ LDFLAGS = @LDFLAGS@ $(LINK_CXXFLAGS) +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LG_DEFS = @LG_DEFS@ LG_PYDIR = @LG_PYDIR@ LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ @@ -523,12 +533,12 @@ STRIP = @STRIP@ SWIG = @SWIG@ SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ WARN_CFLAGS = @WARN_CFLAGS@ WARN_CXXFLAGS = @WARN_CXXFLAGS@ ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -598,10 +608,12 @@ @HAVE_JAVA_TRUE@multi_java_LDADD = -L$(top_builddir)/bindings/java-jni/ -llink-grammar-java -lpthread $(LDADD) TESTS = $(check_PROGRAMS) dict_reopen_SOURCES = dict-reopen.cc +multi_dict_SOURCES = multi-dict.cc multi_thread_SOURCES = multi-thread.cc 
mem_leak_SOURCES = mem-leak.cc LDADD = -L$(top_builddir)/link-grammar/ -llink-grammar $(am__append_3) \ $(am__append_4) $(am__append_5) +multi_dict_LDADD = -lpthread $(LDADD) multi_thread_LDADD = -lpthread $(LDADD) all: all-am @@ -654,6 +666,10 @@ @rm -f mem-leak$(EXEEXT) $(AM_V_CXXLD)$(CXXLINK) $(mem_leak_OBJECTS) $(mem_leak_LDADD) $(LIBS) +multi-dict$(EXEEXT): $(multi_dict_OBJECTS) $(multi_dict_DEPENDENCIES) $(EXTRA_multi_dict_DEPENDENCIES) + @rm -f multi-dict$(EXEEXT) + $(AM_V_CXXLD)$(CXXLINK) $(multi_dict_OBJECTS) $(multi_dict_LDADD) $(LIBS) + multi-java$(EXEEXT): $(multi_java_OBJECTS) $(multi_java_DEPENDENCIES) $(EXTRA_multi_java_DEPENDENCIES) @rm -f multi-java$(EXEEXT) $(AM_V_CXXLD)$(CXXLINK) $(multi_java_OBJECTS) $(multi_java_LDADD) $(LIBS) @@ -670,6 +686,7 @@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dict-reopen.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mem-leak.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/multi-dict.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/multi-java.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/multi-thread.Po@am__quote@ @@ -902,6 +919,13 @@ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +multi-dict.log: multi-dict$(EXEEXT) + @p='multi-dict$(EXEEXT)'; \ + b='multi-dict'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) multi-thread.log: multi-thread$(EXEEXT) @p='multi-thread$(EXEEXT)'; \ diff -Nru link-grammar-5.3.16/tests/multi-dict.cc link-grammar-5.5.0/tests/multi-dict.cc --- link-grammar-5.3.16/tests/multi-dict.cc 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/tests/multi-dict.cc 2018-01-11 
20:07:05.000000000 +0000 @@ -0,0 +1,107 @@ +/*************************************************************************/ +/* Copyright (c) 2014,2018 Linas Vepstas */ +/* All rights reserved */ +/* */ +/* Use of the link grammar parsing system is subject to the terms of the */ +/* license set forth in the LICENSE file included with this software. */ +/* This license allows free redistribution and use in source and binary */ +/* forms, with or without modification, subject to certain conditions. */ +/* */ +/*************************************************************************/ + +// This implements a very simple-minded multi-threaded unit test. +// All it does is to make sure the system doesn't crash e.g. due to +// memory allocation conflicts. + +#include +#include + +#include +#include +#include +#include "link-grammar/link-includes.h" + +static void parse_one_sent(const char *sent_str) +{ + Parse_Options opts = parse_options_create(); + // Dictionary dict = dictionary_create_lang("ru"); + Dictionary dict = dictionary_create_lang("en"); + if (!dict) { + fprintf (stderr, "Fatal error: Unable to open the dictionary\n"); + exit(1); + } + + Sentence sent = sentence_create(sent_str, dict); + if (!sent) { + fprintf (stderr, "Fatal error: Unable to create parser\n"); + exit(2); + } + + sentence_split(sent, opts); + int num_linkages = sentence_parse(sent, opts); + if (num_linkages <= 0) { + fprintf (stderr, "Fatal error: Unable to parse sentence\n"); + exit(3); + } + + if (2 < num_linkages) num_linkages = 2; + for (int li = 0; li thread_pool; + for (int i=0; i < n_threads; i++) { + thread_pool.push_back(std::thread(parse_sents, i, niter)); + } + + // Wait for all threads to complete + for (std::thread& t : thread_pool) t.join(); + printf("Done with multi-threaded parsing\n"); + + return 0; +} diff -Nru link-grammar-5.3.16/tests/multi-thread.cc link-grammar-5.5.0/tests/multi-thread.cc --- link-grammar-5.3.16/tests/multi-thread.cc 2017-02-12 12:32:15.000000000 +0000 +++ 
link-grammar-5.5.0/tests/multi-thread.cc 2018-01-24 01:19:58.000000000 +0000 @@ -18,13 +18,24 @@ #include #include +#include #include "link-grammar/link-includes.h" static void parse_one_sent(Dictionary dict, Parse_Options opts, const char *sent_str) { Sentence sent = sentence_create(sent_str, dict); + if (!sent) { + fprintf (stderr, "Fatal error: Unable to create parser\n"); + exit(2); + } sentence_split(sent, opts); int num_linkages = sentence_parse(sent, opts); +#if 0 + if (num_linkages <= 0) { + fprintf (stderr, "Fatal error: Unable to parse sentence\n"); + exit(3); + } +#endif if (0 < num_linkages) { if (10 < num_linkages) num_linkages = 10; @@ -32,7 +43,9 @@ for (int li = 0; li thread_pool; - for (int i=0; i < n_threads; i++) { + for (int i=0; i < n_threads; i++) + { + Dictionary dict = dicte; + if (0 == i%3) dict = dictr; + + Parse_Options opts = optsa; + if (0 == i%2) opts = optsb; + thread_pool.push_back(std::thread(parse_sents, dict, opts, i, niter)); } @@ -100,7 +158,9 @@ for (std::thread& t : thread_pool) t.join(); printf("Done with multi-threaded parsing\n"); - dictionary_delete(dict); - parse_options_delete(opts); + dictionary_delete(dicte); + dictionary_delete(dictr); + parse_options_delete(optsa); + parse_options_delete(optsb); return 0; } diff -Nru link-grammar-5.3.16/viterbi/atom.cc link-grammar-5.5.0/viterbi/atom.cc --- link-grammar-5.3.16/viterbi/atom.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/atom.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,368 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#include -#include -#include -#include - -#include "atom.h" -#include "utilities.h" -#include "compile-base.h" - -namespace atombase { - -using namespace std; - -bool TV::operator==(const TV& other) const -{ - // The ULP for single-precision floating point is approx 1.0e-7.2 - if (fabs(other._strength - _strength) < 1.0e-6) return true; - return false; -} - -// ==================================================== - -// Single, global mutex for locking the incoming set. -std::mutex Atom::IncomingSet::_mtx; - -// Destructor. -Atom::~Atom() -{ - drop_incoming_set(); -} - -/// Start tracking the incoming set for this atom. -/// An atom can't know what it's incoming set is, until this method -/// is called. If this atom is added to any links before this call -/// is made, those links won't show up in the incoming set. -/// -/// We don't automatically track incoming sets for two reasons: -/// 1) std::set takes up 48 bytes -/// 2) adding and remoiving uses up cpu cycles. -/// Thus, if the incoming set isn't needed, then don't bother -/// tracking it. -void Atom::keep_incoming_set() -{ - if (_incoming_set) return; - _incoming_set = new IncomingSet; -} - -/// Stop tracking the incoming set for this atom. -/// After this call, the incoming set for this atom can no longer -/// be queried; it si erased. -void Atom::drop_incoming_set() -{ - if (NULL == _incoming_set) return; - std::lock_guard lck (_incoming_set->_mtx); - - // Perform an explicit free and delete; this alleviates - // pressure on the garbage collector. - _incoming_set->_iset.clear(); - delete _incoming_set; - _incoming_set = NULL; -} - -// #define WEAK_POINTER_HASH 0x5555555555555555UL -#define WEAK_POINTER_HASH 0x0UL - -/// Add an atom to the incoming set. 
-void Atom::insert_atom(Link* a) -{ - if (NULL == _incoming_set) return; - std::lock_guard lck (_incoming_set->_mtx); - // Create a weak pointer, hidden from GC, by XOR'ing it. I think - // thi si the right thing to do. The Boehm GC docs suggest that - // GC_register_disappearing_link() be used, but I think that's only - // when registering finalizers, which we don't do. - WeakLinkPtr wa = (WeakLinkPtr) a; - _incoming_set->_iset.insert(wa ^ WEAK_POINTER_HASH); -} - -/// Remove an atom from the incoming set. -void Atom::remove_atom(Link* a) -{ - if (NULL == _incoming_set) return; - std::lock_guard lck (_incoming_set->_mtx); - WeakLinkPtr wa = (WeakLinkPtr) a; - _incoming_set->_iset.erase(wa ^ WEAK_POINTER_HASH); -} - -/// Return a copy of the entire incoming set of this atom. -/// -/// This returns a copy of the incoming set at the time it was called. -/// This call is thread-safe, and thread-consistent (i.e. the incoming -/// set is guaranteed not to get smaller for as long as the link is -/// held; it may, however, get larger, if there are any atoms creted -/// after this method returns. -Set* Atom::get_incoming_set() const -{ - if (NULL == _incoming_set) return new Set(); - std::function filter = [&](Link* a) -> Atom* - { - return a; - }; - return filter_iset(filter); -#ifdef EQUIVALENT_TO_ABOVE - // The above expands out into this. - std::unique_lock lck (_incoming_set->_mtx); - OutList oset; - std::set::iterator it = _incoming_set->_iset.begin(); - std::set::iterator end = _incoming_set->_iset.end(); - for (; it != end; ++it) - { - oset.push_back(*it); - } - lck.unlock(); - return new Set(oset); -#endif -} - -/// Like above, but filtering for type. 
-Set* Atom::get_incoming_set(AtomType type) const -{ - if (NULL == _incoming_set) return new Set(); - std::function filter = [type](Link* a) -> Atom* - { - if (a->get_type() != type) return NULL; - return a; - }; - return filter_iset(filter); -} - -// ==================================================== - -/// Add a nemed relation to the atom. -/// This method only creates named binary relations. -Relation* Atom::add_relation(const char* name, Atom* val) -{ - // We need to keep the incoming set, else the relation will not be findable. - keep_incoming_set(); - return new Relation(name, this, val); -} - -Set* Atom::filter_iset(std::function filter) const -{ - if (NULL == _incoming_set) return new Set(); - std::unique_lock lck (_incoming_set->_mtx); - OutList oset; - std::set::iterator it = _incoming_set->_iset.begin(); - std::set::iterator end = _incoming_set->_iset.end(); - for (; it != end; ++it) - { - WeakLinkPtr wa = *it; - wa = wa ^ WEAK_POINTER_HASH; - Link* l = (Link*) wa; - Atom* a = filter(l); - if (a) oset.push_back(a); - } - - // Unlock the mutex before calling new, below. 
- lck.unlock(); - return new Set(oset); -} - -/// Get the set of all named relations -Set* Atom::get_relations(const char* name) const -{ - Label* lab = new Label(name); - std::function filter = [this, lab](Link* a) -> Atom* - { - if (RELATION != a->get_type()) return NULL; - assert(1 < a->get_arity(), "Relation cannot be zero-ary"); - if (lab != a->get_outgoing_atom(0)) return NULL; - assert(this == a->get_outgoing_atom(1), "Corrupted outgoing set for relation"); - return a; - }; - return filter_iset(filter); -} - -/// Get the set of all the values of the named relations -Set* Atom::get_relation_vals(const char* name) const -{ - Label* lab = new Label(name); - std::function filter = [this, lab](Link* a) -> Atom* - { - if (RELATION != a->get_type()) return NULL; - assert(3 == a->get_arity(), "Expecting binary relation"); - if (lab != a->get_outgoing_atom(0)) return NULL; - assert(this == a->get_outgoing_atom(1), "Corrupted outgoing set for relation"); - return a->get_outgoing_atom(2); - }; - return filter_iset(filter); -} - -// ==================================================== - -/// Constructor. Place self into incoming set. -/// For every atom in the outgoing set of this link, add this link -/// to that atom's incoming set. -void Link::add_to_incoming_set() -{ - size_t arity = get_arity(); - for (size_t i=0; iinsert_atom(this); -} - -/// Place self into incoming sets, but only they are of type t. -/// For every atom of type t in the outgoing set of this link, add this -/// link to that atom's incoming set. -void Link::add_to_incoming_set(AtomType t) -{ - size_t arity = get_arity(); - for (size_t i=0; iget_type() == t) - _oset[i]->insert_atom(this); -} - -/// Remove self from the incoming sets, if they are of type t. -/// For every atom of type t in the outgoing set of this link, remove -/// this link from that atom's incoming set. 
-void Link::remove_from_incoming_set(AtomType t) -{ - size_t arity = get_arity(); - for (size_t i=0; iget_type() == t) - _oset[i]->remove_atom(this); -} - -/// Enable the tracking of incoming sets for atoms of type t. -/// For every atom of type t in the outgoing set of this link, enable -/// incoming-set tracking, and add this link to that atom's incoming set. -void Link::enable_keep_incoming_set(AtomType t) -{ - size_t arity = get_arity(); - for (size_t i=0; iget_type() == t) - { - _oset[i]->keep_incoming_set(); - _oset[i]->insert_atom(this); - } -} - -/// Disable the tracking of incoming sets for atoms of type t. -/// For every atom of type t in the outgoing set of this link, disable -/// incoming-set tracking. -void Link::disable_keep_incoming_set(AtomType t) -{ - size_t arity = get_arity(); - for (size_t i=0; iget_type() == t) - { - _oset[i]->drop_incoming_set(); - } -} - -// Destructor. Remove self from incoming set. -// Note: with garbage collection, this destructor is never called -// (and that is how things should be). We keep it around here, for the -// rainy day when we swith to reference-counted pointers. -// -// Note also: if this ever was called during gc, e.g. as a finalizer, -// it will lead to deadlocks, since gc could get triggered by the call -// to insert_atom(), which aleady holds the same global lock that -// remove_atom() would use. i.e. using this in a finalizer will dead-lock -// (unless we convert to per-atom locks, which could be wasteful). -Link::~Link() -{ - // Alleviate pressure on the GC, whenever possible. - size_t arity = get_arity(); - - // Cast-away const so that we can trash the contents. 
- OutList& os = (OutList&) _oset; - for (size_t i=0; iremove_atom(this); - os[i] = NULL; - } - os.resize(0); -} - -// ==================================================== - -bool Atom::operator==(const Atom* other) const -{ - if (!other) return false; - if (other == this) return true; - if (other->_type == this->_type and - other->_tv == this->_tv) return true; - return false; -} - -bool Node::operator==(const Atom* other) const -{ - if (!other) return false; - if (other == this) return true; - if (other->get_type() != this->get_type()) return false; - if (not other->_tv.operator==(this->_tv)) return false; - const Node* nother = dynamic_cast(other); - if (nother->_name != this->_name) return false; - return true; -} - -bool Link::operator==(const Atom* other) const -{ - if (!other) return false; - if (other == this) return true; - if (other->get_type() != this->get_type()) return false; - if (not other->_tv.operator==(this->_tv)) return false; - const Link* lother = dynamic_cast(other); - if (lother->get_arity() != this->get_arity()) return false; - for (size_t i=0; i_oset[i]->operator==(this->_oset[i]))) return false; - } - return true; -} - - -std::ostream& do_prt(std::ostream& out, const Atom* a, int ilvl) -{ - static const char *indent_str = " "; - const Node* n = dynamic_cast(a); - if (n) - { - for (int i=0; iget_type() << " : " << n->get_name(); - if (0.0f != n->_tv._strength) - out << " (" << n->_tv._strength << ")"; - out << endl; - return out; - } - const Link* l = dynamic_cast(a); - if (l) - { - for (int i=0; iget_type() <<" :"; - if (0.0f != l->_tv._strength) - out << " (" << l->_tv._strength << ")"; - out << endl; - - ilvl++; - size_t lsz = l->get_arity(); - for (size_t i=0; i < lsz; i++) - { - do_prt(out, l->get_outgoing_atom(i), ilvl); - } - return out; - } - - out << "xxx-null-ptr-xxx"; - return out; -} - -std::ostream& operator<<(std::ostream& out, const Atom* a) -{ - return do_prt(out, a, 0); -} - -} // namespace atombase - diff -Nru 
link-grammar-5.3.16/viterbi/atom.h link-grammar-5.5.0/viterbi/atom.h --- link-grammar-5.3.16/viterbi/atom.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/atom.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,258 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _ATOMBASE_ATOM_H -#define _ATOMBASE_ATOM_H - -#include -#include -#include -#include -#include - -#include -#include - -#include "atom-types.h" - -namespace atombase { - -// Classes generally resembling those of the OpenCog AtomSpace -// These are tailored for use for the tracking task. - -/** - * TV (truth value): strength or likelihood of a link. - * - * Actually, we store the log-likelihood here, in units of bits, - * rather than the probability. This makes the numbers more - * comprehensible and easier to read and debug. To obtain the - * probability (likelihood), just raise 2 to minus this value. - * - * Measuring in bits allows us to conflate ideas of energy, entropy, - * complexity, cost. In particular, long linkages will get a complexity - * cost, whereas certain disjuncts have an innate cost, obtained from - * entropy principles. These can be added together; they'e on the same - * scale. - */ -class TV -{ - public: - TV(float likeli=0.0f) : _strength(likeli) {} - float _strength; - bool operator==(const TV&) const; - - /// Log-likelihoods (costs, energies, entropies) add. 
- TV& operator+=(const TV& other) - { - _strength += other._strength; - return *this; - } - - const TV operator+(const TV& other) const - { - return TV(*this) += other; - } -}; - -/* Base class for Nodes and Links */ -/** - * Atoms are not mutable, except for the TV value. That is, you cannot - * change the type of the atom. In particular, all methods are const. - * - * The mutable TV value can cause problems. In particular, when - * propagating costs upwards when putting mixed expressions into DNF, - * this mutability can mess things up. The work-around for this is to - * have a clone() function. I'm not sure I like this. Its ugly, because - * of course, once an atom is in the atom space, its unique, and not clonable. - * Ick. Perhaps TV should not be mutable?? - * - * All atoms are automatically garbage-collected. - */ -class Link; -class Relation; -class Set; -class Atom : public gc -{ - public: - Atom(AtomType type, const TV& tv = TV()) : - _tv(tv), _type(type), _incoming_set(NULL) {} - virtual ~Atom(); - AtomType get_type() const { return _type; } - TV _tv; - - void keep_incoming_set(); - void drop_incoming_set(); - Set* get_incoming_set() const; - Set* get_incoming_set(AtomType) const; - - Relation* add_relation(const char*, Atom*); - Set* get_relations(const char*) const; - Set* get_relation_vals(const char*) const; - - virtual bool operator==(const Atom*) const; - virtual Atom* clone() const = 0; - Atom* upcaster(); - protected: - friend class Link; // wtf ??? - void insert_atom(Link*); - void remove_atom(Link*); - - const AtomType _type; - - typedef unsigned long int WeakLinkPtr; - struct IncomingSet : public gc - { - // Just right now, we will use a single shared mutex for all - // locking on the incoming set. If this causes too much - // contention, then we can fall back to a non-global lock, - // at the cost of 40 additional bytes per atom. - static std::mutex _mtx; - // incoming set is not tracked by garbage collector, - // to avoid cyclic references. 
- // std::set uses 48 bytes (per atom). - std::set, gc_allocator > _iset; - }; - IncomingSet* _incoming_set; - - Set* filter_iset(std::function) const; -}; - -/// Given an atom of a given type, return the C++ class of that type. -template -T upcast(Atom* a) -{ - T t = dynamic_cast(a); - if (t) return t; - return dynamic_cast(a->upcaster()); -} - -typedef std::basic_string, gc_allocator > NameString; -/** - * A Node may be - * -- a word (the std::string holds the word) - * -- a link (the std::string holds the link) - * -- a disjunct (the std::string holds the disjunct) - * -- etc. - * Nodes are immuatble; the name can be set but not changed. - * Note: all methods are const. - */ -class Node : public Atom -{ - public: - Node(const char* n, const TV& tv = TV()) - : Atom(NODE, tv), _name(n) {} - - Node(const NameString& n, const TV& tv = TV()) - : Atom(NODE, tv), _name(n) {} - - Node(AtomType t, const NameString& n, const TV& tv = TV()) - : Atom(t, tv), _name(n) {} - - const NameString& get_name() const { return _name; } - - virtual bool operator==(const Atom*) const; - virtual Node* clone() const { return new Node(*this); } - protected: - const NameString _name; -}; - - -/// All outgoing lists will be handled as vectors. -// Must use the bdw-gc allocator to track these pointers. -// If this is not done, the GC will fail to see the pointers here. -#if __cplusplus > 199711L -// using requires C++11 -template -using AtomList = std::vector >; -typedef AtomList OutList; -#else -typedef std::vector > OutList; -#endif - -/** - * Links hold a bunch of atoms - * Links are immutable; the outgoing set cannot be changed. - * Note: all methods are const. 
- */ -class Link : public Atom -{ - public: - // The main ctor - Link(AtomType t, const OutList& oset, const TV& tv = TV()) - : Atom(t, tv), _oset(oset) - { add_to_incoming_set(); } - Link(AtomType t, const TV& tv = TV()) - : Atom(t, tv) - { add_to_incoming_set(); } - Link(AtomType t, Atom* a, const TV& tv = TV()) - : Atom(t, tv), _oset(1, a) - { add_to_incoming_set(); } - Link(AtomType t, Atom* a, Atom*b, const TV& tv = TV()) - : Atom(t, tv), _oset(({OutList o(1,a); o.push_back(b); o;})) - { add_to_incoming_set(); } - Link(AtomType t, Atom* a, Atom* b, Atom* c, const TV& tv = TV()) - : Atom(t, tv), _oset(({OutList o(1,a); o.push_back(b); - o.push_back(c); o;})) - { add_to_incoming_set(); } - Link(AtomType t, Atom* a, Atom* b, Atom* c, Atom* d, const TV& tv = TV()) - : Atom(t, tv), _oset(({OutList o(1,a); o.push_back(b); - o.push_back(c); o.push_back(d); o;})) - { add_to_incoming_set(); } - Link(AtomType t, Atom* a, Atom* b, Atom* c, Atom* d, Atom* e, const TV& tv = TV()) - : Atom(t, tv), _oset(({OutList o(1,a); o.push_back(b); - o.push_back(c); o.push_back(d); - o.push_back(e); o;})) - { add_to_incoming_set(); } - virtual ~Link(); - - size_t get_arity() const { return _oset.size(); } - Atom* get_outgoing_atom(size_t pos) const { return _oset.at(pos); } - const OutList& get_outgoing_set() const { return _oset; } - - void enable_keep_incoming_set(AtomType); - void disable_keep_incoming_set(AtomType); - void add_to_incoming_set(); - void add_to_incoming_set(AtomType); - void remove_from_incoming_set(AtomType); - Link* append(Atom*) const; - Link* replace(Atom*, Atom*) const; - - virtual bool operator==(const Atom*) const; - virtual Link* clone() const { return new Link(*this); } - protected: - // Outgoing set is const, not modifiable. - const OutList _oset; -}; - - -// An unhygenic for-each loop, to simplify iterating over -// the outgoing set. I don't see a more elegant way to do this, -// just right now... 
-// Anyway, this implements the semantics "foreach VAR of TYPENAME in LNK" -#define foreach_outgoing(TYPENAME,VAR,LNK) \ - const atombase::Link* _ll_##VAR; \ - size_t _ii_##VAR, _ee_##VAR; \ - atombase::Atom* _aa_##VAR; \ - TYPENAME VAR; \ - for (_ll_##VAR = (LNK), _ii_##VAR = 0, \ - _ee_##VAR = _ll_##VAR->get_arity(); \ - _aa_##VAR = (_ii_##VAR < _ee_##VAR) ? \ - _ll_##VAR->get_outgoing_atom(_ii_##VAR) : 0x0, \ - VAR = dynamic_cast(_aa_##VAR), \ - _ii_##VAR < _ee_##VAR; \ - _ii_##VAR++) - - -std::ostream& operator<<(std::ostream& out, const Atom*); - -} // namespace atombase - -#endif // _ATOMBASE_ATOM_H diff -Nru link-grammar-5.3.16/viterbi/atom-types.cc link-grammar-5.5.0/viterbi/atom-types.cc --- link-grammar-5.3.16/viterbi/atom-types.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/atom-types.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,58 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#include "atom-types.h" - -namespace atombase { - -const std::string type_name(AtomType t) -{ - switch(t) - { - // Generic node types - case NODE: return "NODE"; - case INDEX: return "INDEX"; - case LABEL: return "LABEL"; - case NUMBER: return "NUMBER"; - - // Viterbi-specific node types - case WORD: return "WORD"; - case LING_TYPE: return "LING_TYPE"; - case CONNECTOR: return "CONNECTOR"; - - // Generic link types - case LINK: return "LINK"; - case RELATION: return "RELATION"; - case SEQ: return "SEQ"; - case SET: return "SET"; - case UNIQ: return "UNIQ"; - case OR: return "OR"; - case AND: return "AND"; - - // Viterbi-specific link types - case WORD_CSET: return "WORD_CSET"; - case WORD_DISJ: return "WORD_DISJ"; - case LING: return "LING"; - case STATE_TRIPLE: return "STATE_TRIPLE"; - case RULE: return "RULE"; - } - - return "UNHANDLED_TYPE_NAME"; -} - -std::ostream& operator<<(std::ostream& out, AtomType t) -{ - out << type_name(t); - return out; -} - -} // namespace atombase - diff -Nru link-grammar-5.3.16/viterbi/atom-types.h link-grammar-5.5.0/viterbi/atom-types.h --- link-grammar-5.3.16/viterbi/atom-types.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/atom-types.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,57 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _ATOMBASE_ATOM_TYPES_H -#define _ATOMBASE_ATOM_TYPES_H - -#include - -namespace atombase { - -// Atom types. Right now an enum, but maybe should be dynamic!? -enum AtomType -{ - // Generic Node types - NODE = 1, - INDEX, - LABEL, - NUMBER, - - // Viterbi-specific Node types - WORD, // a word - LING_TYPE, // a pair of merged connectors (LG LINK TYPE) - // META, // special-word, e.g. LEFT-WALL, RIGHT-WALL - CONNECTOR, // e.g. S+ - - // Generic Link types - LINK, - RELATION, // model-theoretic relation (OpenCog ExecutionLink) - SET, // unordered multiset of children - UNIQ, // unordered set of children - SEQ, // ordered sequence of children - AND, // ordered AND of all children (order is important!) - OR, // unordered OR of all children - - // Viterbi-specific Link types - WORD_CSET, // word, followed by a set of connectors for that word. - WORD_DISJ, // word, followed by a single disjunct for that word. - LING, // two connected connectors, (LGLINK) e.g. Dmcn w/o direction info - STATE_TRIPLE, // Current pending input, parse state and corresponding output. - - RULE, // Base class for graph re-write rules -}; - -const std::string type_name(AtomType); -std::ostream& operator<<(std::ostream& out, AtomType t); - -} // namespace atombase - -#endif // _ATOMBASE_ATOM_TYPES_H diff -Nru link-grammar-5.3.16/viterbi/compile-base.cc link-grammar-5.5.0/viterbi/compile-base.cc --- link-grammar-5.3.16/viterbi/compile-base.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/compile-base.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,408 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. 
*/ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include - -#include "compile-base.h" -#include "utilities.h" // needed for assert - -namespace atombase { - - -/// Flatten a set. That is, remove the nested, recursive -/// structure of the set, and return all elements as just -/// one single set. For example, the flattened version of -/// {a, {b,c}} is {a, b, c} -/// -/// Any costs (i.e. truth values) associated with the removed -/// levels are accumulated onto the children. -/// -/// See also super_flatten(), which recursively flattens -/// anything deriving from class Set, recursively. -/// -// Note that this algo, as currently implemented, is order-preserving. -// This is important for the Seq class, and the And class (since the -// link-grammar AND must be an ordered sequence, to preserve planarity -// of the parses.) -OutList Set::flatset() const -{ - OutList newset; - foreach_outgoing(Atom*, a, this) - { - /* Copy without change, if types differ. */ - if (_type != a->get_type()) - { - newset.push_back(a); - continue; - } - - /* Get rid of a level */ - const TV& cost = a->_tv; - Set* ora = upcast(a); - OutList fora = ora->flatset(); - size_t osz = fora.size(); - for (size_t j=0; j_tv += cost; - newset.push_back(fora[j]); - } - } - - return newset; -} - -/// Recursively flatten everything that inherits from set. -/// This does preserve the type hierarchy: that is, types are respected, -/// and the flattening only happens within a single type. The only -/// exception to this is if the set contains just a single element, -/// in which case this returns that one element. -// -Atom* Set::super_flatten() const -{ - size_t sz = get_arity(); - - // If its a singleton, just return that. But super-flatten - // it first! Push the cost of this node down onto the child. 
- if (1 == sz) - { - Atom* a = get_outgoing_atom(0); - Set* set = upcast(a); - if (set) - a = set->super_flatten(); - a->_tv += _tv; - return a; - } - - OutList newset; - for (size_t i=0; i(a); - if (NULL == set) - { - newset.push_back(a); - continue; - } - - /* Copy without change, if types differ. */ - /* But flatten it first, if it inherits from set. */ - if (get_type() != a->get_type()) - { - newset.push_back(set->super_flatten()); - continue; - } - - // type if this equals type of children. - /* Get rid of a level */ - Atom* achld = set->super_flatten(); - Set* chld = dynamic_cast(achld); - if (!chld) - { - newset.push_back(achld); - continue; - } - - // perform the actual flattening, distributing the cost - // of the deleted atom onto its children. - foreach_outgoing(Atom*, c, chld) - { - c->_tv += a->_tv; - newset.push_back(c); - } - } - - return (new Link(get_type(), newset, _tv))->upcaster(); -} - -/// Set union (append) other set to this set. -Set* Set::sum(const Set* other) const -{ - if (0 == other->get_arity()) return new Set(*this); - - OutList o = get_outgoing_set(); - const OutList& oth = other->get_outgoing_set(); - o.insert(o.end(), oth.begin(), oth.end()); - return new Set(o); // XXX what about the tv ??? -} - -// ============================================================ - -/// Utility to create a list of uniq items, given a non-unique list. -OutList Uniq::uniqify(const OutList& ol) -{ - std::unordered_set us; - - // Force uniqueness by copying into a set, and then back out. - // I assume this is faster than sorting!? - auto end = ol.end(); - for (auto it = ol.begin(); it != end; it++) - us.insert(*it); - - OutList ret; - auto usend = us.end(); - for (auto ut = us.begin(); ut != usend; ut++) - ret.push_back(*ut); - - return ret; -} - -// ============================================================ -/// Remove repeated entries. 
-Or* Or::uniq() const -{ - OutList uniq; - size_t sz = get_arity(); - for (size_t i=0; ioperator==(b)) - { - is_uniq = false; - break; - } - } - - if (is_uniq) - uniq.push_back(a); - } - return new Or(uniq); -} - -// ============================================================ -/// Return disjunctive normal form (DNF) -/// -/// Presumes that the oset is a nested list consisting -/// of And and Or nodes. If the oset contains non-boolean -/// terms, these are left in place, unmolested. -/// -/// Costs are distributed over disjuncts. -/// -/// XXX Note: this somewhat duplicates the function of the -/// disjoin() subroutine defined in disjoin.cc ... -/// Note: this one is unit-tested, the other is not. -/// Note, however, that one handles optional clauses; this does not. -Atom* Or::disjoin() const -{ - // Trying to disjoin anything that is not in a flattened - // form is crazy-making. - Atom* sfl = super_flatten(); - Set* fl = dynamic_cast(sfl); - if (NULL == fl) return sfl; - - // If the flattening discarded the top-level Or, deal with it. - And* afl = dynamic_cast(sfl); - if (afl) return afl->disjoin(); - - // If we are not an Or link, then wtf!? - assert(dynamic_cast(sfl), "We are deeply confused disjoining!"); - - OutList dnf; - foreach_outgoing(Atom*, a, fl) - { - AtomType ty = a->get_type(); - if (AND == ty) - { - And* al = upcast(a); - Atom* a = al->disjoin(); - Link* l = dynamic_cast(a); - if (l) - { - foreach_outgoing(Atom*, b, l) - dnf.push_back(b); - } - else - { - dnf.push_back(a); - } - } - else if (OR == ty) - { -assert(0, "not expecting Or after flattening"); -#if 0 - Or* ol = upcast(a); - Or* l = ol->disjoin(); - TV cost = l->_tv; - foreach_outgoing(Atom*, aol, l) - { - // XXX We've got to distribute the cost, somehow, but - // I don't really like bumping it like this ... its not - // a pure play. 
- aol->_tv += cost; - dnf.push_back(aol); - } -#endif - } - else - dnf.push_back(a); - } - return new Or(dnf, fl->_tv); -} - -/// Return disjunctive normal form (DNF) -/// -/// Presumes that the oset is a nested list consisting -/// of And and Or nodes. If the oset contains non-boolean -/// terms, these are left in place, unmolested. -/// -/// Costs are distributed over disjuncts. -/// -/// XXX Note: this somewhat duplicates the function of the -/// disjoin() subroutine defined in disjoin.cc ... -/// Note: this one is unit-tested, the other is not. -/// Note, however, that one handles optional clauses; this does not. -Atom* And::disjoin() -{ - // Trying to disjoin anything that is not in a flattened - // form is crazy-making. - Atom* sfl = super_flatten(); - Set* fl = dynamic_cast(sfl); - if (NULL == fl) return sfl; - - // If the flattening discarded the top-level And, deal with it. - Or* orfl = dynamic_cast(sfl); - if (orfl) return orfl->disjoin(); - - // If we are not an And link, then wtf!? - assert(dynamic_cast(sfl), "And we are deeply confused!"); - - size_t sz = fl->get_arity(); - if (0 == sz) return new Or(fl->_tv); - if (1 == sz) return fl->get_outgoing_atom(0); - - // Perhaps there is nothing to be done, because none - // of the children are boolean operators. 
- bool done = true; - bool needs_flattening = false; - for (size_t i=0; iget_outgoing_atom(i)->get_type(); - if (AND == ty) - { - done = false; - needs_flattening = true; - } - else if (OR == ty) - done = false; - } - if (done) - { - return fl; - } - - // First, disjoin any child nodes - OutList* ol = new OutList(fl->get_outgoing_set()); -#if 0 - for (size_t i=0; iat(i); - AtomType ty = a->get_type(); - if (OR == ty) - { - Or* oo = upcast(a); - (*ol)[i] = oo->disjoin(); - } - else if (AND == ty) - { - And* aa = upcast(a); - (*ol)[i] = aa->disjoin(); - } - } -#endif - - // Next, flatten out any nested And's - while (needs_flattening) - { - bool did_flatten = false; - OutList* flat = new OutList(); - for (size_t i=0; iat(i); - AtomType ty = a->get_type(); - if (AND == ty) - { - did_flatten = true; - Link* l = dynamic_cast(a); - foreach_outgoing(Atom*, b, l) - flat->push_back(b); - } - else - flat->push_back(a); - } - - if (not did_flatten) break; - ol = flat; - } - - // Get the last element off of the list of and'ed terms - Atom* last = *(ol->rbegin()); - ol->pop_back(); - And shorter(*ol); - - // recurse ... - Atom* stumper = shorter.disjoin(); - Or* stumpy = dynamic_cast(stumper); - - if (!stumpy) - { - // If we are here, then the front was just a single atom, - // and the last element was (possibly) an OR-list. So - // distribute the front onto the tail. (using AND, of course) - OutList dnf; - - if (OR != last->get_type()) - last = new Or(last); - - // Costs distribute additively: AND over OR. - TV cost = fl->_tv + last->_tv; - - Link* ll = dynamic_cast(last); - foreach_outgoing(Atom*, tail, ll) - dnf.push_back(new And(stumper, tail, cost)); - - return new Or(dnf); - } - - // finally, distribute last elt back onto the end. - OutList dnf; - - if (OR != last->get_type()) - last = new Or(last); - - // Costs distribute additively: AND over OR. 
- TV cost = stumpy->_tv + fl->_tv + last->_tv; - - Link* ll = dynamic_cast(last); - foreach_outgoing(Atom*, tail, ll) - { - foreach_outgoing(Atom*, a, stumpy) - { - AtomType ty = a->get_type(); - if (AND == ty) - { - Link* l = dynamic_cast(a); - OutList al = l->get_outgoing_set(); - al.push_back(tail); - dnf.push_back(new And(al, cost + a->_tv)); - } - else - { - dnf.push_back(new And(a, tail, cost)); - } - } - } - return new Or(dnf); -} - -} // namespace atombase diff -Nru link-grammar-5.3.16/viterbi/compile-base.h link-grammar-5.5.0/viterbi/compile-base.h --- link-grammar-5.3.16/viterbi/compile-base.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/compile-base.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,328 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _ATOMBASE_COMPILE_H -#define _ATOMBASE_COMPILE_H - -#include -#include -#include "atom.h" - -namespace atombase { - -// Classes that convert run-time atom types into compile-time static -// types, so that the compiler can check these for correctness. -// These are here purely for C++ programming convenience; the true -// structure that matters is the dynamic run-time (hyper-)graphs. - -/// Simple string label node. -class Label : public Node -{ - public: - Label(const NameString& n, const TV& tv = TV()) - : Node(LABEL, n, tv) {} -}; - -/// Index, ID -/// Identification Node, holds one or several numeric ID values. -/// Intended primarily for debugging. 
-class Index : public Node -{ - public: - Index(int a, const TV& tv = TV()) - : Node(INDEX, ({ char buff[80]; snprintf(buff, 80, "%d", a); buff;}), tv) - {} - Index(int a, int b, const TV& tv = TV()) - : Node(INDEX, ({ char buff[80]; snprintf(buff, 80, "%d, %d", a, b); buff;}), tv) - {} - Index(unsigned int a, int b, int c, const TV& tv = TV()) - : Node(INDEX, ({ char buff[80]; snprintf(buff, 80, "%u, %d, %d", a, b, c); buff;}), tv) - {} - Index(double a, const TV& tv = TV()) - : Node(INDEX, ({ char buff[80]; snprintf(buff, 80, "%20.16f", a); buff;}), tv) - {} -}; - -/// Holder of one floating-poing number -class Number : public Node -{ - public: - Number(double a, const TV& tv = TV()) - : Node(NUMBER, ({ char buff[80]; snprintf(buff, 80, "%20.16g", a); buff;}), tv), - _value(a) - {} - double get_value() const { return _value; } - protected: - double _value; -}; - -/// Named relation, as defined in model theory. -/// -/// This consists of a name (Label), followed by an ordered sequence -/// of atoms. The atoms are all related by the relation. There are -/// many different ways of thinking about this: -/// 1) as a predicate: the relation is true/satisfied if it exists. -/// (the truth of which is given by the _tv, of course) -/// 2) as a graph of a function (in the sense of Bourbaki) -/// 3) as a (named) function, the first n atoms being the arguments, -/// the last being the value of the function for those arguments. -/// 4) the atom that is (more or less) equivalent to the OpenCog ExecutionLink -class Relation : public Link -{ - public: - // Binary relation; add others if needed. 
- Relation(Label* lab, Atom* arg, Atom* val, const TV& tv = TV()) - : Link(RELATION, ({OutList o(1,lab); o.push_back(arg); o.push_back(val); o;}), tv) - {} - Relation(const char* name, Atom* arg, Atom* val, const TV& tv = TV()) - : Link(RELATION, ({OutList o(1, new Label(name)); o.push_back(arg); o.push_back(val); o;}), tv) - {} -}; - -/// Unordered multiset -/// A Set inherits fom Link, and is an unordered set of zero or more -/// atoms. Properly speaking, it is a multi-set; the same atom may -/// appear more than once in the set. -class Set : public Link -{ - public: - Set(const TV& tv = TV()) - : Link(SET, tv) - {} - Set(const OutList& ol, const TV& tv = TV()) - : Link(SET, ol, tv) - {} - Set(Atom* singleton, const TV& tv = TV()) - : Link(SET, OutList(1, singleton), tv) - {} - Set(Atom* a, Atom* b, const TV& tv = TV()) - : Link(SET, ({OutList o(1,a); o.push_back(b); o;}), tv) - {} - Set(Atom* a, Atom* b, Atom* c, const TV& tv = TV()) - : Link(SET, ({OutList o(1,a); o.push_back(b); o.push_back(c); o;}), tv) - {} - Set(Atom* a, Atom* b, Atom* c, Atom* d, const TV& tv = TV()) - : Link(SET, ({OutList o(1,a); o.push_back(b); o.push_back(c); o.push_back(d); o;}), tv) - {} - Set(Atom* a, Atom* b, Atom* c, Atom* d, Atom* e, const TV& tv = TV()) - : Link(SET, ({OutList o(1,a); o.push_back(b); o.push_back(c); o.push_back(d); o.push_back(e); o;}), tv) - {} - - protected: - /// The sole purpose of this ctor is to allow inheritance. 
- Set(AtomType t, const TV& tv = TV()) - : Link(t, tv) - {} - Set(AtomType t, const OutList& oset, const TV& tv = TV()) - : Link(t, oset, tv) - {} - // Only for classes that inherit from Set - Set(AtomType t, Atom* singleton, const TV& tv = TV()) - : Link(t, OutList(1, singleton), tv) - {} - Set(AtomType t, Atom* a, Atom* b, const TV& tv = TV()) - : Link(t, ({OutList o(1,a); o.push_back(b); o;}), tv) - {} - Set(AtomType t, Atom* a, Atom* b, Atom* c, const TV& tv = TV()) - : Link(t, ({OutList o(1,a); o.push_back(b); o.push_back(c); o;}), tv) - {} - Set(AtomType t, Atom* a, Atom* b, Atom* c, Atom* d, const TV& tv = TV()) - : Link(t, ({OutList o(1,a); o.push_back(b); o.push_back(c); o.push_back(d); o;}), tv) - {} - Set(AtomType t, Atom* a, Atom* b, Atom* c, Atom* d, Atom* e, const TV& tv = TV()) - : Link(t, ({OutList o(1,a); o.push_back(b); o.push_back(c); o.push_back(d); o.push_back(e); o;}), tv) - {} - - public: - // See the C file for documentation - Set* flatten() const { return new Set(flatset(), _tv); } - Atom* super_flatten() const; - - // Set-union (append) other set to this set. - virtual Set* sum(const Set*) const; - - virtual Set* clone() const { return new Set(*this); } - - Set* append(Atom* a) const { return dynamic_cast(Link::append(a)); } - Set* replace(Atom* novi, Atom* ctari) const { - return dynamic_cast(Link::replace(novi, ctari)); - } - - protected: - OutList flatset() const; -}; - - -/// Unique set. An atom may appear at most once in the outgoing set. -/// Duplicates are removed during construction. 
-class Uniq : public Set -{ - public: - Uniq(const TV& tv = TV()) - : Set(UNIQ, tv) - {} - Uniq(const OutList& ol, const TV& tv = TV()) - : Set(UNIQ, uniqify(ol), tv) - {} - Uniq(Atom* singleton, const TV& tv = TV()) - : Set(UNIQ, uniqify(OutList(1, singleton)), tv) - {} - Uniq(Atom* a, Atom* b, const TV& tv = TV()) - : Set(UNIQ, uniqify(({OutList o(1,a); o.push_back(b); o;})), tv) - {} - Uniq(Atom* a, Atom* b, Atom* c, const TV& tv = TV()) - : Set(UNIQ, uniqify(({OutList o(1,a); o.push_back(b); o.push_back(c); o;})), tv) - {} - - // Special copy constructor - Uniq(Set* sset) - : Set(UNIQ, uniqify(sset->get_outgoing_set()), sset->_tv) - {} - - // XXX TODO we really should overload append, and enforce uniqueness. ... - protected: - static OutList uniqify(const OutList& ol); -}; - -/// Ordered sequence -/// Seq inherits from Set, and is an ordered sequence of zero or more -/// atoms. -class Seq : public Set -{ - public: - Seq() - : Set(SEQ) - {} - Seq(const OutList& ol, const TV& tv = TV()) - : Set(SEQ, ol, tv) - {} - Seq(Atom* singleton, const TV& tv = TV()) - : Set(SEQ, OutList(1, singleton), tv) - {} - Seq(Atom* a, Atom* b, const TV& tv = TV()) - : Set(SEQ, ({OutList o(1,a); o.push_back(b); o;}), tv) - {} - - // See the Set class for documentation - Seq* flatten() const { return new Seq(flatset(), _tv); } - - virtual Seq* clone() const { return new Seq(*this); } - - Seq* append(Atom* a) const { return dynamic_cast(Link::append(a)); } - - /// Get the first and last atoms in the sequence - Atom* get_first() const { - if (0 < get_arity()) return get_outgoing_atom(0); - else return NULL; - } - Atom* get_last() const { - size_t sz = get_arity(); - if (0 < sz) return get_outgoing_atom(sz-1); - else return NULL; - } - - - protected: - /// The sole purpose of this ctor is to allow inheritance. 
- Seq(AtomType t) - : Set(t) - {} - Seq(AtomType t, const OutList& oset, const TV& tv = TV()) - : Set(t, oset, tv) - {} - Seq(AtomType t, Atom* a, Atom* b, const TV& tv = TV()) - : Set(t, ({OutList o(1,a); o.push_back(b); o;}), tv) - {} - Seq(AtomType t, Atom* a, Atom* b, Atom* c, const TV& tv = TV()) - : Set(t, ({OutList o(1,a); o.push_back(b); o.push_back(c); o;}), tv) - {} - Seq(AtomType t, Atom* a, Atom* b, Atom* c, Atom* d, const TV& tv = TV()) - : Set(t, ({OutList o(1,a); o.push_back(b); o.push_back(c); o.push_back(d); o;}), tv) - {} -}; - -/// Unordered OR of all children -class Or : public Set -{ - public: - Or(const TV& tv = TV()) - : Set(OR, tv) - {} - Or(const OutList& ol, const TV& tv = TV()) - : Set(OR, ol, tv) - {} - Or(Atom* singleton, const TV& tv = TV()) - : Set(OR, OutList(1, singleton), tv) - {} - Or(Atom* a, Atom* b, const TV& tv = TV()) - : Set(OR, ({OutList o(1,a); o.push_back(b); o;}), tv) - {} - Or(Atom* a, Atom* b, Atom* c, const TV& tv = TV()) - : Set(OR, ({OutList o(1,a); o.push_back(b); o.push_back(c); o;}), tv) - {} - - // Return disjunctive normal form (DNF) - Atom* disjoin() const; - - // See the Set class for documentation - Or* flatten() const { return new Or(flatset(), _tv); } - - // Remove repeated entries - Or* uniq() const; - - virtual Or* clone() const { return new Or(*this); } - - Or* append(Atom* a) const { return dynamic_cast(Link::append(a)); } -}; - -/// Ordered sequence -/// And inherits from Seq, since the order of the atoms in -/// its outgoing set is important. 
-class And : public Seq -{ - public: - And() - : Seq(AND) - {} - And(const OutList& ol, const TV& tv = TV()) - : Seq(AND, ol, tv) - {} - And(Atom* singleton, const TV& tv = TV()) - : Seq(AND, OutList(1, singleton), tv) - {} - And(Atom* a, Atom* b, const TV& tv = TV()) - : Seq(AND, ({OutList o(1,a); o.push_back(b); o;}), tv) - {} - And(Atom* a, Atom* b, Atom* c, const TV& tv = TV()) - : Seq(AND, ({OutList o(1,a); o.push_back(b); o.push_back(c); o;}), tv) - {} - - - // Return disjunctive normal form (DNF) - // Does not modify this atom; just returns a new one. - Atom* disjoin(); - - // See the Set class for documentation - And* flatten() const { return new And(flatset(), _tv); } - - /// Remove optional clauses. - /// XXX Perhaps this should not be a method on this class... - Atom* clean() const; - - virtual And* clone() const { return new And(*this); } - - And * append(Atom* a) const { return dynamic_cast(Link::append(a)); } -}; - -} // namespace atombase - - -#endif // _ATOMBASE_COMPILE_H diff -Nru link-grammar-5.3.16/viterbi/compile.cc link-grammar-5.5.0/viterbi/compile.cc --- link-grammar-5.3.16/viterbi/compile.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/compile.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,120 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include "compile.h" - -// XXX temporary hack until dynamic types are supported !? 
-namespace atombase { -using namespace link_grammar::viterbi; - -// ============================================================ -/// Remove optional connectors. -/// -/// It doesn't make any sense at all to have an optional connector -/// in an AND-clause, so just remove it. (Well, OK, it "makes sense", -/// its just effectively a no-op, and so doesn't have any effect. So, -/// removing it here simplifies logic in other places.) -/// -/// The cost of the optional is passed up to the dsjunct. The reason for -/// this is that the doctionary contains entries such as -/// , whcih has (Xc+ or ) & MX- -/// After being disjoined, we need to pass that cost up. -Atom* And::clean() const -{ - TV tv = _tv; - OutList cl; - size_t sz = get_arity(); - - // Special case: it could be a and-clause containing a single, - // optional connector, in which case, we flatten the thing - // (returning the optional connector!) - if (1 == sz) - { - Atom* a = get_outgoing_atom(0); - a->_tv += _tv; - return a; - } - for (size_t i=0; i(get_outgoing_atom(i)); - if (cn and cn->is_optional()) - { - tv += cn->_tv; - continue; - } - - cl.push_back(_oset[i]); - } - - return new And(cl, tv); -} - -} // namespace atombase - -// ============================================================ - -namespace link_grammar { -namespace viterbi { - - -// Helper function for below. -static bool has_lefties(Atom* a) -{ - Connector* c = dynamic_cast(a); - if (c) - { - if ('-' == c->get_direction()) - return true; - return false; - } - - // Verify we've got a valid disjunct - AtomType at = a->get_type(); - assert ((at == OR) or (at == AND), "Disjunct, expecting OR or AND"); - - // Recurse down into disjunct - Link* l = dynamic_cast(a); - size_t sz = l->get_arity(); - for (size_t i=0; iget_outgoing_atom(i))) - return true; - } - return false; -} - -/// Return true if any of the connectors in the cset point to the left. 
-bool WordCset::has_left_pointers() const -{ - return has_lefties(get_cset()); -} - -/// Simplify any gratuituousnesting in the cset. -WordCset* WordCset::flatten() -{ - // AND and OR inherit from Set - Set* s = dynamic_cast(get_cset()); - if (NULL == s) - return this; - - Atom* flat = s->super_flatten(); - - // If there is nothing left after flattening, return NULL. - const Link* fl = dynamic_cast(flat); - if (fl && 0 == fl->get_arity()) - return NULL; - - return new WordCset(get_word(), flat); -} - - -} // namespace viterbi -} // namespace link-grammar diff -Nru link-grammar-5.3.16/viterbi/compile.h link-grammar-5.5.0/viterbi/compile.h --- link-grammar-5.3.16/viterbi/compile.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/compile.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,186 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _LG_VITERBI_COMPILELG_H -#define _LG_VITERBI_COMPILELG_H - -#include "utilities.h" // needed for assert - -#include "atom.h" -#include "compile-base.h" - -namespace link_grammar { -namespace viterbi { - -using namespace atombase; - -#define OPTIONAL_CLAUSE "0" - -// Classes that convert run-time atom types into compile-time static -// types, so that the compiler can check these for correctness. -// These are here purely for C++ programming convenience; the true -// structure that matters is the dynamic run-time (hyper-)graphs. 
- -class Connector : public Node -{ - public: - // Last letter of the connector must be + or - - // indicating the direction of the connector. - Connector(const NameString& name, const TV& tv = TV()) - : Node(CONNECTOR, name, tv) - { - if (name == OPTIONAL_CLAUSE) - return; - char dir = *name.rbegin(); - assert (('+' == dir) or ('-' == dir), "Bad direction"); - } - - bool is_optional() const - { - return _name == OPTIONAL_CLAUSE; - } - - char get_direction() const - { - return *_name.rbegin(); - } - virtual Connector* clone() const { return new Connector(*this); } -}; - -class LingType : public Node -{ - public: - LingType(const NameString& name, const TV& tv = TV()) - : Node(LING_TYPE, name, tv) - {} -}; - -class Word : public Node -{ - public: - Word(const NameString& name, const TV& tv = TV()) - : Node(WORD, name, tv) - {} -}; - -#if 0 -// Atom types. Right now an enum, but maybe should be dynamic!? -enum AtomType -{ - // Link types - WORD_DISJ, // word, followed by a single disjunct for that word. -}; -#endif - - -/// Create a ling-grammar link. This will be of the form: -/// LING: -/// Ling_TYPE "MVa" -/// Atom ... -/// Atom ... 
-/// where the Atoms are typically either connectors, or WORD_DISJ -/// -class Ling : public atombase::Link -{ - public: - Ling(const OutList& ol) - : Link(LING, ol) - { - assert(3 == ol.size(), "LG link wrong size"); - assert(ol[0]->get_type() == LING_TYPE, "LG link has bad first node"); - } - Ling(const NameString& str, Atom* a, Atom *b) - : Link(LING, new LingType(str), a, b) {} - - Ling(LingType* t, Atom* a, Atom *b) - : Link(LING, t, a, b) {} - - LingType* get_ling_type() const - { - return dynamic_cast(get_outgoing_atom(0)); - } - - Atom* get_left() const - { - return get_outgoing_atom(1); - } - Atom* get_right() const - { - return get_outgoing_atom(2); - } -}; - - -class WordCset : public atombase::Link -{ - public: - WordCset(Word* a, Atom* b) - : Link(WORD_CSET, a, b) - { - // this should be pointing at: - // WORD_CSET : - // WORD : blah.v - // AND : - // CONNECTOR : Wd- etc... - - assert(a->get_type() == WORD, "CSET is expecting WORD as first arg"); - bool ok = false; - ok = ok or b->get_type() == CONNECTOR; - ok = ok or b->get_type() == AND; - ok = ok or b->get_type() == OR; - assert(ok, "CSET is expecting connector set as second arg"); - } - - Word* get_word() const - { - return dynamic_cast(_oset[0]); - } - Atom* get_cset() const - { - return _oset[1]; - } - bool has_left_pointers() const; - WordCset* flatten(); -}; - -/// A triple of three sequences. The first sequence is a sequence of 'input' -/// words (a sentence or phrase that has not yet been parsed). The second -/// sequence is the current parse state. The third sequence is the 'output' -/// of the parse, i.e. a set of connected words. -/// -/// It of the form -/// -/// STATE_TRIPLE : -/// SEQ -/// WORD -/// WORD -/// SEQ -/// WORD_CSET ... -/// WORD_CSET ... -/// SET -/// LING ... -/// LING ... 
-/// -class StateTriple : public atombase::Link -{ - public: - StateTriple(Seq* input, Seq* state, Set* output) - : Link(STATE_TRIPLE, input, state, output) {} - Seq* get_input() const { return dynamic_cast(_oset.at(0)); } - Seq* get_state() const { return dynamic_cast(_oset.at(1)); } - Set* get_output() const { return dynamic_cast(_oset.at(2)); } -}; - - -} // namespace viterbi -} // namespace link-grammar - -#endif // _LG_VITERBI_COMPILELG_H diff -Nru link-grammar-5.3.16/viterbi/compress.cc link-grammar-5.5.0/viterbi/compress.cc --- link-grammar-5.3.16/viterbi/compress.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/compress.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,169 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include "compress.h" - -using namespace std; - -namespace link_grammar { -namespace viterbi { - -// Merge a pair of word-lists, if they are mergable. -// Internal-use only utility for the function below. -static Seq* merge_wordlists(Seq* wla, Seq* wlb) -{ - size_t sza = wla->get_arity(); - size_t szb = wlb->get_arity(); - if (sza != szb) - return NULL; - - bool already_merged = false; - OutList seq; - for (size_t i = 0; i < sza; i++) - { - Atom* a = wla->get_outgoing_atom(i); - Atom* b = wlb->get_outgoing_atom(i); - - if (a->operator==(b)) - { - seq.push_back(a); - continue; - } - - // If we are here, then the word csets differ. - if (already_merged) - return NULL; - already_merged = true; - - // Check to see if the words are the same. 
- WordCset* wcsa = dynamic_cast(a); - WordCset* wcsb = dynamic_cast(b); - Word* wa = wcsa->get_word(); - Word* wb = wcsb->get_word(); - - // Words differ, its not mergable. - if (not wa->operator==(wb)) - return NULL; - - // OK, all is well, merge the connectors. - Atom* csa = wcsa->get_cset(); - Atom* csb = wcsb->get_cset(); - Or* dj = new Or(csa, csb); - dj = dj->flatten(); - dj = dj->uniq(); - - WordCset* merge = new WordCset(wa, dj); - seq.push_back(merge); - } - - return new Seq(seq); -} - -/// Try to shrink down a set of alternatives by collapsing repeated -/// states into disjuncts. Best explained by example: Suppose that -/// the input is -/// SET : -/// STATE_TRIPLE : -/// SEQ : -/// WORD : is -/// WORD : a -/// WORD : test -/// SEQ : -/// WORD_CSET : -/// WORD : this -/// CONNECTOR : Ss*b+ -/// WORD_CSET : -/// WORD : LEFT-WALL -/// CONNECTOR : Wd+ -/// SEQ : -/// STATE_TRIPLE : -/// SEQ : -/// WORD : is -/// WORD : a -/// WORD : test -/// SEQ : -/// WORD_CSET : -/// WORD : this -/// CONNECTOR : Ss*b+ -/// WORD_CSET : -/// WORD : LEFT-WALL -/// CONNECTOR : Wi+ -/// SEQ : -/// -/// Then the compressed output will be: -/// SET : -/// STATE_TRIPLE : -/// SEQ : -/// WORD : is -/// WORD : a -/// WORD : test -/// SEQ : -/// WORD_CSET : -/// WORD : this -/// CONNECTOR : Ss*b+ -/// WORD_CSET : -/// WORD : LEFT-WALL -/// OR : -/// CONNECTOR : Wd+ -/// CONNECTOR : Wi+ -/// SEQ : -/// -/// Note how two alternative state triples were collapsed down into a -/// single word cset. The goal of this compression is to shrink down -/// the state to make the output more tractable, slightly less -/// combinatoric-explosion-y. -// -// XXX TODO: if we created a special atom that held only alternatives, -// then this would be a method on that that atom. Do we need such an -// atom? It could help avoid confusion ... 
- -Set* compress_alternatives(Set* state_alternatives) -{ - OutList alts; - Seq* prev_in = NULL; - Set* prev_out = NULL; - Seq* merged = NULL; - - foreach_outgoing(StateTriple*, sp, state_alternatives) - { - // If the inputs or the outputs differ, the state triples are - // fundamentally not mergable. Move along. - Seq* in = sp->get_input(); - Set* out = sp->get_output(); - if ((not in->operator==(prev_in)) or - (not out->operator==(prev_out))) - { - if (merged) - alts.push_back(new StateTriple(prev_in, merged, prev_out)); - prev_in = in; - prev_out = out; - merged = sp->get_state(); - continue; - } - - Seq* m = merge_wordlists(merged, sp->get_state()); - if (NULL == m) - { - alts.push_back(sp); - continue; - } - merged = m; - } - - if (merged) - alts.push_back(new StateTriple(prev_in, merged, prev_out)); - - return new Set(alts); -} - -} // namespace viterbi -} // namespace link-grammar - diff -Nru link-grammar-5.3.16/viterbi/compress.h link-grammar-5.5.0/viterbi/compress.h --- link-grammar-5.3.16/viterbi/compress.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/compress.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,26 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _LG_VITERBI_COMPRESS_H -#define _LG_VITERBI_COMPRESS_H - -#include "atom.h" -#include "compile.h" - -namespace link_grammar { -namespace viterbi { - -Set* compress_alternatives(Set*); - -} // namespace viterbi -} // namespace link-grammar - -#endif // _LG_VITERBI_COMPRESS_H diff -Nru link-grammar-5.3.16/viterbi/connect.cc link-grammar-5.5.0/viterbi/connect.cc --- link-grammar-5.3.16/viterbi/connect.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/connect.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,376 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include - -#include -#include -#include - -#include "utilities.h" // From base link-grammar - -#include "atom.h" -#include "connect.h" -#include "connector-utils.h" - -using namespace std; - -// #define DBG(X) X; -#define DBG(X) - -namespace link_grammar { -namespace viterbi { - -/** - * This class is vaguely monad-like. - */ -Connect::Connect(WordCset* lcs, WordCset* rcs) - : _left_cset(lcs), _right_cset(rcs) -{} - -// ============================================================= - -/// Try to connect the left and right disjuncts. -/// -/// If the connection attempt is successful, then return a -/// StateTriple given the emitted output, and the resulting state. 
-/// -/// The implementation below is just a dispatcher for each of the -/// alternative handlers, depending on whether the arguments are -/// single or multi-connectors. -StateTriple* Connect::try_alternative(Atom* ldj, Atom* rdj) -{ - Connector* lcon = dynamic_cast(ldj); - Connector* rcon = dynamic_cast(rdj); - - // Left disjunct is a single connector - if (lcon) - { - // Right disjunct is a single connector - if (rcon) - return alternative(lcon, rcon); - else - { - // Right disunct better be a multi-connector - And* rand = upcast(rdj); - assert(rand, "Right dj not a disjunct"); - return alternative(lcon, rand); - } - } - else - { - // Left disjunct better be a multi-connector. - And* land = upcast(ldj); - assert(land, "Left dj not a disjunct"); - - // Right disjunct is a single connector - if (rcon) - return alternative(land, rcon); - else - { - // Right disunct better be a multi-connector - And* rand = upcast(rdj); - assert(rand, "Right dj not a disjunct"); - - return alternative(land, rand); - } - } - - return NULL; // Not reached. -} - -// ============================================================= -/// Try connecting the left and right disjuncts. -/// -/// If a connection was made, return the resulting state pair. -/// If no connection is possible, return NULL. -/// -/// The state pair will contain the output generated (if any) and the -/// final state (if any) after the connection is made. -/// -/// There are four distinct methods below, depending on whether -/// each disjunct is a single or a multi connector. Multi-connectors -/// are just a list of conjoind (AND) single-connectors. A multi- -/// connector is also called a "disjunct" because it is one of the -/// terms in a connector set that has been expanded into dsjunctive -/// normal form. Viz. a single disjunct is a conjoined set of -/// connectors. 
-// -StateTriple* Connect::alternative(Connector* lcon, Connector* rcon) -{ - Ling* conn = conn_connect_nn(lcon, rcon); - if (!conn) - { - // If we are here, then no connection was possible. It may - // be the case that rcon was right-pointing, in which case, - // it can be added to the state. - char dir = rcon->get_direction(); - if ('-' == dir) - return NULL; - - Word* lword = _left_cset->get_word(); - Word* rword = _right_cset->get_word(); - WordCset* lcset = new WordCset(lword, lcon); - WordCset* rcset = new WordCset(rword, rcon); - Seq* state = new Seq(rcset, lcset); - StateTriple* sp = new StateTriple(new Seq(), state, new Set()); - DBG(cout<<"------ Empty-output alternative created:\n" << sp << endl;); - return sp; - } - - // At this point, conn holds an LG link type, and the - // two disjuncts that were mated. Re-assemble these - // into a pair of word_disjuncts (i.e. stick the word - // back in there, as that is what later stages need). - Set* out = new Set(conn); - - // Meanwhile, we exhausted the state, so that's empty. - StateTriple* sp = new StateTriple(new Seq(), new Seq(), out); - DBG(cout<<"----- single-connector alternative created:\n" << sp << endl;); - return sp; -} - -// See docs above -StateTriple* Connect::alternative(Connector* lcon, And* rand) -{ - if (0 == rand->get_arity()) - return NULL; - - Atom* rfirst = rand->get_outgoing_atom(0); - Connector* rfc = dynamic_cast(rfirst); - assert(rfc, "Exepcting a connector in the right disjunct"); - - Ling* conn = conn_connect_nn(lcon, rfc); -// XXX fixme ;;; if all left-pointers are opt, then OK to create state ... - if (!conn) - return NULL; - - // At this point, conn holds an LG link type, and the - // two disjuncts that were mated. Re-assemble these - // into a pair of word_disjuncts (i.e. stick the word - // back in there, as that is what later stages need). - Set* out = new Set(conn); - - // The state is now everything else left in the disjunct. - // We need to build this back up into WordCset. 
- OutList remaining_cons = rand->get_outgoing_set(); - remaining_cons.erase(remaining_cons.begin()); - And* remaining_cj = new And(remaining_cons); - Atom* rema = remaining_cj->super_flatten(); - WordCset* rem_cset = new WordCset(_right_cset->get_word(), rema); - -// XXX TODO probably need to flatten haere and make sure that -// the new state isn't empty. -Link* rl = dynamic_cast(rema); -if (rl and rl->get_arity() == 0) -assert(0, "Need to handle this empty state case like all the others"); - - StateTriple* sp = new StateTriple(new Seq(), new Seq(rem_cset), out); - DBG(cout<<"----- right multi-conn alternative created:\n" << sp << endl;); - return sp; -} - -// See docs above -StateTriple* Connect::alternative(And* land, Connector* rcon) -{ - Atom* lfirst = land->get_outgoing_atom(0); - Connector* lfc = dynamic_cast(lfirst); - assert(lfc, "Exepcting a connector in the left disjunct"); - - Ling* conn = conn_connect_nn(lfc, rcon); - if (!conn) - return NULL; - - // At this point, conn holds an LG link type, and the - // two disjuncts that were mated. Re-assemble these - // into a pair of word_disjuncts (i.e. stick the word - // back in there, as that is what later stages need). - Set* out = new Set(conn); - - // The state is now everything left in the disjunct. - // We need to build this back up into WordCset. - OutList remaining_cons = land->get_outgoing_set(); - remaining_cons.erase(remaining_cons.begin()); - And* remaining_cj = new And(remaining_cons); - WordCset* rem_cset = new WordCset(_left_cset->get_word(), remaining_cj); - - // The remaining cset could be empty (e.g. an AND link with - // nothing left in it.) 
- rem_cset = rem_cset->flatten(); - StateTriple* sp; - if (NULL != rem_cset) - sp = new StateTriple(new Seq(), new Seq(rem_cset), out); - else - sp = new StateTriple(new Seq(), new Seq(), out); - - DBG(cout << "=================> state triple created: " << sp << endl); - return sp; -} - -// See docs above -StateTriple* Connect::alternative(And* land, And* rand) -{ -// cout<<"duude land="<get_outgoing_atom(m); - Connector* lfc = dynamic_cast(lfirst); - assert(lfc, "Exepecting a connector in the left disjunct"); - - - Ling* conn = conn_connect_nn(lfc, rfc); - if (!conn) - break; - m++; - - // At this point, conn holds an LG link type, and the - // two disjuncts that were mated. Re-assemble these - // into a pair of word_disjuncts (i.e. stick the word - // back in there, as that is what later stages need). - outputs.push_back(conn); - } - if (0 == m) - return NULL; - - // Add the un-connected parts of the left and right csets - // to the state. But first, check to make sure that the - // right cset does not have any (non-optional) - // left-pointers, because these will never be fulfilled. - // Lets start with the right cset. - // We need to build this back up into WordCset. - OutList remaining_cons = rand->get_outgoing_set(); - for (size_t k = 0; kget_word(), remaining_cj); - rem_cset = cset_trim_left_pointers(rem_cset); - if (NULL == rem_cset) - return NULL; - - // If we are here, the remaining right connectors all - // point right. Put them into the state. - OutList statel; - statel.push_back(rem_cset); - - // And now repeat for the left cset. 
- remaining_cons = land->get_outgoing_set(); - for (size_t k = 0; kget_word(), remaining_cj); - statel.push_back(rem_cset); - } - - Seq* state = new Seq(statel); - - Set* out = new Set(outputs); - - StateTriple* sp = new StateTriple(new Seq(), state, out); - DBG(cout << "============> multi state pair created: " << sp << endl); - return sp; -} - -// ============================================================= - -// At this point, conn holds an LG link type, and the -// two disjuncts that were mated. Re-assemble these -// into a pair of word_disjuncts (i.e. stick the word -// back in there, as that is what later stages need). -// -// The left_cset and right_cset are assumed to be the word-connector -// sets that matched. These are needed, only to extract the words; -// the rest is dicarded. -Ling* Connect::reassemble(Ling* conn, WordCset* left_cset, WordCset* right_cset) -{ - assert(conn, "Bad cast to Ling"); - - OutList lwdj; - lwdj.push_back(left_cset->get_word()); // the word - lwdj.push_back(conn->get_left()); // the connector - Link *lwordj = new Link(WORD_DISJ, lwdj); - - OutList rwdj; - rwdj.push_back(right_cset->get_word()); // the word - rwdj.push_back(conn->get_right()); // the connector - Link *rwordj = new Link(WORD_DISJ, rwdj); - - Ling *lg_link = new Ling(conn->get_ling_type(), lwordj, rwordj); - - return lg_link; -} - -// ============================================================= -/** - * Try to connect the left and right connectors. If they do connect, - * then return an LG_LING structure linking them. 
- */ -Ling* Connect::conn_connect_nn(Connector* lnode, Connector* rnode) -{ -// cout<<"try match connectors l="<get_name()<<" to r="<< rnode->get_name() << endl; - if (lnode->is_optional()) return NULL; - if (rnode->is_optional()) return NULL; - if (!conn_match(lnode->get_name(), rnode->get_name())) - return NULL; - -// cout << "Yayyyyae connectors match!"<get_name(), rnode->get_name()); - Ling* ling = new Ling(link_name, lnode, rnode); - - ling = reassemble(ling, _left_cset, _right_cset); - return ling; -} - -// ============================================================= - -// Collapse singleton sets, if any. This is the price we pay -// for otherwise being able to ignore the difference between -// singleton sets, and their elements. -const OutList& Connect::flatten(OutList& alternatives) -{ - size_t asize = alternatives.size(); - - // If its a singleton, and its already a set ... - if (1 == asize) - { - Set* set = dynamic_cast(alternatives[0]); - if (set) - return set->get_outgoing_set(); - } - - for (size_t i = 0; i < asize; i++) - { - Set* set = dynamic_cast(alternatives[i]); - if (set and (1 == set->get_arity())) - { - alternatives[i] = set->get_outgoing_atom(0); - } - } - return alternatives; -} - - -} // namespace viterbi -} // namespace link-grammar diff -Nru link-grammar-5.3.16/viterbi/connect.h link-grammar-5.5.0/viterbi/connect.h --- link-grammar-5.3.16/viterbi/connect.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/connect.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,49 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _LG_VITERBI_CONNECT_H -#define _LG_VITERBI_CONNECT_H - -#include "atom.h" -#include "compile.h" -#include "garbage.h" - -namespace link_grammar { -namespace viterbi { - -class Connect : public gc -{ - public: - Connect(WordCset*, WordCset*); - StateTriple* try_alternative(Atom*, Atom*); - - protected: - - StateTriple* alternative(Connector*, Connector*); - StateTriple* alternative(Connector*, And*); - StateTriple* alternative(And*, Connector*); - StateTriple* alternative(And*, And*); - - Ling* conn_connect_nn(Connector*, Connector*); - Ling* reassemble(Ling*, WordCset*, WordCset*); - - static const OutList& flatten(OutList&); - - private: - WordCset* _left_cset; - WordCset* _right_cset; -}; - - -} // namespace viterbi -} // namespace link-grammar - -#endif // _LG_VITERBI_CONNECT_H diff -Nru link-grammar-5.3.16/viterbi/connector-utils.cc link-grammar-5.5.0/viterbi/connector-utils.cc --- link-grammar-5.3.16/viterbi/connector-utils.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/connector-utils.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,248 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include - -#include -#include - -#include "utilities.h" - -#include "compile.h" -#include "connector-utils.h" - -namespace link_grammar { -namespace viterbi { - -/** - * Compare two connector strings, see if they mate. - * Return true if they do, else return false. 
- * All upper-case letters must match exactly. - * Lower case letters must match exactly, or must match wildcard '*'. - * All strings are implicitly padded with an infinite number of - * wild-cards on the right; thus, only need to compare against the - * shorter of the * two strings. - */ -bool conn_match(const NameString& ls, const NameString& rs) -{ - char ldir = *ls.rbegin(); - char rdir = *rs.rbegin(); - assert (ldir == '+' or ldir == '-', "Bad word direction (l)"); - assert (rdir == '+' or rdir == '-', "Bad word direction (r)"); - - // Direction signs must couple. - if ('+' == ldir and '-' != rdir) return false; - if ('-' == ldir and '+' != rdir) return false; - - // Captial letters must match. Wildcards match anything lower-case. - NameString::const_iterator lp = ls.begin(); - NameString::const_iterator rp = rs.begin(); - size_t lslen = ls.size(); - size_t rslen = rs.size(); - size_t minlen = std::min(lslen, rslen); - size_t len = minlen - 1; // -1 for direction - while (0 < len) - { - if (*lp != *rp) - { - // All upper-case letters must match exactly! - if (isupper(*lp) or isupper(*rp)) return false; - // Wild-card matches anything. - if ('*' != *lp and '*' != *rp) return false; - } - lp++; - rp++; - len--; - } - // If the longer string is sill upper-case .. ouch - if ((minlen < lslen) and isupper(*lp)) return false; - if ((minlen < rslen) and isupper(*rp)) return false; - - return true; -} - -/** - * Merge two connector strings to create the linkage string. - * It is assumed that the two connectors mate; no error checking is - * done to detect if they don't. 
- * Example: W*n+ merged with Wi*dy- gives Windy - */ -NameString conn_merge(const NameString& ls, const NameString& rs) -{ - NameString::const_iterator lp = ls.begin(); - NameString::const_iterator rp = rs.begin(); - size_t len = -1 + std::max(ls.size(), rs.size()); - NameString merger; - merger.reserve(len); - while (0 < len) - { - if (lp == ls.end()) - merger.push_back(*rp); - else if (rp == rs.end()) - merger.push_back(*lp); - else if ('*' != *lp and '+' != *lp and '-' != *lp) - merger.push_back(*lp); - else if ('+' != *rp and '-' != *rp) - merger.push_back(*rp); - else - merger.push_back('*'); - - if (lp != ls.end()) lp++; - if (rp != rs.end()) rp++; - len--; - } - return merger; -} - -// ============================================================= - -/// Return true if the indicated atom is an optional clause. -/// else return false. -bool is_optional(Atom *a) -{ - AtomType ty = a->get_type(); - if (CONNECTOR == ty) - { - Connector* n = dynamic_cast(a); - if (n->is_optional()) - return true; - return false; - } - assert (OR == ty or AND == ty, "Must be boolean junction"); - - foreach_outgoing(Atom*, atom, dynamic_cast(a)) - { - bool c = is_optional(atom); - if (OR == ty) - { - // If anything in OR is optional, the whole clause is optional. - if (c) return true; - } - else - { - // ty is AND - // If anything in AND is isn't optional, then something is required - if (!c) return false; - } - } - - // All disj were required. - if (OR == ty) return false; - - // All conj were optional. - return true; -} - -// =================================================================== - -// Utility for below. See description given there. 
-static Atom* trim_left_pointers(Atom* a) -{ - Connector* ct = dynamic_cast(a); - if (ct) - { - if (ct->is_optional()) - return a; - char dir = ct->get_direction(); - if ('-' == dir) return NULL; - if ('+' == dir) return a; - assert(0, "Bad word direction (t)"); - } - - AtomType ty = a->get_type(); - assert (OR == ty or AND == ty, "Must be boolean junction"); - - // Note: With the new DNF style of processing esewhere in the code, - // it will never be the case that the block of code will be hit. - // None-the-less, the code below works great for arbitrarily-nested - // connector sets (i.e. sets that are not in any normal form) and - // so its left here in case you need it. - if (OR == ty) - { - OutList new_ol; - foreach_outgoing(Atom*, ota, dynamic_cast(a)) - { - Atom* new_ota = trim_left_pointers(ota); - if (new_ota) - new_ol.push_back(new_ota); - } - if (0 == new_ol.size()) - return NULL; - - // The result of trimming may be multiple empty nodes. - // Remove all but one of them. - bool got_opt = false; - size_t nsz = new_ol.size(); - for (size_t i = 0; i < nsz; i++) - { - Connector* c = dynamic_cast(new_ol[i]); - if (c and c->is_optional()) - { - if (!got_opt) - got_opt = true; - else - new_ol.erase(new_ol.begin() + i); - } - } - - if (1 == new_ol.size()) - { - // If the entire OR-list was pruned down to one connector, - // and that connector is the empty connector, then it - // "connects to nothing" on the left, and should be removed. - Connector* c = dynamic_cast(new_ol[0]); - if (c and c->is_optional()) - return NULL; - return new_ol[0]; - } - - return new Link(OR, new_ol); - } - - // If we are here, then it an andlist, and all conectors are - // mandatory, unless they are optional. So fail, if the - // connectors that were trimmed were not optional. 
- OutList new_ol; - foreach_outgoing(Atom*, ota, dynamic_cast(a)) - { - Atom* new_ota = trim_left_pointers(ota); - if (new_ota) - new_ol.push_back(new_ota); - else - if (!is_optional(ota)) - return NULL; - } - - if (0 == new_ol.size()) - return new And(); - - if (1 == new_ol.size()) - return new_ol[0]; - - return new And(new_ol); -} - -/// Trim away all optional left pointers (connectors with - direction) -/// If there are any non-optional left-pointers, then return NULL. -/// -/// If all of the connectors were optional left-pointers, then they -/// are all trimmed away, and a single, empty AND is returned. -WordCset* cset_trim_left_pointers(WordCset* wrd_cset) -{ - Atom* trimmed = trim_left_pointers(wrd_cset->get_cset()); - if (!trimmed) - return NULL; - return new WordCset(wrd_cset->get_word(), trimmed); -} - -} // namespace viterbi -} // namespace link-grammar - diff -Nru link-grammar-5.3.16/viterbi/connector-utils.h link-grammar-5.5.0/viterbi/connector-utils.h --- link-grammar-5.3.16/viterbi/connector-utils.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/connector-utils.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,30 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _LG_CONNECTOR_UTILS_H -#define _LG_CONNECTOR_UTILS_H - -#include "atom.h" -#include "compile.h" - -namespace link_grammar { -namespace viterbi { - -bool conn_match(const NameString&, const NameString&); -NameString conn_merge(const NameString&, const NameString&); -bool is_optional(Atom *); - -WordCset* cset_trim_left_pointers(WordCset*); - -} // namespace viterbi -} // namespace link-grammar - -#endif // _LG_CONNECTOR_UTILS_H diff -Nru link-grammar-5.3.16/viterbi/disjoin.cc link-grammar-5.5.0/viterbi/disjoin.cc --- link-grammar-5.3.16/viterbi/disjoin.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/disjoin.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,198 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include "compile.h" -#include "disjoin.h" - -namespace link_grammar { -namespace viterbi { - -/** - * Order connectors so that all left-pointers appear before all right- - * pointers. This is required, as the connect algo always tries to make - * connections sequentially, so of right-going connectors appear before - * the left-goers, the left-goers fail to connect. - * - * The input to this is presumed to be in DNF. - */ -static Atom* normal_order(Atom* dnf) -{ - - // Simply recurse on down. 
- Or* ora = dynamic_cast(dnf); - if (ora) - { - OutList norm; - foreach_outgoing(Atom*, a, ora) - { - norm.push_back(normal_order(a)); - } - return new Or(norm, ora->_tv); - } - - And* andy = dynamic_cast(dnf); - if (andy) - { - OutList norm; - foreach_outgoing(Connector*, c, andy) - { - assert(c, "normal_order: expecting a connector in the disjunct"); - if ('-' == c->get_direction()) - norm.push_back(normal_order(c)); - } - foreach_outgoing(Connector*, cc, andy) - { - assert(cc, "normal_order: expecting a connector in the disjunct"); - if ('+' == cc->get_direction()) - norm.push_back(normal_order(cc)); - - // Optional connectors are not allowed to appear in disjuncts! - assert(not cc->is_optional(), "Optional connector in disjunct"); - } - return new And(norm, andy->_tv); - } - - // no-op - return dnf; -} - - - -/** - * Convert mixed connector expressions into disjunctive normal form. - * The final form will consist of disjunction of conjunctions of - * connectors. - * - * Note that connector expressions resemble Boolean logic terms, but - * they are not actually Boolean logic; each connector can be used - * once, and only once. Thus, connector expressions are OR-distributive, - * but not AND-distributive. Thus, (A & (B or C)) = ((A & B) or (A & C)) - * but it is not at all the case that (A or (B & C)) is the same as - * ((A or B) & (A or C)) because connectors cannot be duplicated! - * That is, the logic of link-grammar connectors is that of linear - * logic (monoidal categories) not boolean logic (cartesian categories). - * - * The primary user of this function is the parser, to convert the - * mixed-form dictionary entries into a simpler structure, thus - * simplifying the parser algorithm. - * - * XXX This code somewhat duplicates the function offered by the - * disjoin() methods on the OR and AND nodes. BTW, those functions - * are unit-tested; this one is not. However, this one handles - * optionals, the other does not. 
- */ - -Atom* disjoin(Atom* mixed_form) -{ - AtomType intype = mixed_form->get_type(); - if ((OR != intype) and (AND != intype)) - return mixed_form; - - if (OR == intype) - { - Or* junct = dynamic_cast(mixed_form); - assert(junct, "disjoin: given a naked OR link!"); - - junct = junct->flatten(); - - // Just a recursive call, that's all. - OutList new_oset; - foreach_outgoing(Atom*, norm, junct) - new_oset.push_back(disjoin(norm)); - - Or* new_or = new Or(new_oset, junct->_tv); - return normal_order(new_or->super_flatten()); - } - - And* junct = dynamic_cast(mixed_form); - assert(junct, "disjoin: mixed form is not AND link!"); - - junct = junct->flatten(); - Atom* ajunct = junct->clean(); - - // After cleaning, it might be just a single optional clause. - // e.g. after (A+ or [[()]]) & B+; - junct = dynamic_cast(ajunct); - if (not junct) - return ajunct; - - // If we are here, the outgoing set is a conjunction of atoms. - // Search for the first disjunction in that set, and distribute - // over it. - OutList front; - size_t sz = junct->get_arity(); - size_t i; - for (i=0; iget_outgoing_atom(i)); - AtomType t = a->get_type(); - if (OR == t) - break; - front.push_back(a); - } - - /* If no disjunctions found, we are done */ - if (i == sz) - return normal_order(junct); - - Atom *orat = junct->get_outgoing_atom(i); - i++; - - OutList rest; - for(; iget_outgoing_atom(i)); - rest.push_back(norm); - } - - Or* orn = dynamic_cast(orat); - assert(orn, "Bad link type found during disjoin"); - - // Distribute over the elements in OR-list - OutList new_oset; - sz = orn->get_arity(); - for (i=0; iclone()); - - // insert one atom. - distrib.push_back(orn->get_outgoing_atom(i)->clone()); - - // Copy the rest. Again with the clone(). Irritating. 
- jsz = rest.size(); - for (size_t j=0; jclone()); - - And *andy = new And(distrib); - new_oset.push_back(andy->clean()); - } - - Or* new_or = new Or(new_oset, orn->_tv); - Atom* new_a = new_or->super_flatten(); - new_a = disjoin(new_a); - - Set* newset = dynamic_cast(new_a); - if (newset) - return normal_order(newset->super_flatten()); - return normal_order(new_a); -} - - -} // namespace viterbi -} // namespace link-grammar - diff -Nru link-grammar-5.3.16/viterbi/disjoin.h link-grammar-5.5.0/viterbi/disjoin.h --- link-grammar-5.3.16/viterbi/disjoin.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/disjoin.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,32 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _LG_VITERBI_DISJOIN_H -#define _LG_VITERBI_DISJOIN_H - -#include "atom.h" - -namespace link_grammar { -namespace viterbi { - -/** - * Convert dictionary-normal form into disjunctive normal form. - * That is, convert the mixed-form dictionary entries into a disjunction - * of a list of conjoined connectors. The goal of this conversion is to - * simplify the parsing algorithm. 
- */ -Atom* disjoin(Atom* mixed_form); - - -} // namespace viterbi -} // namespace link-grammar - -#endif /* _LG_VITERBI_DISJOIN_H */ diff -Nru link-grammar-5.3.16/viterbi/environment.cc link-grammar-5.5.0/viterbi/environment.cc --- link-grammar-5.3.16/viterbi/environment.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/environment.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,145 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include -#include -#include "environment.h" -#include "utilities.h" - -namespace atombase { - -Environment* Environment::top() -{ - static Environment* global_env = new Environment(); - return global_env; -} - -Environment::Environment() -{ -} - -/// Insert an atom into the environment. -/// The environment keeps a pointer to the atom, so that it won't -/// get garbage collected. -void Environment::insert_atom(Atom* a) -{ - std::lock_guard lck(_mtx); - _atoms.insert(a); -} - -/// Remove an atom from the environment. -/// If there are no other refrences to the atom, it will be -/// garbage-collected. -void Environment::remove_atom(Atom* a) -{ - std::lock_guard lck(_mtx); - _atoms.erase(a); -} - -/// Create a relation link, and put it into the environment, all in one go. 
-Relation* Environment::add_relation(const char* name, Atom* arg, Atom* val) -{ - Relation *rel = new Relation(name, arg, val); - insert_atom(rel); - return rel; -} - -/// Return a set of all of the relations with the given name, and given -/// argument (value from its domain). -Set* Environment::get_relations(const char* name, Atom* arg) -{ - // XXX this should use a pre-computed index, instead of searching ... - // XXX FIXME - OutList oset; - Label* lab = new Label(name); - std::set::iterator it = _atoms.begin(); - std::set::iterator end = _atoms.end(); - for (; it != end; ++it) - { - Atom* a = *it; - if (a->get_type() != RELATION) continue; - Link* l = dynamic_cast(a); - if (l->get_arity() != 3) continue; - if (not l->get_outgoing_atom(1)->operator==(arg)) continue; - if (not l->get_outgoing_atom(0)->operator==(lab)) continue; - oset.push_back(l); - } - - return new Set(oset); -} - -/// Return a set of of the "outputs" (codomain) of a relation, given it's -/// name and input argument (value from its domain). -Set* Environment::get_relation_vals(const char* name, Atom* arg) -{ - Set* relset = get_relations(name, arg); - OutList oset; - foreach_outgoing(Relation*, rel, relset) - { - oset.push_back(rel->get_outgoing_atom(2)); - } - - return new Set(oset); -} - -/// Create a relation link, and put it into the environment, all in one go. -/// If there alerady is a relation with the same name and input value, -/// it is replaced by the new output value. In other words, this -/// defines a 'function' in the mathematical sense: given a function -/// name and a value from its domain, it returns a single, unique value -/// from its codomain. 
-/// -Relation* Environment::set_function(const char* name, Atom* arg, Atom* val) -{ - Relation* old = get_function(name, arg); - if (old) - remove_atom(old); - - Relation *func = new Relation(name, arg, val); - insert_atom(func); - return func; -} - -Relation* Environment::get_function(const char* name, Atom* arg) -{ - Set* relset = get_relations(name, arg); - size_t arity = relset->get_arity(); - assert(arity < 2, "Function has bad arity!"); - if (0 == arity) return NULL; - - return dynamic_cast(relset->get_outgoing_atom(0)); -} - -// Get the value of function 'name' at argument 'arg' -Atom* Environment::get_function_value(const char* name, Atom* arg) -{ - Relation* val = get_function(name, arg); - if (val) return val->get_outgoing_atom(2); - return NULL; -} - -// Set the value of numeric function 'name' at argument 'arg' -Relation* Environment::set_number(const char* name, Atom* arg, double v) -{ - return set_function(name, arg, new Number(v)); -} - -// Get the value of numeric function 'name' at argument 'arg' -double Environment::get_number(const char* name, Atom* arg) -{ - Atom* a = get_function_value(name, arg); - if (!a) return nan(""); - Number* n = upcast(a); - if (!n) return nan(""); - return n->get_value(); -} - -} // namespace atombase diff -Nru link-grammar-5.3.16/viterbi/environment.h link-grammar-5.5.0/viterbi/environment.h --- link-grammar-5.3.16/viterbi/environment.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/environment.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,50 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _ATOMBASE_ENVIRONMENT_H -#define _ATOMBASE_ENVIRONMENT_H - -#include "compile-base.h" - -namespace atombase { - -/// Kind-of-like the opencog AtomSpace ... but smaller, simpler. -class Environment : public gc -{ - public: - Environment(); - - void insert_atom(Atom*); - void remove_atom(Atom*); - - Relation* add_relation(const char*, Atom*, Atom*); - Set* get_relations(const char*, Atom*); - Set* get_relation_vals(const char*, Atom*); - - Relation* set_function(const char*, Atom*, Atom*); - Relation* get_function(const char*, Atom*); - Atom* get_function_value(const char*, Atom*); - - Relation* set_number(const char*, Atom*, double); - double get_number(const char*, Atom*); - - static Environment* top(); - protected: - // Distinct mutex per envirnoment; this should avoid contention. - std::mutex _mtx; - - // Set of all atoms in the environment - std::set, gc_allocator > _atoms; -}; - -} // namespace atombase - -#endif // _ATOMBASE_ENVIRONMENT_H diff -Nru link-grammar-5.3.16/viterbi/garbage.cc link-grammar-5.5.0/viterbi/garbage.cc --- link-grammar-5.3.16/viterbi/garbage.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/garbage.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,41 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#include -#include "garbage.h" - -namespace atombase { - -bool gc_is_inited = do_init_gc(); - -bool do_init_gc() -{ - static bool is_inited = false; - if (is_inited) // not thread safe. - return is_inited; - is_inited = true; - - GC_init(); - - /* Max heap size of a quarter-gig. */ - GC_set_max_heap_size(256*1024*1024); - - return is_inited; -} - -// Overkill ... and even so, its not called unless someone explicitly -// touches gc_is_inited or calls do_init_gc() !???? WTF? -static __attribute__ ((constructor)) void gc_init(void) -{ - do_init_gc(); -} - -} // namespace atombase diff -Nru link-grammar-5.3.16/viterbi/garbage.h link-grammar-5.5.0/viterbi/garbage.h --- link-grammar-5.3.16/viterbi/garbage.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/garbage.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,21 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _ATOMBASE_GARBAGE_H -#define _ATOMBASE_GARBAGE_H - -namespace atombase { - -bool do_init_gc(); - -} // namespace atombase - -#endif // _ATOMBASE_GARBAGE_H diff -Nru link-grammar-5.3.16/viterbi/Makefile.am link-grammar-5.5.0/viterbi/Makefile.am --- link-grammar-5.3.16/viterbi/Makefile.am 2016-09-13 18:44:33.000000000 +0000 +++ link-grammar-5.5.0/viterbi/Makefile.am 1970-01-01 00:00:00.000000000 +0000 @@ -1,66 +0,0 @@ -# -# Makefile.am -# -# Master makefile for the viterbi decoder. 
-# - -# Skip this entire directory, if not configured -if WITH_VITERBI - -AM_CPPFLAGS = -I.. -I$(top_srcdir) -I$(top_srcdir)/link-grammar \ - $(WARN_CXXFLAGS) - -lib_LTLIBRARIES = libvitacog.la - -libvitacog_la_SOURCES = \ - atom.cc \ - atom-types.cc \ - compile-base.cc \ - compile.cc \ - compress.cc \ - connect.cc \ - connector-utils.cc \ - disjoin.cc \ - environment.cc \ - garbage.cc \ - parser.cc \ - rewrite.cc \ - upcast.cc \ - word-monad.cc \ - atom.h \ - atom-types.h \ - compile-base.h \ - compile.h \ - compress.h \ - connect.h \ - connector-utils.h \ - disjoin.h \ - environment.h \ - garbage.h \ - parser.h \ - rewrite.h \ - viterbi.h \ - word-monad.h - -libvitacog_la_LIBADD = $(top_builddir)/link-grammar/liblink-grammar.la -libvitacog_la_LIBADD += $(LIBGC_LIBS) - -# Unit test, to make sure the parser is working correctly. -TESTS = test-env test-disjoin test-parser test-cost - -check_PROGRAMS = test-env test-disjoin test-parser test-cost - -test_disjoin_SOURCES = test-disjoin.cc -test_parser_SOURCES = test-parser.cc -test_cost_SOURCES = test-cost.cc -test_env_SOURCES = test-env.cc - -LDADD = libvitacog.la -LDADD += $(top_builddir)/link-grammar/liblink-grammar.la -LDADD += $(LIBGC_LIBS) - -EXTRA_DIST = \ - README \ - README.atombase - -endif diff -Nru link-grammar-5.3.16/viterbi/Makefile.in link-grammar-5.5.0/viterbi/Makefile.in --- link-grammar-5.3.16/viterbi/Makefile.in 2017-04-15 21:49:50.000000000 +0000 +++ link-grammar-5.5.0/viterbi/Makefile.in 1970-01-01 00:00:00.000000000 +0000 @@ -1,1238 +0,0 @@ -# Makefile.in generated by automake 1.15 from Makefile.am. -# @configure_input@ - -# Copyright (C) 1994-2014 Free Software Foundation, Inc. - -# This Makefile.in is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. 
- -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. - -@SET_MAKE@ - -# -# Makefile.am -# -# Master makefile for the viterbi decoder. -# - -VPATH = @srcdir@ -am__is_gnu_make = { \ - if test -z '$(MAKELEVEL)'; then \ - false; \ - elif test -n '$(MAKE_HOST)'; then \ - true; \ - elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ - true; \ - else \ - false; \ - fi; \ -} -am__make_running_with_option = \ - case $${target_option-} in \ - ?) ;; \ - *) echo "am__make_running_with_option: internal error: invalid" \ - "target option '$${target_option-}' specified" >&2; \ - exit 1;; \ - esac; \ - has_opt=no; \ - sane_makeflags=$$MAKEFLAGS; \ - if $(am__is_gnu_make); then \ - sane_makeflags=$$MFLAGS; \ - else \ - case $$MAKEFLAGS in \ - *\\[\ \ ]*) \ - bs=\\; \ - sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ - | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ - esac; \ - fi; \ - skip_next=no; \ - strip_trailopt () \ - { \ - flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ - }; \ - for flg in $$sane_makeflags; do \ - test $$skip_next = yes && { skip_next=no; continue; }; \ - case $$flg in \ - *=*|--*) continue;; \ - -*I) strip_trailopt 'I'; skip_next=yes;; \ - -*I?*) strip_trailopt 'I';; \ - -*O) strip_trailopt 'O'; skip_next=yes;; \ - -*O?*) strip_trailopt 'O';; \ - -*l) strip_trailopt 'l'; skip_next=yes;; \ - -*l?*) strip_trailopt 'l';; \ - -[dEDm]) skip_next=yes;; \ - -[JT]) skip_next=yes;; \ - esac; \ - case $$flg in \ - *$$target_option*) has_opt=yes; break;; \ - esac; \ - done; \ - test $$has_opt = yes -am__make_dryrun = (target_option=n; $(am__make_running_with_option)) -am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) -pkgdatadir = $(datadir)/@PACKAGE@ -pkgincludedir = $(includedir)/@PACKAGE@ -pkglibdir = $(libdir)/@PACKAGE@ -pkglibexecdir = $(libexecdir)/@PACKAGE@ -am__cd = 
CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd -install_sh_DATA = $(install_sh) -c -m 644 -install_sh_PROGRAM = $(install_sh) -c -install_sh_SCRIPT = $(install_sh) -c -INSTALL_HEADER = $(INSTALL_DATA) -transform = $(program_transform_name) -NORMAL_INSTALL = : -PRE_INSTALL = : -POST_INSTALL = : -NORMAL_UNINSTALL = : -PRE_UNINSTALL = : -POST_UNINSTALL = : -build_triplet = @build@ -host_triplet = @host@ -@WITH_VITERBI_TRUE@TESTS = test-env$(EXEEXT) test-disjoin$(EXEEXT) \ -@WITH_VITERBI_TRUE@ test-parser$(EXEEXT) test-cost$(EXEEXT) -@WITH_VITERBI_TRUE@check_PROGRAMS = test-env$(EXEEXT) \ -@WITH_VITERBI_TRUE@ test-disjoin$(EXEEXT) test-parser$(EXEEXT) \ -@WITH_VITERBI_TRUE@ test-cost$(EXEEXT) -subdir = viterbi -ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/libtool.m4 \ - $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \ - $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \ - $(top_srcdir)/m4/varcheckpoint.m4 $(top_srcdir)/configure.ac -am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ - $(ACLOCAL_M4) -DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) -mkinstalldirs = $(install_sh) -d -CONFIG_CLEAN_FILES = -CONFIG_CLEAN_VPATH_FILES = -am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; -am__vpath_adj = case $$p in \ - $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ - *) f=$$p;; \ - esac; -am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; -am__install_max = 40 -am__nobase_strip_setup = \ - srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` -am__nobase_strip = \ - for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" -am__nobase_list = $(am__nobase_strip_setup); \ - for p in $$list; do echo "$$p $$p"; done | \ - sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ - $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ - if (++n[$$2] == $(am__install_max)) \ - { print $$2, files[$$2]; n[$$2] = 0; 
files[$$2] = "" } } \ - END { for (dir in files) print dir, files[dir] }' -am__base_list = \ - sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ - sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' -am__uninstall_files_from_dir = { \ - test -z "$$files" \ - || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ - || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ - $(am__cd) "$$dir" && rm -f $$files; }; \ - } -am__installdirs = "$(DESTDIR)$(libdir)" -LTLIBRARIES = $(lib_LTLIBRARIES) -am__DEPENDENCIES_1 = -@WITH_VITERBI_TRUE@libvitacog_la_DEPENDENCIES = $(top_builddir)/link-grammar/liblink-grammar.la \ -@WITH_VITERBI_TRUE@ $(am__DEPENDENCIES_1) -am__libvitacog_la_SOURCES_DIST = atom.cc atom-types.cc compile-base.cc \ - compile.cc compress.cc connect.cc connector-utils.cc \ - disjoin.cc environment.cc garbage.cc parser.cc rewrite.cc \ - upcast.cc word-monad.cc atom.h atom-types.h compile-base.h \ - compile.h compress.h connect.h connector-utils.h disjoin.h \ - environment.h garbage.h parser.h rewrite.h viterbi.h \ - word-monad.h -@WITH_VITERBI_TRUE@am_libvitacog_la_OBJECTS = atom.lo atom-types.lo \ -@WITH_VITERBI_TRUE@ compile-base.lo compile.lo compress.lo \ -@WITH_VITERBI_TRUE@ connect.lo connector-utils.lo disjoin.lo \ -@WITH_VITERBI_TRUE@ environment.lo garbage.lo parser.lo \ -@WITH_VITERBI_TRUE@ rewrite.lo upcast.lo word-monad.lo -libvitacog_la_OBJECTS = $(am_libvitacog_la_OBJECTS) -AM_V_lt = $(am__v_lt_@AM_V@) -am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) -am__v_lt_0 = --silent -am__v_lt_1 = -@WITH_VITERBI_TRUE@am_libvitacog_la_rpath = -rpath $(libdir) -am__test_cost_SOURCES_DIST = test-cost.cc -@WITH_VITERBI_TRUE@am_test_cost_OBJECTS = test-cost.$(OBJEXT) -test_cost_OBJECTS = $(am_test_cost_OBJECTS) -test_cost_LDADD = $(LDADD) -@WITH_VITERBI_TRUE@test_cost_DEPENDENCIES = libvitacog.la \ -@WITH_VITERBI_TRUE@ $(top_builddir)/link-grammar/liblink-grammar.la \ -@WITH_VITERBI_TRUE@ $(am__DEPENDENCIES_1) -am__test_disjoin_SOURCES_DIST = test-disjoin.cc 
-@WITH_VITERBI_TRUE@am_test_disjoin_OBJECTS = test-disjoin.$(OBJEXT) -test_disjoin_OBJECTS = $(am_test_disjoin_OBJECTS) -test_disjoin_LDADD = $(LDADD) -@WITH_VITERBI_TRUE@test_disjoin_DEPENDENCIES = libvitacog.la \ -@WITH_VITERBI_TRUE@ $(top_builddir)/link-grammar/liblink-grammar.la \ -@WITH_VITERBI_TRUE@ $(am__DEPENDENCIES_1) -am__test_env_SOURCES_DIST = test-env.cc -@WITH_VITERBI_TRUE@am_test_env_OBJECTS = test-env.$(OBJEXT) -test_env_OBJECTS = $(am_test_env_OBJECTS) -test_env_LDADD = $(LDADD) -@WITH_VITERBI_TRUE@test_env_DEPENDENCIES = libvitacog.la \ -@WITH_VITERBI_TRUE@ $(top_builddir)/link-grammar/liblink-grammar.la \ -@WITH_VITERBI_TRUE@ $(am__DEPENDENCIES_1) -am__test_parser_SOURCES_DIST = test-parser.cc -@WITH_VITERBI_TRUE@am_test_parser_OBJECTS = test-parser.$(OBJEXT) -test_parser_OBJECTS = $(am_test_parser_OBJECTS) -test_parser_LDADD = $(LDADD) -@WITH_VITERBI_TRUE@test_parser_DEPENDENCIES = libvitacog.la \ -@WITH_VITERBI_TRUE@ $(top_builddir)/link-grammar/liblink-grammar.la \ -@WITH_VITERBI_TRUE@ $(am__DEPENDENCIES_1) -AM_V_P = $(am__v_P_@AM_V@) -am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) -am__v_P_0 = false -am__v_P_1 = : -AM_V_GEN = $(am__v_GEN_@AM_V@) -am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) -am__v_GEN_0 = @echo " GEN " $@; -am__v_GEN_1 = -AM_V_at = $(am__v_at_@AM_V@) -am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) -am__v_at_0 = @ -am__v_at_1 = -DEFAULT_INCLUDES = -I.@am__isrc@ -depcomp = $(SHELL) $(top_srcdir)/depcomp -am__depfiles_maybe = depfiles -am__mv = mv -f -CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ - $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ - $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ - $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ - $(AM_CXXFLAGS) $(CXXFLAGS) -AM_V_CXX = $(am__v_CXX_@AM_V@) -am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@) -am__v_CXX_0 = @echo " CXX " $@; -am__v_CXX_1 = -CXXLD = $(CXX) -CXXLINK = $(LIBTOOL) $(AM_V_lt) 
--tag=CXX $(AM_LIBTOOLFLAGS) \ - $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ - $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ -AM_V_CXXLD = $(am__v_CXXLD_@AM_V@) -am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@) -am__v_CXXLD_0 = @echo " CXXLD " $@; -am__v_CXXLD_1 = -COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ - $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ - $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ - $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ - $(AM_CFLAGS) $(CFLAGS) -AM_V_CC = $(am__v_CC_@AM_V@) -am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) -am__v_CC_0 = @echo " CC " $@; -am__v_CC_1 = -CCLD = $(CC) -LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ - $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ - $(AM_LDFLAGS) $(LDFLAGS) -o $@ -AM_V_CCLD = $(am__v_CCLD_@AM_V@) -am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) -am__v_CCLD_0 = @echo " CCLD " $@; -am__v_CCLD_1 = -SOURCES = $(libvitacog_la_SOURCES) $(test_cost_SOURCES) \ - $(test_disjoin_SOURCES) $(test_env_SOURCES) \ - $(test_parser_SOURCES) -DIST_SOURCES = $(am__libvitacog_la_SOURCES_DIST) \ - $(am__test_cost_SOURCES_DIST) $(am__test_disjoin_SOURCES_DIST) \ - $(am__test_env_SOURCES_DIST) $(am__test_parser_SOURCES_DIST) -am__can_run_installinfo = \ - case $$AM_UPDATE_INFO_DIR in \ - n|no|NO) false;; \ - *) (install-info --version) >/dev/null 2>&1;; \ - esac -am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) -# Read a list of newline-separated strings from the standard input, -# and print each of them once, without duplicates. Input order is -# *not* preserved. -am__uniquify_input = $(AWK) '\ - BEGIN { nonempty = 0; } \ - { items[$$0] = 1; nonempty = 1; } \ - END { if (nonempty) { for (i in items) print i; }; } \ -' -# Make sure the list of sources is unique. 
This is necessary because, -# e.g., the same source file might be shared among _SOURCES variables -# for different programs/libraries. -am__define_uniq_tagged_files = \ - list='$(am__tagged_files)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | $(am__uniquify_input)` -ETAGS = etags -CTAGS = ctags -am__tty_colors_dummy = \ - mgn= red= grn= lgn= blu= brg= std=; \ - am__color_tests=no -am__tty_colors = { \ - $(am__tty_colors_dummy); \ - if test "X$(AM_COLOR_TESTS)" = Xno; then \ - am__color_tests=no; \ - elif test "X$(AM_COLOR_TESTS)" = Xalways; then \ - am__color_tests=yes; \ - elif test "X$$TERM" != Xdumb && { test -t 1; } 2>/dev/null; then \ - am__color_tests=yes; \ - fi; \ - if test $$am__color_tests = yes; then \ - red=''; \ - grn=''; \ - lgn=''; \ - blu=''; \ - mgn=''; \ - brg=''; \ - std=''; \ - fi; \ -} -am__recheck_rx = ^[ ]*:recheck:[ ]* -am__global_test_result_rx = ^[ ]*:global-test-result:[ ]* -am__copy_in_global_log_rx = ^[ ]*:copy-in-global-log:[ ]* -# A command that, given a newline-separated list of test names on the -# standard input, print the name of the tests that are to be re-run -# upon "make recheck". -am__list_recheck_tests = $(AWK) '{ \ - recheck = 1; \ - while ((rc = (getline line < ($$0 ".trs"))) != 0) \ - { \ - if (rc < 0) \ - { \ - if ((getline line2 < ($$0 ".log")) < 0) \ - recheck = 0; \ - break; \ - } \ - else if (line ~ /$(am__recheck_rx)[nN][Oo]/) \ - { \ - recheck = 0; \ - break; \ - } \ - else if (line ~ /$(am__recheck_rx)[yY][eE][sS]/) \ - { \ - break; \ - } \ - }; \ - if (recheck) \ - print $$0; \ - close ($$0 ".trs"); \ - close ($$0 ".log"); \ -}' -# A command that, given a newline-separated list of test names on the -# standard input, create the global log from their .trs and .log files. 
-am__create_global_log = $(AWK) ' \ -function fatal(msg) \ -{ \ - print "fatal: making $@: " msg | "cat >&2"; \ - exit 1; \ -} \ -function rst_section(header) \ -{ \ - print header; \ - len = length(header); \ - for (i = 1; i <= len; i = i + 1) \ - printf "="; \ - printf "\n\n"; \ -} \ -{ \ - copy_in_global_log = 1; \ - global_test_result = "RUN"; \ - while ((rc = (getline line < ($$0 ".trs"))) != 0) \ - { \ - if (rc < 0) \ - fatal("failed to read from " $$0 ".trs"); \ - if (line ~ /$(am__global_test_result_rx)/) \ - { \ - sub("$(am__global_test_result_rx)", "", line); \ - sub("[ ]*$$", "", line); \ - global_test_result = line; \ - } \ - else if (line ~ /$(am__copy_in_global_log_rx)[nN][oO]/) \ - copy_in_global_log = 0; \ - }; \ - if (copy_in_global_log) \ - { \ - rst_section(global_test_result ": " $$0); \ - while ((rc = (getline line < ($$0 ".log"))) != 0) \ - { \ - if (rc < 0) \ - fatal("failed to read from " $$0 ".log"); \ - print line; \ - }; \ - printf "\n"; \ - }; \ - close ($$0 ".trs"); \ - close ($$0 ".log"); \ -}' -# Restructured Text title. -am__rst_title = { sed 's/.*/ & /;h;s/./=/g;p;x;s/ *$$//;p;g' && echo; } -# Solaris 10 'make', and several other traditional 'make' implementations, -# pass "-e" to $(SHELL), and POSIX 2008 even requires this. Work around it -# by disabling -e (using the XSI extension "set +e") if it's set. -am__sh_e_setup = case $$- in *e*) set +e;; esac -# Default flags passed to test drivers. -am__common_driver_flags = \ - --color-tests "$$am__color_tests" \ - --enable-hard-errors "$$am__enable_hard_errors" \ - --expect-failure "$$am__expect_failure" -# To be inserted before the command running the test. Creates the -# directory for the log if needed. Stores in $dir the directory -# containing $f, in $tst the test, in $log the log. Executes the -# developer- defined test setup AM_TESTS_ENVIRONMENT (if any), and -# passes TESTS_ENVIRONMENT. 
Set up options for the wrapper that -# will run the test scripts (or their associated LOG_COMPILER, if -# thy have one). -am__check_pre = \ -$(am__sh_e_setup); \ -$(am__vpath_adj_setup) $(am__vpath_adj) \ -$(am__tty_colors); \ -srcdir=$(srcdir); export srcdir; \ -case "$@" in \ - */*) am__odir=`echo "./$@" | sed 's|/[^/]*$$||'`;; \ - *) am__odir=.;; \ -esac; \ -test "x$$am__odir" = x"." || test -d "$$am__odir" \ - || $(MKDIR_P) "$$am__odir" || exit $$?; \ -if test -f "./$$f"; then dir=./; \ -elif test -f "$$f"; then dir=; \ -else dir="$(srcdir)/"; fi; \ -tst=$$dir$$f; log='$@'; \ -if test -n '$(DISABLE_HARD_ERRORS)'; then \ - am__enable_hard_errors=no; \ -else \ - am__enable_hard_errors=yes; \ -fi; \ -case " $(XFAIL_TESTS) " in \ - *[\ \ ]$$f[\ \ ]* | *[\ \ ]$$dir$$f[\ \ ]*) \ - am__expect_failure=yes;; \ - *) \ - am__expect_failure=no;; \ -esac; \ -$(AM_TESTS_ENVIRONMENT) $(TESTS_ENVIRONMENT) -# A shell command to get the names of the tests scripts with any registered -# extension removed (i.e., equivalently, the names of the test logs, with -# the '.log' extension removed). The result is saved in the shell variable -# '$bases'. This honors runtime overriding of TESTS and TEST_LOGS. Sadly, -# we cannot use something simpler, involving e.g., "$(TEST_LOGS:.log=)", -# since that might cause problem with VPATH rewrites for suffix-less tests. -# See also 'test-harness-vpath-rewrite.sh' and 'test-trs-basic.sh'. 
-am__set_TESTS_bases = \ - bases='$(TEST_LOGS)'; \ - bases=`for i in $$bases; do echo $$i; done | sed 's/\.log$$//'`; \ - bases=`echo $$bases` -RECHECK_LOGS = $(TEST_LOGS) -AM_RECURSIVE_TARGETS = check recheck -TEST_SUITE_LOG = test-suite.log -TEST_EXTENSIONS = @EXEEXT@ .test -LOG_DRIVER = $(SHELL) $(top_srcdir)/test-driver -LOG_COMPILE = $(LOG_COMPILER) $(AM_LOG_FLAGS) $(LOG_FLAGS) -am__set_b = \ - case '$@' in \ - */*) \ - case '$*' in \ - */*) b='$*';; \ - *) b=`echo '$@' | sed 's/\.log$$//'`; \ - esac;; \ - *) \ - b='$*';; \ - esac -am__test_logs1 = $(TESTS:=.log) -am__test_logs2 = $(am__test_logs1:@EXEEXT@.log=.log) -TEST_LOGS = $(am__test_logs2:.test.log=.log) -TEST_LOG_DRIVER = $(SHELL) $(top_srcdir)/test-driver -TEST_LOG_COMPILE = $(TEST_LOG_COMPILER) $(AM_TEST_LOG_FLAGS) \ - $(TEST_LOG_FLAGS) -am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp \ - $(top_srcdir)/test-driver README -DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) -ACLOCAL = @ACLOCAL@ -ALLOCA = @ALLOCA@ -AMTAR = @AMTAR@ -AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ -ANTfound = @ANTfound@ -AR = @AR@ -ASPELL_CFLAGS = @ASPELL_CFLAGS@ -ASPELL_LIBS = @ASPELL_LIBS@ -AUTOCONF = @AUTOCONF@ -AUTOHEADER = @AUTOHEADER@ -AUTOMAKE = @AUTOMAKE@ -AWK = @AWK@ -CC = @CC@ -CCDEPMODE = @CCDEPMODE@ -CFLAGS = @CFLAGS@ -CPP = @CPP@ -CPPFLAGS = @CPPFLAGS@ -CXX = @CXX@ -CXXCPP = @CXXCPP@ -CXXDEPMODE = @CXXDEPMODE@ -CXXFLAGS = @CXXFLAGS@ -CYGPATH_W = @CYGPATH_W@ -DEFS = @DEFS@ -DEPDIR = @DEPDIR@ -DLLTOOL = @DLLTOOL@ -DSYMUTIL = @DSYMUTIL@ -DUMPBIN = @DUMPBIN@ -ECHO_C = @ECHO_C@ -ECHO_N = @ECHO_N@ -ECHO_T = @ECHO_T@ -EGREP = @EGREP@ -EXEEXT = @EXEEXT@ -FGREP = @FGREP@ -GREP = @GREP@ -HUNSPELL_CFLAGS = @HUNSPELL_CFLAGS@ -HUNSPELL_LIBS = @HUNSPELL_LIBS@ -INSTALL = @INSTALL@ -INSTALL_DATA = @INSTALL_DATA@ -INSTALL_PROGRAM = @INSTALL_PROGRAM@ -INSTALL_SCRIPT = @INSTALL_SCRIPT@ -INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ -JAVA_CPPFLAGS = @JAVA_CPPFLAGS@ -LD = @LD@ -LDFLAGS = @LDFLAGS@ 
-LG_PYDIR = @LG_PYDIR@ -LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@ -LIBEDIT_LIBS = @LIBEDIT_LIBS@ -LIBGC_CFLAGS = @LIBGC_CFLAGS@ -LIBGC_LIBS = @LIBGC_LIBS@ -LIBOBJS = @LIBOBJS@ -LIBS = @LIBS@ -LIBTOOL = @LIBTOOL@ -LINK_CFLAGS = @LINK_CFLAGS@ -LINK_MAJOR_VERSION = @LINK_MAJOR_VERSION@ -LINK_MICRO_VERSION = @LINK_MICRO_VERSION@ -LINK_MINOR_VERSION = @LINK_MINOR_VERSION@ -LIPO = @LIPO@ -LN_S = @LN_S@ -LTLIBOBJS = @LTLIBOBJS@ -LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ -MAINT = @MAINT@ -MAKEINFO = @MAKEINFO@ -MANIFEST_TOOL = @MANIFEST_TOOL@ -MINISAT_INCLUDES = @MINISAT_INCLUDES@ -MINISAT_LIBS = @MINISAT_LIBS@ -MKDIR_P = @MKDIR_P@ -NM = @NM@ -NMEDIT = @NMEDIT@ -OBJDUMP = @OBJDUMP@ -OBJEXT = @OBJEXT@ -OTOOL = @OTOOL@ -OTOOL64 = @OTOOL64@ -PACKAGE = @PACKAGE@ -PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ -PACKAGE_NAME = @PACKAGE_NAME@ -PACKAGE_STRING = @PACKAGE_STRING@ -PACKAGE_TARNAME = @PACKAGE_TARNAME@ -PACKAGE_URL = @PACKAGE_URL@ -PACKAGE_VERSION = @PACKAGE_VERSION@ -PATH_SEPARATOR = @PATH_SEPARATOR@ -PCRE_CFLAGS = @PCRE_CFLAGS@ -PCRE_LIBS = @PCRE_LIBS@ -PERL_CFLAGS = @PERL_CFLAGS@ -PERL_EXT_INC = @PERL_EXT_INC@ -PERL_EXT_LIB = @PERL_EXT_LIB@ -PKG_CONFIG = @PKG_CONFIG@ -PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ -PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ -PYTHON = @PYTHON@ -PYTHON2_CPPFLAGS = @PYTHON2_CPPFLAGS@ -PYTHON2_LDFLAGS = @PYTHON2_LDFLAGS@ -PYTHON2_LIBS = @PYTHON2_LIBS@ -PYTHON3 = @PYTHON3@ -PYTHON3_CPPFLAGS = @PYTHON3_CPPFLAGS@ -PYTHON3_LDFLAGS = @PYTHON3_LDFLAGS@ -PYTHON3_LIBS = @PYTHON3_LIBS@ -PYTHON_CPPFLAGS = @PYTHON_CPPFLAGS@ -PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@ -PYTHON_EXTRA_LDFLAGS = @PYTHON_EXTRA_LDFLAGS@ -PYTHON_EXTRA_LIBS = @PYTHON_EXTRA_LIBS@ -PYTHON_LIBS = @PYTHON_LIBS@ -PYTHON_PLATFORM = @PYTHON_PLATFORM@ -PYTHON_PREFIX = @PYTHON_PREFIX@ -PYTHON_SITE_PKG = @PYTHON_SITE_PKG@ -PYTHON_VERSION = @PYTHON_VERSION@ -RANLIB = @RANLIB@ -REGEX_LIBS = @REGEX_LIBS@ -SED = @SED@ -SET_MAKE = @SET_MAKE@ -SHELL = @SHELL@ -SQLITE3_CFLAGS = @SQLITE3_CFLAGS@ -SQLITE3_LIBS = 
@SQLITE3_LIBS@ -STRIP = @STRIP@ -SWIG = @SWIG@ -SWIG_LIB = @SWIG_LIB@ -SWIGfound = @SWIGfound@ -VERSION = @VERSION@ -VERSION_INFO = @VERSION_INFO@ -WARN_CFLAGS = @WARN_CFLAGS@ -WARN_CXXFLAGS = @WARN_CXXFLAGS@ -ZLIB_CPPFLAGS = @ZLIB_CPPFLAGS@ -abs_builddir = @abs_builddir@ -abs_srcdir = @abs_srcdir@ -abs_top_builddir = @abs_top_builddir@ -abs_top_srcdir = @abs_top_srcdir@ -ac_ct_AR = @ac_ct_AR@ -ac_ct_CC = @ac_ct_CC@ -ac_ct_CXX = @ac_ct_CXX@ -ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ -am__include = @am__include@ -am__leading_dot = @am__leading_dot@ -am__quote = @am__quote@ -am__tar = @am__tar@ -am__untar = @am__untar@ -bindir = @bindir@ -build = @build@ -build_alias = @build_alias@ -build_cpu = @build_cpu@ -build_os = @build_os@ -build_vendor = @build_vendor@ -builddir = @builddir@ -datadir = @datadir@ -datarootdir = @datarootdir@ -docdir = @docdir@ -dvidir = @dvidir@ -exec_prefix = @exec_prefix@ -host = @host@ -host_alias = @host_alias@ -host_cpu = @host_cpu@ -host_os = @host_os@ -host_vendor = @host_vendor@ -htmldir = @htmldir@ -includedir = @includedir@ -infodir = @infodir@ -install_sh = @install_sh@ -libdir = @libdir@ -libexecdir = @libexecdir@ -localedir = @localedir@ -localstatedir = @localstatedir@ -mandir = @mandir@ -mkdir_p = @mkdir_p@ -oldincludedir = @oldincludedir@ -pdfdir = @pdfdir@ -pkgpyexecdir = @pkgpyexecdir@ -pkgpythondir = @pkgpythondir@ -prefix = @prefix@ -program_transform_name = @program_transform_name@ -psdir = @psdir@ -pyexecdir = @pyexecdir@ -python2dir = @python2dir@ -python3dir = @python3dir@ -pythondir = @pythondir@ -runstatedir = @runstatedir@ -sbindir = @sbindir@ -sharedstatedir = @sharedstatedir@ -srcdir = @srcdir@ -sysconfdir = @sysconfdir@ -target_alias = @target_alias@ -top_build_prefix = @top_build_prefix@ -top_builddir = @top_builddir@ -top_srcdir = @top_srcdir@ - -# Skip this entire directory, if not configured -@WITH_VITERBI_TRUE@AM_CPPFLAGS = -I.. 
-I$(top_srcdir) -I$(top_srcdir)/link-grammar \ -@WITH_VITERBI_TRUE@ $(WARN_CXXFLAGS) - -@WITH_VITERBI_TRUE@lib_LTLIBRARIES = libvitacog.la -@WITH_VITERBI_TRUE@libvitacog_la_SOURCES = \ -@WITH_VITERBI_TRUE@ atom.cc \ -@WITH_VITERBI_TRUE@ atom-types.cc \ -@WITH_VITERBI_TRUE@ compile-base.cc \ -@WITH_VITERBI_TRUE@ compile.cc \ -@WITH_VITERBI_TRUE@ compress.cc \ -@WITH_VITERBI_TRUE@ connect.cc \ -@WITH_VITERBI_TRUE@ connector-utils.cc \ -@WITH_VITERBI_TRUE@ disjoin.cc \ -@WITH_VITERBI_TRUE@ environment.cc \ -@WITH_VITERBI_TRUE@ garbage.cc \ -@WITH_VITERBI_TRUE@ parser.cc \ -@WITH_VITERBI_TRUE@ rewrite.cc \ -@WITH_VITERBI_TRUE@ upcast.cc \ -@WITH_VITERBI_TRUE@ word-monad.cc \ -@WITH_VITERBI_TRUE@ atom.h \ -@WITH_VITERBI_TRUE@ atom-types.h \ -@WITH_VITERBI_TRUE@ compile-base.h \ -@WITH_VITERBI_TRUE@ compile.h \ -@WITH_VITERBI_TRUE@ compress.h \ -@WITH_VITERBI_TRUE@ connect.h \ -@WITH_VITERBI_TRUE@ connector-utils.h \ -@WITH_VITERBI_TRUE@ disjoin.h \ -@WITH_VITERBI_TRUE@ environment.h \ -@WITH_VITERBI_TRUE@ garbage.h \ -@WITH_VITERBI_TRUE@ parser.h \ -@WITH_VITERBI_TRUE@ rewrite.h \ -@WITH_VITERBI_TRUE@ viterbi.h \ -@WITH_VITERBI_TRUE@ word-monad.h - -@WITH_VITERBI_TRUE@libvitacog_la_LIBADD = $(top_builddir)/link-grammar/liblink-grammar.la \ -@WITH_VITERBI_TRUE@ $(LIBGC_LIBS) -@WITH_VITERBI_TRUE@test_disjoin_SOURCES = test-disjoin.cc -@WITH_VITERBI_TRUE@test_parser_SOURCES = test-parser.cc -@WITH_VITERBI_TRUE@test_cost_SOURCES = test-cost.cc -@WITH_VITERBI_TRUE@test_env_SOURCES = test-env.cc -@WITH_VITERBI_TRUE@LDADD = libvitacog.la \ -@WITH_VITERBI_TRUE@ $(top_builddir)/link-grammar/liblink-grammar.la \ -@WITH_VITERBI_TRUE@ $(LIBGC_LIBS) -@WITH_VITERBI_TRUE@EXTRA_DIST = \ -@WITH_VITERBI_TRUE@ README \ -@WITH_VITERBI_TRUE@ README.atombase - -all: all-am - -.SUFFIXES: -.SUFFIXES: .cc .lo .log .o .obj .test .test$(EXEEXT) .trs -$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) - @for dep in $?; do \ - case '$(am__configure_deps)' in \ 
- *$$dep*) \ - ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ - && { if test -f $@; then exit 0; else break; fi; }; \ - exit 1;; \ - esac; \ - done; \ - echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign viterbi/Makefile'; \ - $(am__cd) $(top_srcdir) && \ - $(AUTOMAKE) --foreign viterbi/Makefile -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - @case '$?' in \ - *config.status*) \ - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ - *) \ - echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ - cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ - esac; - -$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh - -$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(am__aclocal_m4_deps): - -install-libLTLIBRARIES: $(lib_LTLIBRARIES) - @$(NORMAL_INSTALL) - @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ - list2=; for p in $$list; do \ - if test -f $$p; then \ - list2="$$list2 $$p"; \ - else :; fi; \ - done; \ - test -z "$$list2" || { \ - echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \ - echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \ - $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \ - } - -uninstall-libLTLIBRARIES: - @$(NORMAL_UNINSTALL) - @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ - for p in $$list; do \ - $(am__strip_dir) \ - echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \ 
- $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \ - done - -clean-libLTLIBRARIES: - -test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES) - @list='$(lib_LTLIBRARIES)'; \ - locs=`for p in $$list; do echo $$p; done | \ - sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ - sort -u`; \ - test -z "$$locs" || { \ - echo rm -f $${locs}; \ - rm -f $${locs}; \ - } - -libvitacog.la: $(libvitacog_la_OBJECTS) $(libvitacog_la_DEPENDENCIES) $(EXTRA_libvitacog_la_DEPENDENCIES) - $(AM_V_CXXLD)$(CXXLINK) $(am_libvitacog_la_rpath) $(libvitacog_la_OBJECTS) $(libvitacog_la_LIBADD) $(LIBS) - -clean-checkPROGRAMS: - @list='$(check_PROGRAMS)'; test -n "$$list" || exit 0; \ - echo " rm -f" $$list; \ - rm -f $$list || exit $$?; \ - test -n "$(EXEEXT)" || exit 0; \ - list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \ - echo " rm -f" $$list; \ - rm -f $$list - -test-cost$(EXEEXT): $(test_cost_OBJECTS) $(test_cost_DEPENDENCIES) $(EXTRA_test_cost_DEPENDENCIES) - @rm -f test-cost$(EXEEXT) - $(AM_V_CXXLD)$(CXXLINK) $(test_cost_OBJECTS) $(test_cost_LDADD) $(LIBS) - -test-disjoin$(EXEEXT): $(test_disjoin_OBJECTS) $(test_disjoin_DEPENDENCIES) $(EXTRA_test_disjoin_DEPENDENCIES) - @rm -f test-disjoin$(EXEEXT) - $(AM_V_CXXLD)$(CXXLINK) $(test_disjoin_OBJECTS) $(test_disjoin_LDADD) $(LIBS) - -test-env$(EXEEXT): $(test_env_OBJECTS) $(test_env_DEPENDENCIES) $(EXTRA_test_env_DEPENDENCIES) - @rm -f test-env$(EXEEXT) - $(AM_V_CXXLD)$(CXXLINK) $(test_env_OBJECTS) $(test_env_LDADD) $(LIBS) - -test-parser$(EXEEXT): $(test_parser_OBJECTS) $(test_parser_DEPENDENCIES) $(EXTRA_test_parser_DEPENDENCIES) - @rm -f test-parser$(EXEEXT) - $(AM_V_CXXLD)$(CXXLINK) $(test_parser_OBJECTS) $(test_parser_LDADD) $(LIBS) - -mostlyclean-compile: - -rm -f *.$(OBJEXT) - -distclean-compile: - -rm -f *.tab.c - -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atom-types.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/atom.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/compile-base.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/compile.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/compress.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/connect.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/connector-utils.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/disjoin.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/environment.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/garbage.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parser.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rewrite.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test-cost.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test-disjoin.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test-env.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test-parser.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/upcast.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/word-monad.Plo@am__quote@ - -.cc.o: -@am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\ -@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ -@am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $< - -.cc.obj: -@am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\ -@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\ -@am__fastdepCXX_TRUE@ $(am__mv) 
$$depbase.Tpo $$depbase.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` - -.cc.lo: -@am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\ -@am__fastdepCXX_TRUE@ $(LTCXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ -@am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Plo -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $< - -mostlyclean-libtool: - -rm -f *.lo - -clean-libtool: - -rm -rf .libs _libs - -ID: $(am__tagged_files) - $(am__define_uniq_tagged_files); mkid -fID $$unique -tags: tags-am -TAGS: tags - -tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) - set x; \ - here=`pwd`; \ - $(am__define_uniq_tagged_files); \ - shift; \ - if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ - test -n "$$unique" || unique=$$empty_fix; \ - if test $$# -gt 0; then \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - "$$@" $$unique; \ - else \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - $$unique; \ - fi; \ - fi -ctags: ctags-am - -CTAGS: ctags -ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) - $(am__define_uniq_tagged_files); \ - test -z "$(CTAGS_ARGS)$$unique" \ - || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ - $$unique - -GTAGS: - here=`$(am__cd) $(top_builddir) && pwd` \ - && $(am__cd) $(top_srcdir) \ - && gtags -i $(GTAGS_ARGS) "$$here" -cscopelist: cscopelist-am - -cscopelist-am: $(am__tagged_files) - list='$(am__tagged_files)'; \ - case "$(srcdir)" in \ - [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ - *) sdir=$(subdir)/$(srcdir) ;; 
\ - esac; \ - for i in $$list; do \ - if test -f "$$i"; then \ - echo "$(subdir)/$$i"; \ - else \ - echo "$$sdir/$$i"; \ - fi; \ - done >> $(top_builddir)/cscope.files - -distclean-tags: - -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags - -# Recover from deleted '.trs' file; this should ensure that -# "rm -f foo.log; make foo.trs" re-run 'foo.test', and re-create -# both 'foo.log' and 'foo.trs'. Break the recipe in two subshells -# to avoid problems with "make -n". -.log.trs: - rm -f $< $@ - $(MAKE) $(AM_MAKEFLAGS) $< - -# Leading 'am--fnord' is there to ensure the list of targets does not -# expand to empty, as could happen e.g. with make check TESTS=''. -am--fnord $(TEST_LOGS) $(TEST_LOGS:.log=.trs): $(am__force_recheck) -am--force-recheck: - @: - -$(TEST_SUITE_LOG): $(TEST_LOGS) - @$(am__set_TESTS_bases); \ - am__f_ok () { test -f "$$1" && test -r "$$1"; }; \ - redo_bases=`for i in $$bases; do \ - am__f_ok $$i.trs && am__f_ok $$i.log || echo $$i; \ - done`; \ - if test -n "$$redo_bases"; then \ - redo_logs=`for i in $$redo_bases; do echo $$i.log; done`; \ - redo_results=`for i in $$redo_bases; do echo $$i.trs; done`; \ - if $(am__make_dryrun); then :; else \ - rm -f $$redo_logs && rm -f $$redo_results || exit 1; \ - fi; \ - fi; \ - if test -n "$$am__remaking_logs"; then \ - echo "fatal: making $(TEST_SUITE_LOG): possible infinite" \ - "recursion detected" >&2; \ - elif test -n "$$redo_logs"; then \ - am__remaking_logs=yes $(MAKE) $(AM_MAKEFLAGS) $$redo_logs; \ - fi; \ - if $(am__make_dryrun); then :; else \ - st=0; \ - errmsg="fatal: making $(TEST_SUITE_LOG): failed to create"; \ - for i in $$redo_bases; do \ - test -f $$i.trs && test -r $$i.trs \ - || { echo "$$errmsg $$i.trs" >&2; st=1; }; \ - test -f $$i.log && test -r $$i.log \ - || { echo "$$errmsg $$i.log" >&2; st=1; }; \ - done; \ - test $$st -eq 0 || exit 1; \ - fi - @$(am__sh_e_setup); $(am__tty_colors); $(am__set_TESTS_bases); \ - ws='[ ]'; \ - results=`for b in $$bases; do echo $$b.trs; done`; \ - test 
-n "$$results" || results=/dev/null; \ - all=` grep "^$$ws*:test-result:" $$results | wc -l`; \ - pass=` grep "^$$ws*:test-result:$$ws*PASS" $$results | wc -l`; \ - fail=` grep "^$$ws*:test-result:$$ws*FAIL" $$results | wc -l`; \ - skip=` grep "^$$ws*:test-result:$$ws*SKIP" $$results | wc -l`; \ - xfail=`grep "^$$ws*:test-result:$$ws*XFAIL" $$results | wc -l`; \ - xpass=`grep "^$$ws*:test-result:$$ws*XPASS" $$results | wc -l`; \ - error=`grep "^$$ws*:test-result:$$ws*ERROR" $$results | wc -l`; \ - if test `expr $$fail + $$xpass + $$error` -eq 0; then \ - success=true; \ - else \ - success=false; \ - fi; \ - br='==================='; br=$$br$$br$$br$$br; \ - result_count () \ - { \ - if test x"$$1" = x"--maybe-color"; then \ - maybe_colorize=yes; \ - elif test x"$$1" = x"--no-color"; then \ - maybe_colorize=no; \ - else \ - echo "$@: invalid 'result_count' usage" >&2; exit 4; \ - fi; \ - shift; \ - desc=$$1 count=$$2; \ - if test $$maybe_colorize = yes && test $$count -gt 0; then \ - color_start=$$3 color_end=$$std; \ - else \ - color_start= color_end=; \ - fi; \ - echo "$${color_start}# $$desc $$count$${color_end}"; \ - }; \ - create_testsuite_report () \ - { \ - result_count $$1 "TOTAL:" $$all "$$brg"; \ - result_count $$1 "PASS: " $$pass "$$grn"; \ - result_count $$1 "SKIP: " $$skip "$$blu"; \ - result_count $$1 "XFAIL:" $$xfail "$$lgn"; \ - result_count $$1 "FAIL: " $$fail "$$red"; \ - result_count $$1 "XPASS:" $$xpass "$$red"; \ - result_count $$1 "ERROR:" $$error "$$mgn"; \ - }; \ - { \ - echo "$(PACKAGE_STRING): $(subdir)/$(TEST_SUITE_LOG)" | \ - $(am__rst_title); \ - create_testsuite_report --no-color; \ - echo; \ - echo ".. 
contents:: :depth: 2"; \ - echo; \ - for b in $$bases; do echo $$b; done \ - | $(am__create_global_log); \ - } >$(TEST_SUITE_LOG).tmp || exit 1; \ - mv $(TEST_SUITE_LOG).tmp $(TEST_SUITE_LOG); \ - if $$success; then \ - col="$$grn"; \ - else \ - col="$$red"; \ - test x"$$VERBOSE" = x || cat $(TEST_SUITE_LOG); \ - fi; \ - echo "$${col}$$br$${std}"; \ - echo "$${col}Testsuite summary for $(PACKAGE_STRING)$${std}"; \ - echo "$${col}$$br$${std}"; \ - create_testsuite_report --maybe-color; \ - echo "$$col$$br$$std"; \ - if $$success; then :; else \ - echo "$${col}See $(subdir)/$(TEST_SUITE_LOG)$${std}"; \ - if test -n "$(PACKAGE_BUGREPORT)"; then \ - echo "$${col}Please report to $(PACKAGE_BUGREPORT)$${std}"; \ - fi; \ - echo "$$col$$br$$std"; \ - fi; \ - $$success || exit 1 - -check-TESTS: - @list='$(RECHECK_LOGS)'; test -z "$$list" || rm -f $$list - @list='$(RECHECK_LOGS:.log=.trs)'; test -z "$$list" || rm -f $$list - @test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG) - @set +e; $(am__set_TESTS_bases); \ - log_list=`for i in $$bases; do echo $$i.log; done`; \ - trs_list=`for i in $$bases; do echo $$i.trs; done`; \ - log_list=`echo $$log_list`; trs_list=`echo $$trs_list`; \ - $(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) TEST_LOGS="$$log_list"; \ - exit $$?; -recheck: all $(check_PROGRAMS) - @test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG) - @set +e; $(am__set_TESTS_bases); \ - bases=`for i in $$bases; do echo $$i; done \ - | $(am__list_recheck_tests)` || exit 1; \ - log_list=`for i in $$bases; do echo $$i.log; done`; \ - log_list=`echo $$log_list`; \ - $(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) \ - am__force_recheck=am--force-recheck \ - TEST_LOGS="$$log_list"; \ - exit $$? 
-test-env.log: test-env$(EXEEXT) - @p='test-env$(EXEEXT)'; \ - b='test-env'; \ - $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ - --log-file $$b.log --trs-file $$b.trs \ - $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ - "$$tst" $(AM_TESTS_FD_REDIRECT) -test-disjoin.log: test-disjoin$(EXEEXT) - @p='test-disjoin$(EXEEXT)'; \ - b='test-disjoin'; \ - $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ - --log-file $$b.log --trs-file $$b.trs \ - $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ - "$$tst" $(AM_TESTS_FD_REDIRECT) -test-parser.log: test-parser$(EXEEXT) - @p='test-parser$(EXEEXT)'; \ - b='test-parser'; \ - $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ - --log-file $$b.log --trs-file $$b.trs \ - $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ - "$$tst" $(AM_TESTS_FD_REDIRECT) -test-cost.log: test-cost$(EXEEXT) - @p='test-cost$(EXEEXT)'; \ - b='test-cost'; \ - $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ - --log-file $$b.log --trs-file $$b.trs \ - $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ - "$$tst" $(AM_TESTS_FD_REDIRECT) -.test.log: - @p='$<'; \ - $(am__set_b); \ - $(am__check_pre) $(TEST_LOG_DRIVER) --test-name "$$f" \ - --log-file $$b.log --trs-file $$b.trs \ - $(am__common_driver_flags) $(AM_TEST_LOG_DRIVER_FLAGS) $(TEST_LOG_DRIVER_FLAGS) -- $(TEST_LOG_COMPILE) \ - "$$tst" $(AM_TESTS_FD_REDIRECT) -@am__EXEEXT_TRUE@.test$(EXEEXT).log: -@am__EXEEXT_TRUE@ @p='$<'; \ -@am__EXEEXT_TRUE@ $(am__set_b); \ -@am__EXEEXT_TRUE@ $(am__check_pre) $(TEST_LOG_DRIVER) --test-name "$$f" \ -@am__EXEEXT_TRUE@ --log-file $$b.log --trs-file $$b.trs \ -@am__EXEEXT_TRUE@ $(am__common_driver_flags) $(AM_TEST_LOG_DRIVER_FLAGS) $(TEST_LOG_DRIVER_FLAGS) -- $(TEST_LOG_COMPILE) \ -@am__EXEEXT_TRUE@ "$$tst" $(AM_TESTS_FD_REDIRECT) - -distdir: $(DISTFILES) - @srcdirstrip=`echo "$(srcdir)" | sed 
's/[].[^$$\\*]/\\\\&/g'`; \ - topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - list='$(DISTFILES)'; \ - dist_files=`for file in $$list; do echo $$file; done | \ - sed -e "s|^$$srcdirstrip/||;t" \ - -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ - case $$dist_files in \ - */*) $(MKDIR_P) `echo "$$dist_files" | \ - sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ - sort -u` ;; \ - esac; \ - for file in $$dist_files; do \ - if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ - if test -d $$d/$$file; then \ - dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ - if test -d "$(distdir)/$$file"; then \ - find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ - fi; \ - if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ - cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ - find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ - fi; \ - cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ - else \ - test -f "$(distdir)/$$file" \ - || cp -p $$d/$$file "$(distdir)/$$file" \ - || exit 1; \ - fi; \ - done -check-am: all-am - $(MAKE) $(AM_MAKEFLAGS) $(check_PROGRAMS) - $(MAKE) $(AM_MAKEFLAGS) check-TESTS -check: check-am -all-am: Makefile $(LTLIBRARIES) -installdirs: - for dir in "$(DESTDIR)$(libdir)"; do \ - test -z "$$dir" || $(MKDIR_P) "$$dir"; \ - done -install: install-am -install-exec: install-exec-am -install-data: install-data-am -uninstall: uninstall-am - -install-am: all-am - @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am - -installcheck: installcheck-am -install-strip: - if test -z '$(STRIP)'; then \ - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - install; \ - else \ - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ - fi 
-mostlyclean-generic: - -test -z "$(TEST_LOGS)" || rm -f $(TEST_LOGS) - -test -z "$(TEST_LOGS:.log=.trs)" || rm -f $(TEST_LOGS:.log=.trs) - -test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG) - -clean-generic: - -distclean-generic: - -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) - -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) - -maintainer-clean-generic: - @echo "This command is intended for maintainers to use" - @echo "it deletes files that may require special tools to rebuild." -clean: clean-am - -clean-am: clean-checkPROGRAMS clean-generic clean-libLTLIBRARIES \ - clean-libtool mostlyclean-am - -distclean: distclean-am - -rm -rf ./$(DEPDIR) - -rm -f Makefile -distclean-am: clean-am distclean-compile distclean-generic \ - distclean-tags - -dvi: dvi-am - -dvi-am: - -html: html-am - -html-am: - -info: info-am - -info-am: - -install-data-am: - -install-dvi: install-dvi-am - -install-dvi-am: - -install-exec-am: install-libLTLIBRARIES - -install-html: install-html-am - -install-html-am: - -install-info: install-info-am - -install-info-am: - -install-man: - -install-pdf: install-pdf-am - -install-pdf-am: - -install-ps: install-ps-am - -install-ps-am: - -installcheck-am: - -maintainer-clean: maintainer-clean-am - -rm -rf ./$(DEPDIR) - -rm -f Makefile -maintainer-clean-am: distclean-am maintainer-clean-generic - -mostlyclean: mostlyclean-am - -mostlyclean-am: mostlyclean-compile mostlyclean-generic \ - mostlyclean-libtool - -pdf: pdf-am - -pdf-am: - -ps: ps-am - -ps-am: - -uninstall-am: uninstall-libLTLIBRARIES - -.MAKE: check-am install-am install-strip - -.PHONY: CTAGS GTAGS TAGS all all-am check check-TESTS check-am clean \ - clean-checkPROGRAMS clean-generic clean-libLTLIBRARIES \ - clean-libtool cscopelist-am ctags ctags-am distclean \ - distclean-compile distclean-generic distclean-libtool \ - distclean-tags distdir dvi dvi-am html html-am info info-am \ - install install-am install-data 
install-data-am install-dvi \ - install-dvi-am install-exec install-exec-am install-html \ - install-html-am install-info install-info-am \ - install-libLTLIBRARIES install-man install-pdf install-pdf-am \ - install-ps install-ps-am install-strip installcheck \ - installcheck-am installdirs maintainer-clean \ - maintainer-clean-generic mostlyclean mostlyclean-compile \ - mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ - recheck tags tags-am uninstall uninstall-am \ - uninstall-libLTLIBRARIES - -.PRECIOUS: Makefile - - -# Tell versions [3.59,3.63) of GNU make to not export all variables. -# Otherwise a system limit (for SysV at least) may be exceeded. -.NOEXPORT: diff -Nru link-grammar-5.3.16/viterbi/parser.cc link-grammar-5.5.0/viterbi/parser.cc --- link-grammar-5.3.16/viterbi/parser.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/parser.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,427 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#include - -#include -#include -#include -#include -#include - -#include -#include "api-types.h" -#include "read-dict.h" -#include "structures.h" - -#include "atom.h" -#include "compile.h" -#include "disjoin.h" -#include "garbage.h" -#include "parser.h" -#include "viterbi.h" -#include "word-monad.h" - -using namespace std; - -#define DBG(X) X; - -namespace link_grammar { -namespace viterbi { - -Parser::Parser(Dictionary dict) - : _dict(dict), _alternatives(NULL) -{ - DBG(cout << "=============== Parser ctor ===============" << endl); - do_init_gc(); - initialize_state(); -} - -// =================================================================== -/** - * Convert LG dictionary expression to atomic formula. - * - * The returned expression is in the form of an opencog-style - * prefix-notation boolean expression. Note that it is not in any - * particular kind of normal form. In particular, some AND nodes - * may have only one child: these can be removed. - * - * Note that the order of the connectors is important: while linking, - * these must be satisfied in left-to-right (nested!?) order. - * - * Optional clauses are indicated by OR-ing with null, where "null" - * is a CONNECTOR Node with string-value "0". Optional clauses are - * not necessarily in any sort of normal form; the null connector can - * appear anywhere. - */ -Atom * Parser::lg_exp_to_atom(Exp* exp) -{ - // log-likelihood is identical to the link-grammar cost. - float likli = exp->cost; - - if (CONNECTOR_type == exp->type) - { - stringstream ss; - if (exp->multi) ss << "@"; - ss << exp->u.string << exp->dir; - - return new Connector(ss.str().c_str(), likli); - } - - // Whenever a null appears in an OR-list, it means the - // entire OR-list is optional. A null can never appear - // in an AND-list. 
- E_list* el = exp->u.l; - if (NULL == el) - return new Connector(OPTIONAL_CLAUSE, likli); - - // The C data structure that link-grammar uses for connector - // expressions is totally insane, as witnessed by the loop below. - // Anyway: operators are infixed, i.e. are always binary, - // with exp->u.l->e being the left hand side, and - // exp->u.l->next->e being the right hand side. - // This means that exp->u.l->next->next is always null. - OutList alist; - alist.push_back(lg_exp_to_atom(el->e)); - el = el->next; - - while (el && exp->type == el->e->type) - { - el = el->e->u.l; - alist.push_back(lg_exp_to_atom(el->e)); - el = el->next; - } - - if (el) - alist.push_back(lg_exp_to_atom(el->e)); - - if (AND_type == exp->type) - return new And(alist, likli); - - if (OR_type == exp->type) - return new Or(alist, likli); - - assert(0, "Not reached"); -} - -// =================================================================== -/** - * Iterate over a set of connector-sets. If there is a mixture of - * different costs found in a connector-set, then split it up into - * several differeent ones, each with the appropriate cost. - * - * In principle, we could split up everything. Right now, we don't - * because: - * 1) the current unit tests would be surprised by this, and some - * would fail. - * 2) the resulting graph would be larger, more verbose. - * On the other hand, if we did split up everything here, then the - * parsing algo could become simpler/smaller. Hmmm... what to do ... - */ -static Set* cost_split(Set* raw_csets) -{ - OutList cooked; - foreach_outgoing(WordCset*, wcs, raw_csets) - { - Atom* c = wcs->get_cset(); - Or* orc = dynamic_cast(c); - if (!orc) - { - // Promote costs, if any, from the disjunct to the connector set. - wcs->_tv = c->_tv; - c->_tv = 0.0f; - cooked.push_back(wcs); - continue; - } - - // If we are here, then we have a set of disjuncts. - // Split out any costly disjuncts into their own. 
- OutList trim; - foreach_outgoing(Atom*, dj, orc) - { - if (dj->_tv == 0.0f) - { - trim.push_back(dj); - continue; - } - WordCset* cwcs = new WordCset(wcs->get_word(), dj); - cwcs->_tv = dj->_tv; - dj->_tv = 0.0f; - cooked.push_back(cwcs); - } - if (1 == trim.size()) - { - WordCset* dj = new WordCset(wcs->get_word(), trim[0]); - cooked.push_back(dj); - } - else if (1 < trim.size()) - { - WordCset* dj = new WordCset(wcs->get_word(), new Or(trim)); - cooked.push_back(dj); - } - } - return new Set(cooked); -} - -// =================================================================== - -/// Given a disjunct of connectors, propagate a cost -/// on any one of them up to the disjunct as a whole. -static void promote_cost(And* disjunct) -{ - // Promote costs, if any, from each connector, to the disjunct - foreach_outgoing(Atom*, a, disjunct) - { - disjunct->_tv += a->_tv; - a->_tv = 0.0f; - } -} - -/// Given a list of connector sets, search for any costs pasted onto -/// some individual connector, and push it up onto the disjunct that -/// contains that connector. -static Set* cost_up(Set* raw_csets) -{ - foreach_outgoing(WordCset*, wcs, raw_csets) - { - Atom* c = wcs->get_cset(); - And* dj = dynamic_cast(c); - if (dj) - { - promote_cost(dj); - continue; - } - - Or* orc = dynamic_cast(c); - if (orc) - { - foreach_outgoing(And*, dj, orc) - { - if (dj) - promote_cost(dj); - } - } - } - return raw_csets; -} - -// =================================================================== -/** - * Return atomic formula connector expression for the given word. - * - * This looks up the word in the link-grammar dictionary, and converts - * the resulting link-grammar connective expression into an formula - * composed of atoms. 
- */ -Set* Parser::word_consets(const string& word) -{ - Set* raw_csets = raw_word_consets(word); - return cost_split(cost_up(raw_csets)); -} - -// =================================================================== -/** - * Return atomic formula connector expression for the given word. - * - * This looks up the word in the link-grammar dictionary, and converts - * the resulting link-grammar connective expression into a formula - * composed of atoms. - * - * The return form is 'raw' in that the costs have not yet been correctly - * distributed over the words: i.e. this might return a connector set that - * might be a disjunction over different costs. For example: - * - * SET : - * WORD_CSET : - * WORD : is.v - * OR: - * AND : - * CONNECTOR : Ss- - * CONNECTOR : Wi- - * AND : (2) - * CONNECTOR : Ss- - * CONNECTOR : Wd- - * - * Notice the cost on the second disjunt: this would completely mess - * things up if it were placed into the atomspace, since that cost would - * screw things up for any other expressions having this sub-expression. - */ -Set * Parser::raw_word_consets(const string& word) -{ - // See if we know about this word, or not. - Dict_node* dn_head = dictionary_lookup_list(_dict, word.c_str()); - if (!dn_head) return new Set(); - - OutList djset; - for (Dict_node*dn = dn_head; dn; dn= dn->right) - { - Exp* exp = dn->exp; - DBG({cout << "=============== Parser word: " << dn->string << ": "; - print_expression(exp); }); - - Atom *dj = lg_exp_to_atom(exp); - dj = disjoin(dj); - - // First atom at the front of the outgoing set is the word itself. - // Second atom is the first disjuct that must be fulfilled. 
- Word* nword = new Word(dn->string); - djset.push_back(new WordCset(nword, dj)); - } - free_lookup_list(dn_head); - return new Set(djset); -} - -// =================================================================== -/** - * Set up initial viterbi state for the parser - */ -void Parser::initialize_state() -{ - _alternatives = new Set( - new StateTriple( - new Seq(), - new Seq(), - new Set() - ) - ); - - const char * wall_word = "LEFT-WALL"; - stream_word(wall_word); -} - -// =================================================================== -/** - * Add a single word to the parse. - */ -void Parser::stream_word(const string& word) -{ - // Look up the dictionary entries for this word. - Set *djset = word_consets(word); - if (!djset) - { - cout << "Unhandled error; word not in dict: >>" << word << "<<" << endl; - assert (0, "word not in dict"); - return; - } - - // Try to add each dictionary entry to the parse state. - Set* new_alts = new Set(); - foreach_outgoing(WordCset*, wrd_cset, djset) - { - WordMonad cnct(wrd_cset); - Set* alts = cnct(_alternatives); - new_alts = new_alts->sum(alts); - } - _alternatives = new_alts; -} - -// =================================================================== -/** convenience wrapper */ -Set* Parser::get_alternatives() -{ - return _alternatives; -} - -// =================================================================== -/** - * Add a stream of text to the input. - * - * No particular assumptiions are made about the input, other than - * that its space-separated words (i.e. 
no HTML markup or other junk) - */ -void Parser::streamin(const string& text) -{ - // A trivial tokenizer - size_t pos = 0; - while(true) - { - size_t wend = text.find(' ', pos); // wend == word end - if (wend != string::npos) - { - const string word = text.substr(pos, wend-pos); - stream_word(word); - pos = wend + 1; // skip over space - while (' ' == text[pos]) - pos++; - } - else - { - const string word = text.substr(pos); - if (0 < word.size()) - stream_word(word); - break; - } - } -} - -// Send in the right wall -- the traditional link-grammar -// design wants this to terminate sentences. -void Parser::stream_end() -{ - const char * right_wall_word = "RIGHT-WALL"; - Set *wall_disj = word_consets(right_wall_word); - - // We are expecting the initial wall to be unique. - assert(wall_disj->get_arity() == 1, "Unexpected wall structure"); - Atom* wall_cset = wall_disj->get_outgoing_atom(0); - WordCset* rwcs = dynamic_cast(wall_cset); - - WordMonad cnct(rwcs); - _alternatives = cnct(_alternatives); -} - -void viterbi_parse(Dictionary dict, const char * sentence) -{ - Parser pars(dict); - - pars.streamin(sentence); - - // The old link-grammar design insists on having a RIGHT-WALL, - // so provide one. - pars.stream_end(); - - atombase::Link* alts = pars.get_alternatives(); - - /* Print some diagnostic outputs ... for now. Remove when finished. 
*/ - size_t num_alts = alts->get_arity(); - printf("Found %lu alternatives\n", num_alts); - for (size_t i=0; iget_outgoing_atom(i); - StateTriple* sp = dynamic_cast(a); - Seq* state = sp->get_state(); - size_t state_sz = state->get_arity(); - if (0 == state_sz) - { - cout << "\nAlternative =============== " << i << endl; - cout << sp->get_output() << endl; - } - else - { - cout << "\nIncomplete parse =============== " << i << endl; - cout << sp->get_output() << endl; - cout << "\n---- state for ----" << i << endl; - cout << sp->get_state() << endl; - } - } -} - -} // namespace viterbi -} // namespace link-grammar - -// =================================================================== - -// Wrapper to escape out from C++ -void viterbi_parse(const char * sentence, Dictionary dict) -{ - link_grammar::viterbi::viterbi_parse(dict, sentence); -} - diff -Nru link-grammar-5.3.16/viterbi/parser.h link-grammar-5.5.0/viterbi/parser.h --- link-grammar-5.3.16/viterbi/parser.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/parser.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,54 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#ifndef _LG_VITERBI_PARSER_H -#define _LG_VITERBI_PARSER_H - -#include - -#include "atom.h" -#include "compile.h" -#include "garbage.h" - -// link-grammar include files, needed for Exp, Dict -#include "api-types.h" -#include "structures.h" - -namespace link_grammar { -namespace viterbi { - -class Parser : public gc -{ - public: - Parser(Dictionary dict); - - void streamin(const std::string&); - void stream_word(const std::string&); - void stream_end(); - - Set* word_consets(const std::string& word); - - Set* get_alternatives(); - protected: - void initialize_state(); - Atom* lg_exp_to_atom(Exp*); - Set* raw_word_consets(const std::string& word); - - Dictionary _dict; - private: - Set* _alternatives; -}; - - -} // namespace viterbi -} // namespace link-grammar - -#endif // _LG_VITERBI_PARSER_H diff -Nru link-grammar-5.3.16/viterbi/README link-grammar-5.5.0/viterbi/README --- link-grammar-5.3.16/viterbi/README 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/README 1970-01-01 00:00:00.000000000 +0000 @@ -1,326 +0,0 @@ - - Viterbi Decoder - --------------- - Linas Vepstas March 2013 - -This directory contains code for a prototype Viterbi decoder for Link -Grammar. It barely functions and is currently very slow. - - -Motivation -========== -The goal of providing this decoder is to present a flexible, powerful -interface for implementing high-level semantic algorithms on top of -the low-level link-grammar syntactic parser, and, in particular, -for steering the parse based on high-level semantic knowledge. This -allows the parser to move beyond being merely a syntactic parser, -and to become fully integrated with general semantic artificial -intelligence. - -A less abstract list of expected benefits include: - - * Incremental parsing: the ability to obtain partial results after - providing partial sentences, a word at a time. 
- * Less sensitivity to sentence boundaries, allowing longer, - run-on sentences to be parsed far more quickly. - * Mitigation of the combinatorial explosion of parses. - * Allow grammatically broken/incorrect chat dialog to be parsed; - in general, to do better with slang, hip-speak. - * Enable co-reference resolution and anaphora resolution across - sentences (resolve pronouns, etc.) - * Enable annotation of the parse graph with word-sense data, entity - markers. - * Allow richer state to be passed up to higher layers: specifically, - alternate parses for fractions of a sentence, alternative reference - resolutions. - * Allow a plug-in architecture, so that plug-ins, employing higher- - level semantic (AGI) algorithms can provide parse guidance and - parse disambiguation. - * Eliminate many of the hard-coded array sizes in the code. - * Fix the word-count problem during spell-guessing. So, for - example, if the mis-spelled word "dont" shows up in the input, it - could be issued as one word ("done") or two ("do n't"). The - current suffix-stripping/word-issuing algo cannot deal with this - correctly. This is also an issue for the Russian dictionary, where - the stem+suffix processing can generate variable word counts. - - -The data structures used to implement this resemble those of the -OpenCog AtomSpace. All data classes inherit from a class called Atom -(which is an atomic predicate, in the sense of mathematical logic). -Atoms are typed; the two core types are Links and Nodes. Thus, all data -is represented in the form of a "term algebra" (aka the "Free Theory", -in the sense of model theory). This structure allows all data to be -represented as (hyper-)graphs, which in turn makes the implementation -of graph algorithms easier to implement. All these theoretical -considerations provide a natural setting for storing Viterbi state -information. 
Put differently, this provide a generic, uniform way of -holding the various partly-finished parses, and effecting state -transformations on them. - -Making the internal state directly visible allows low-level syntactic -algorithms, as well as high-level, semantic algorithms to control parsing. -In other words, the intended use of the Viterbi decoder is to provide -a framework for parsing that should make it possible to integrate -tightly (and cleanly) with high-level semantic analysis algorithms. -Thus, reference and anaphora resolution can be done using the same -graph structure as used for parsing; it should also allow graphical -transformations, such as those currently implemented in RelEx. - -Since all of the data is represented dynamically (at run-time) by -these (hyper-)graphs composed of atoms, developing custom algorithms -to manipulate the parse becomes easy: there are no strange compile-time -structures to master. All algorithms can access the data in a uniform, -common way. - -One may argue that Viterbi is a more natural, biological way of working -with sequences. Some experimental, psychological support for this can -be found here: -http://www.sciencedaily.com/releases/2012/09/120925143555.htm -per Morten Christiansen, Cornell professor of psychology. - - -Why "Viterbi"? -============== -The parser implemented here is called a "Viterbi decoder" because it is -inspired by (and vaguely resembles) the Viterbi algorithm famous from signal -processing. A characteristic feature of that algorithm is that it maintains -a set of states in parallel. As each new bit is received, some of the states -become inherently inconsistent (e.g. because some checksum is violated), -while other new states become possible. Once some certain number of bits -have been received, the ones that can be consistently interpreted with the -checksum constraints can be output. The process then repeats with each new bit -streaming in. 
Likewise, the code here keeps a set of states in parallel; -each state is tied to some possible pending output that is consistent with -that state. As each new word comes in, some of the states become invalid -and are discarded (because the word cannot link to that state). When all -states agree on some linkage, that linkage can be confidently output. -As words flow in, the process is repeated. If there are ambiguities, then -multiple parse alternatives can be output (this is rather unlike the -original Viterbi algorithm, whose goal is to generate only one output, -the single most likely output). For language parsing, however, it is -useful to convey parse alternatives to higher-order processes, which can -make a final determination (analogous to having a higher-order checksum -functioning within the Viterbi decoder). - -I like this analogy because it is vaguely biological as well: or perhaps -I should say "neural net-ish". The multiple, provisional states that -are kept around are sort of like the activation states of a feed-forward -artificial neural network. But this is not very deep: the feed-forward -neural net looks like a Hidden Markov Model (HMM), and the Viterbi -algorithm is essentially an HMM algorithm. No surprise! - - -Status -====== -Currently, the parser can correctly parse many short sentences. It -currently runs very slowly, as no pruning algorithms have yet been -implemented. No probabilities or weights have yet been implemented, -groundwork for this is being laid now. - - -Using the parser -================ -To use this parser, you must first configure with the --enable-viterbi -flag: - - configure --enable-viterbi - -Then, in the client, switch to the viterbi parser with the ! command -!viterbi (If confused, try !help and !var for general help). - -The vitest.cc file contains unit tests. Currently, it consists of 95 -tests, and all but the last of them pass. More tests will be added. 
- - -Design -====== -What follows is a writeup of the current design. It is incomplete. -Parts of it may also be stale & out-of-date. This part is meant only for -programmers wishing to update this code. If you are just a user, you -can skip reading this section. - - -AtomSpace Design ----------------- -The concept of Atoms, Nodes, Links used in the implementation here -generally resembles the concepts with the matching names, in OpenCog. -That is, Nodes and Links are two fundamental types of Atoms. Nodes -have names, whereas Links contain lists of Atoms. These two types are -used to form hypergraphs, where nodes are labelled, but links are not. -In C++, these are implemented as derived classes. Each, in turn, has -further subtypes: for example, Word is a special type of Node, and Set -is a subtype of Link. - -Each atom has associated with it a 'truth value'. In OpenCog, truth -values may be rather complicated; in the current implementation, here, -each truth value is just a single floating point number. It is meant to -be interpreted as the 'log likelihood', or the 'cost' or the 'entropy', -depending on the context. The word 'cost' comes from link-grammar, -and is used to indicate preference: a parse with the lowest total cost -is the most likely parse. Note that costs are additive. - -Roughly speaking, probability P can be defined as P = exp(-C) where -C is the cost. Traditional OpenCog stores P in truth values; we store -C instead. It just seems easier to debug using C instead of P. One -can always convert the one into the other. - -It is the intent of the Viterbi parser design to not only use cost -as a guide for selecting and prioritizing parses, but also to eventually -collect statistical data on real-world corpora, and automatically, -through machine learning, update the dictionary with costs and new -disjuncts. Thus, the hypergraphs will behave somewhat like Bayesian -networks, or perhaps somewhat like Markov Logic Networks. 
Detailed -interpretation of costs and how they are distributed and propagated -through the network, both during parsing, and during learning, have -not yet been figured out. Its kind of complicated :-) - -The traditional OpenCog AtomSpace design insists that there can only -ever be one single instance of any given atom at any time. This is -not yet enforced in the code here, but might be 'real soon now'. It -is certainly shaping design decisions. - - -Disjunct Design ---------------- -The core underlying concept of link-grammar are 'disjuncts', which are -sets of connectors, each of which must be connected in order for a -disjunct to be satisfied. A given dictionary word may have many -disjuncts; however, only one may be satisfied at a time. A typical -dictionary entry for a word consists of several connectors composed -with '&' and 'or' operators. Although it is very tempting to think of -'&' and 'or' as being boolean operators, they are not. First of all, -connectors must be kept in sequential order: they do not commute -under '&' (However, 'or' does commute). Next, one can distribute -'or' over '&' but not vice versa. Thus, if A+, B+ and C+ are -connectors, then it is true that - - A+ & (B+ or C+) == A+ & (C+ or B+) - == (A+ & B+) or (A+ & C+) - == (A+ & C+) or (A+ & B+) - -However, one cannot distribute the other way: - - A+ or (B+ & C+) != (A+ or B+) & (A+ or C+) - -There are several ways to understand the above prohibition: - 1) Non-copying of connectors: connectors cannot be duplicated. The - RHS of the above has two copies of A+, but the LHS has just one. - (Non-copying is quantum-like. How weird is that?). - - 2) One might say that 'or' does behave like boolean OR, but '&' does - not. - - 3) One might say that 'or' behaves like 'tensor plus' while '&' - behaves like 'tensor times' in linear logic. Except that tensor - times is commutative, and & is not... 
- - 4) Alternately, 'or' behaves sort-of like 'exclusive-or', in that - only one of the arguments to 'exclusive-or' is allowed to hold. - An N>2 argument XOR is more correctly called a 'choose-one-of-N' - function: a logic demultiplexer. - -The simplest interpretation seems to be a 'many-worlds' interpretation: -That is, use 4) so that 'or' is 'choose-one-world-of-many', and enforce -1) non-copying of connectors within a single world. BTW, note that any -connector prefixes with a '@' is explicitly copyable! - -In the C++ code, the '&' operator will be called 'Tensand' (tensor-and), -while the 'or' operator will be called 'Plexor' (multiplexed-or). -(TODO: change the names.) - -Link-grammar dictionary entries are written as expressions of connectors, -'&' and 'or'. In order to actually use them in parsing a sentence, it -seems easiest to expand them into disjunctive normal form (DNF). This -makes it easy to determine that one and only one disjunct is being used, -and that all of the connectors in the disjunct are being satisfied. -The various disjoin() methods on the various classes are used to place -expressions into DNF. - -To be clear: (A+ & B+) is a disjunct (all of the connectors are -disjoined). By contrast, A+ & (B+ or C+) is NOT a disjunct; but it -can be disjoined into two of them. - -Several complications arise when we want to consider truth values -(probabilities, or 'costs') associated with disjuncts. First of all, -costs need to be correctly distributed when disjuncts are being formed. -Thus, for example, using [[]] to denote cost (just as in the -link-grammar dicts), we have: - - [[A+]] & (B+ or C+) == ([[A+]] & B+) or ([[A+]] & C+) - == [[A+ & B+]] or [[A+ & C+]] - == [[A+ & (B+ or C+)]] - == A+ & [[B+ or C+]] - -while - - A+ & ([[B+]] or C+) == (A+ & [[B+]]) or (A+ & C+) - == [[A+ & B+]] or (A+ & C+) - -In this last expression, the disjunct [[A+ & B+]] is costly, while -(A+ & C+) is not. 
- -The next issue is that the same disjunct can have different costs when -used with different words. Thus, [[A+ & B+]] may be costly when used -with the verb 'to be' but not when used with the verb 'to go'. Thus, -the cost annotation must be associated with the word-disjunct pair, -and not with the disjunct alone. Putting all these different -observations together results in the following representation for an -entry in the dictionary (after it has been placed in DNF): - -Dictionary entry as it appear in link-grammar dicts: - - jabberwoky: A+ & ([[B+]] or C+); - -Dictionary entry, as represented in the atomspace, after being disjoined: - - Plexor - WordCset (cost 2.0) ; Notice non-zero cost - Word "jabberwoky" - Tensand ; Notice cost of zero for tensand - Connector "A+" - Connector "B+" - - WordCset ; Zero cost here - Word "jabberwoky" - Tensand - Connector "A+" - Connector "C+" - -Note that the act of disjoining has to have the Plexor 'pass through' -the word labels (but not the Tensand). So, for example, just after -the dictionary entry is read from the file, but before before it is -placed into the atomspace, it is represented as: - - WordCset - Word "jabberwoky" - Tensand - Connector "A+" - Plexor - Connector "B+" (cost 2.0) - Connector "C+" - -which is certainly more compact than above, except that: - -1) Its very hard to develop an algo that will correctly couple - connectors during a parse, when the entry is in this form. - Thus, we want DNF just to make the algos doable. - -2) The cost on B+ means that we have to be careful that this connector - is not in the atomspace. This is because the atomspace rules allow - only one, singular, unique atom called "Connector B+", and thus, - changing its cost would affect all expressions that contain it! 
- -3) Updating costs on Tensands (disjuncts) also raises similar issues: - the atomspace allows one and only one disjunct of the form - (Tensand (Connector "A+") (Connector "B+")) in the atomspace, so - setting its cost would change the perceived cost for any other - expression that might contain this sub-expression. - - - -Historical dates -================ -Idea conceived: April 2008 -Coding started: October 2012 -First successful long sentence parse: March 2013 diff -Nru link-grammar-5.3.16/viterbi/rewrite.cc link-grammar-5.5.0/viterbi/rewrite.cc --- link-grammar-5.3.16/viterbi/rewrite.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/rewrite.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,23 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include "rewrite.h" - -namespace link_grammar { -namespace viterbi { - - -// Nothing here yet ... - - -} // namespace viterbi -} // namespace link-grammar - diff -Nru link-grammar-5.3.16/viterbi/rewrite.h link-grammar-5.5.0/viterbi/rewrite.h --- link-grammar-5.3.16/viterbi/rewrite.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/rewrite.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,41 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. 
*/ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _LG_VITERBI_REWRITE_H -#define _LG_VITERBI_REWRITE_H - -#include "atom.h" - -namespace link_grammar { -namespace viterbi { -using namespace atombase; - -// Base class for all graph-rewrite rules. -// For now, most/all rewrite rules will be implemented in C++ -// Eventually, the goal is not to do this, but for now, this seems -// like the shortest path to something functional. So assorted -// misc algrothms will inherit from this class. -class Rule : public atombase::Link -{ - public: - Rule(void) - : Link(RULE, new Node("")) - {} - Node* rule_name() { return dynamic_cast(_oset[0]); } - Atom* apply(Atom* a) { return a; } -}; - - -} // namespace viterbi -} // namespace link-grammar - -#endif // _LG_VITERBI_REWRITE_H - diff -Nru link-grammar-5.3.16/viterbi/test-cost.cc link-grammar-5.5.0/viterbi/test-cost.cc --- link-grammar-5.3.16/viterbi/test-cost.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/test-cost.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,192 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -/// This file provides a unit test for the operation of the viterbi parser. 
- -#include "test-header.h" - -#include -#include - -// ================================================================== -bool test_disjoin_cost() -{ - total_tests++; - const char* post_nominal = - ": [[[()]]];" - "wtf: ({[Bpj+]} & Xd- & (Xc+ or ) & MX-);"; - - Dictionary dict = dictionary_create_from_utf8(post_nominal); - Parser parser(dict); - - Set* csets = parser.word_consets("wtf"); - - Lynk* expected = - ALINK4(SET, - ALINK2C(WORD_CSET, - ANODE(WORD, "wtf"), - ALINK2(AND, - ANODE(CONNECTOR, "Xd-"), - ANODE(CONNECTOR, "MX-") - ), - 3.0f), - ALINK2C(WORD_CSET, - ANODE(WORD, "wtf"), - ALINK4(AND, - ANODE(CONNECTOR, "Xd-"), - ANODE(CONNECTOR, "MX-"), - ANODE(CONNECTOR, "Bpj+"), - ANODE(CONNECTOR, "Xc+") - ), - 1.0f), - ALINK2C(WORD_CSET, - ANODE(WORD, "wtf"), - ALINK3(AND, - ANODE(CONNECTOR, "Xd-"), - ANODE(CONNECTOR, "MX-"), - ANODE(CONNECTOR, "Bpj+") - ), - 4.0f), - ALINK2C(WORD_CSET, - ANODE(WORD, "wtf"), - ALINK3(AND, - ANODE(CONNECTOR, "Xd-"), - ANODE(CONNECTOR, "MX-"), - ANODE(CONNECTOR, "Xc+") - ), - 0.0f) - ); - - if (not (csets->operator==(expected))) - { - cout << "Error: test failure on test \"test_disjoin_cost\"" << endl; - cout << "=== Expecting:\n" << expected << endl; - cout << "=== Got:\n" << csets << endl; - return false; - } - cout << "PASS: test_disjoin_cost" << endl; - return true; -} - -// ================================================================== -// XXX currently a copy of test_short_sent ... -bool test_cost(const char *id, const char *dict_str, bool empty_state) -{ - total_tests++; - - Dictionary dict = dictionary_create_from_utf8(dict_str); - // print_dictionary_data(dict); - -cout<<"xxxxxxxxxxxxxxxxxxxxxxxx last test xxxxxxxxxxxxxxxx" <operator==(alts))) - { - cout << "Error: test failure on test \"" << id <<"\"" << endl; - cout << "=== Expecting:\n" << ans << endl; - cout << "=== Got:\n" << alts << endl; - return false; - } - } - else - { - // At least one alternative should be the desired state pair. 
- bool found = false; - foreach_outgoing(Atom*, a, alts) - { - if (sp->operator==(a)) - found = true; - } - if (not found) - { - cout << "Error: test failure on test \"" << id <<"\"" << endl; - cout << "=== Expecting one of them to be:\n" << sp << endl; - cout << "=== Got:\n" << alts << endl; - return false; - } - } - - cout<<"PASS: test_short_sent(" << id << ") " << endl; - return true; -} - -bool test_cost_this() -{ - return test_cost("short cost sent", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Ss*b+;" - "is.v: Ss- and Wi-;" - "is.w: [[Ss- and Wd-]];", - true - ); -} - -int ntest_cost() -{ - size_t num_failures = 0; - - if (!test_disjoin_cost()) num_failures++; - - if (!test_cost_this()) num_failures++; - return num_failures; -} - -// ================================================================== - -int -main(int argc, char *argv[]) -{ - size_t num_failures = 0; - bool exit_on_fail = true; - - num_failures += ntest_cost(); - report(num_failures, exit_on_fail); - - exit (0); -} - diff -Nru link-grammar-5.3.16/viterbi/test-disjoin.cc link-grammar-5.5.0/viterbi/test-disjoin.cc --- link-grammar-5.3.16/viterbi/test-disjoin.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/test-disjoin.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,1112 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -/// This file provides a unit test for the operation of the viterbi parser. 
-#include "test-header.h" -#include "compile.h" - -// ================================================================== -// Test some basic atomic functions. - -bool test_operator_equals() -{ - Node* na = new Node("abc"); - Node* nb = new Node("abc", 0.8f); - CHECK_NE(__FUNCTION__, na, nb); -} - -bool test_operator_equals2() -{ - Node* na = new Node("abcd", 0.8f*0.8f); - Node* nb = new Node("abcd", 0.6400001f); - CHECK(__FUNCTION__, na, nb); -} - -int ntest_core() -{ - size_t num_failures = 0; - if (!test_operator_equals()) num_failures++; - if (!test_operator_equals2()) num_failures++; - return num_failures; -} - -// ================================================================== -// Test the flatten function - -bool test_flatten() -{ - Or* or_right = new Or( - ANODE(WORD, "AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3")) - ); - Or* computed = or_right->flatten(); - - Lynk* expected = - ALINK3(OR, - ANODE(WORD, "AA1"), - ANODE(WORD, "BB2"), - ANODE(WORD, "CC3") - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_flatten_rec() -{ - Or* or_right = new Or( - ALINK2(OR, - ANODE(WORD, "AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"))) - ); - Or* computed = or_right->flatten(); - - Lynk* expected = - ALINK3(OR, - ANODE(WORD, "AA1"), - ANODE(WORD, "BB2"), - ANODE(WORD, "CC3") - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_flatten_nest() -{ - And* and_right = new And( - ALINK2(OR, - ANODE(WORD, "AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"))) - ); - Atom* computed = and_right->super_flatten(); - - Lynk* expected = - ALINK3(OR, - ANODE(WORD, "AA1"), - ANODE(WORD, "BB2"), - ANODE(WORD, "CC3") - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_flatten_nest_deep() -{ - And* and_right = new And( - ALINK3(OR, - ANODE(WORD, "AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3")), - ALINK3(AND, - ANODE(WORD, "XAA1"), - ALINK2(AND, ANODE(WORD, "XBB2"), ANODE(WORD, "XCC3")), - ALINK2(AND, 
ANODE(WORD, "XDD4"), ANODE(WORD, "XEE5")) - ) - ) - ); - Atom* computed = and_right->super_flatten(); - - Lynk* expected = - ALINK4(OR, - ANODE(WORD, "AA1"), - ANODE(WORD, "BB2"), - ANODE(WORD, "CC3"), - ALINK5(AND, - ANODE(WORD, "XAA1"), - ANODE(WORD, "XBB2"), - ANODE(WORD, "XCC3"), - ANODE(WORD, "XDD4"), - ANODE(WORD, "XEE5") - ) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_cost_flatten() -{ - Or* or_right = new Or( - ANODEC(WORD, "AA1", 0.01), - ALINK2C(OR, ANODEC(WORD, "BB2", 0.02), ANODEC(WORD, "CC3", 0.03), 0.001), - 0.1f); - Or* computed = or_right->flatten(); - - Lynk* expected = - ALINK3C(OR, - ANODEC(WORD, "AA1", 0.01f), - ANODEC(WORD, "BB2", 0.021f), - ANODEC(WORD, "CC3", 0.031f), - 0.1f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_cost_flatten_rec() -{ - Or* or_right = new Or( - ALINK2C(OR, - ANODEC(WORD, "AA1", 0.01f), - ALINK2C(OR, - ANODEC(WORD, "BB2", 0.02f), - ANODEC(WORD, "CC3",0.03f), - 0.003f), - 0.0004f), - 0.1f); - Or* computed = or_right->flatten(); - - Lynk* expected = - ALINK3C(OR, - ANODEC(WORD, "AA1", 0.0104f), - ANODEC(WORD, "BB2", 0.0234f), - ANODEC(WORD, "CC3", 0.0334f), - 0.1f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_cost_flatten_nest() -{ - And* and_right = new And( - ALINK2C(OR, - ANODEC(WORD, "AA1", 0.01f), - ALINK2C(OR, - ANODEC(WORD, "BB2", 0.02f), - ANODEC(WORD, "CC3", 0.03f), - 0.003f), - 0.0004f), - 0.1f); - Atom* computed = and_right->super_flatten(); - - Lynk* expected = - ALINK3C(OR, - ANODEC(WORD, "AA1", 0.01f), - ANODEC(WORD, "BB2", 0.023f), - ANODEC(WORD, "CC3", 0.033f), - 0.1004f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_cost_flatten_nest_deep() -{ - And* and_right = new And( - ALINK3C(OR, - ANODEC(WORD, "AA1", 0.01f), - ALINK2C(OR, - ANODEC(WORD, "BB2", 0.02f), - ANODEC(WORD, "CC3", 0.03f), - 0.003f), - ALINK3C(AND, - ANODEC(WORD, "XAA1", 0.00001f), - ALINK2C(AND, - ANODEC(WORD, "XBB2", 0.00002f), - ANODEC(WORD, "XCC3", 0.00003f), - 
0.000003f), - ALINK2C(AND, - ANODEC(WORD, "XDD4", 0.00004f), - ANODEC(WORD, "XEE5", 0.00005f), - 0.000006f), - 0.5f), - 0.00007f), - 0.1f); - Atom* computed = and_right->super_flatten(); - - Lynk* expected = - ALINK4C(OR, - ANODEC(WORD, "AA1", 0.01f), - ANODEC(WORD, "BB2", 0.023f), - ANODEC(WORD, "CC3", 0.033f), - ALINK5C(AND, - ANODEC(WORD, "XAA1", 0.00001f), - ANODEC(WORD, "XBB2", 0.000023f), - ANODEC(WORD, "XCC3", 0.000033f), - ANODEC(WORD, "XDD4", 0.000046f), - ANODEC(WORD, "XEE5", 0.000056f), - 0.5f), - 0.10007f); - - CHECK(__FUNCTION__, expected, computed); -} - -int ntest_flatten() -{ - size_t num_failures = 0; - if (!test_flatten()) num_failures++; - if (!test_flatten_rec()) num_failures++; - if (!test_flatten_nest()) num_failures++; - if (!test_flatten_nest_deep()) num_failures++; - - if (!test_cost_flatten()) num_failures++; - if (!test_cost_flatten_rec()) num_failures++; - if (!test_cost_flatten_nest()) num_failures++; - if (!test_cost_flatten_nest_deep()) num_failures++; - return num_failures; -} - -// ================================================================== -// Make sure that the disjoined functions actually work. 
- -bool test_and_dnf_single() -{ - And* and_singleton = new And(ANODE(WORD, "AA1")); - Atom* computed = and_singleton->disjoin(); - - Atom* expected = ANODE(WORD, "AA1"); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_and_dnf_double() -{ - And* and_two = new And(ANODE(WORD, "AA1"), ANODE(WORD, "BB2")); - Atom* computed = and_two->disjoin(); - - Lynk* expected = - ALINK2(AND, ANODE(WORD, "AA1"), ANODE(WORD, "BB2")); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_and_distrib_left() -{ - And* and_right = new And( - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3")), - ANODE(WORD, "RR1")); - Atom* computed = and_right->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK2(AND, ANODE(WORD, "BB2"), ANODE(WORD, "RR1")), - ALINK2(AND, ANODE(WORD, "CC3"), ANODE(WORD, "RR1")) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_and_distrib_right() -{ - And* and_right = new And(ANODE(WORD, "AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"))); - Atom* computed = and_right->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK2(AND, ANODE(WORD, "AA1"), ANODE(WORD, "BB2")), - ALINK2(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3")) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_and_distrib_middle() -{ - And* and_mid = new And(ANODE(WORD, "AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3")), - ANODE(WORD, "DD4")); - Atom* computed = and_mid->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK3(AND, ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), ANODE(WORD, "DD4")), - ALINK3(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3"), ANODE(WORD, "DD4")) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_and_distrib_quad() -{ - And* and_mid = new And( - ALINK2(OR, ANODE(WORD, "AA1"), ANODE(WORD, "BB2")), - ALINK2(OR, ANODE(WORD, "CC3"), ANODE(WORD, "DD4"))); - Atom* computed = and_mid->disjoin(); - - Lynk* expected = - ALINK4(OR, - ALINK2(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3")), - ALINK2(AND, ANODE(WORD, "BB2"), 
ANODE(WORD, "CC3")), - ALINK2(AND, ANODE(WORD, "AA1"), ANODE(WORD, "DD4")), - ALINK2(AND, ANODE(WORD, "BB2"), ANODE(WORD, "DD4")) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_and_distrib_quad_right() -{ - And* and_mid = new And( - ALINK2(OR, ANODE(WORD, "AA1"), ANODE(WORD, "BB2")), - ALINK2(OR, ANODE(WORD, "CC3"), ANODE(WORD, "DD4")), - ANODE(WORD, "EE5") - ); - Atom* computed = and_mid->disjoin(); - - Lynk* expected = - ALINK4(OR, - ALINK3(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3"), ANODE(WORD, "EE5")), - ALINK3(AND, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), ANODE(WORD, "EE5")), - ALINK3(AND, ANODE(WORD, "AA1"), ANODE(WORD, "DD4"), ANODE(WORD, "EE5")), - ALINK3(AND, ANODE(WORD, "BB2"), ANODE(WORD, "DD4"), ANODE(WORD, "EE5")) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_and_distrib_quad_left() -{ - And* and_mid = new And( - ANODE(WORD, "EE5"), - ALINK2(OR, ANODE(WORD, "AA1"), ANODE(WORD, "BB2")), - ALINK2(OR, ANODE(WORD, "CC3"), ANODE(WORD, "DD4"))); - Atom* computed = and_mid->disjoin(); - - Lynk* expected = - ALINK4(OR, - ALINK3(AND, ANODE(WORD, "EE5"), ANODE(WORD, "AA1"), ANODE(WORD, "CC3")), - ALINK3(AND, ANODE(WORD, "EE5"), ANODE(WORD, "BB2"), ANODE(WORD, "CC3")), - ALINK3(AND, ANODE(WORD, "EE5"), ANODE(WORD, "AA1"), ANODE(WORD, "DD4")), - ALINK3(AND, ANODE(WORD, "EE5"), ANODE(WORD, "BB2"), ANODE(WORD, "DD4")) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_or_dnf_single() -{ - Or* or_singleton = new Or(ANODE(WORD, "AA1")); - Atom* computed = or_singleton->disjoin(); - - Atom* expected = ANODE(WORD, "AA1"); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_or_dnf_double() -{ - Or* or_two = new Or(ANODE(WORD, "AA1"), ANODE(WORD, "BB2")); - Atom* computed = or_two->disjoin(); - - Lynk* expected = - ALINK2(OR, ANODE(WORD, "AA1"), ANODE(WORD, "BB2")); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_or_distrib_left() -{ - Or* or_right = new Or( - ALINK2(AND, - ALINK2(OR, ANODE(WORD, 
"BB2"), ANODE(WORD, "CC3")), - ANODE(WORD, "RR1")) - ); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK2(AND, ANODE(WORD, "BB2"), ANODE(WORD, "RR1")), - ALINK2(AND, ANODE(WORD, "CC3"), ANODE(WORD, "RR1")) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_or_distrib_right() -{ - Or* or_right = new Or( - ALINK2(AND, - ANODE(WORD, "AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"))) - ); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK2(AND, ANODE(WORD, "AA1"), ANODE(WORD, "BB2")), - ALINK2(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3")) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_or_distrib_nest() -{ - Or* or_right = new Or( - ALINK1(OR, - ALINK2(AND, - ANODE(WORD, "AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3")))) - ); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK2(AND, ANODE(WORD, "AA1"), ANODE(WORD, "BB2")), - ALINK2(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3")) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_or_distrib_nest2() -{ - Or* or_right = new Or( - ALINK3(OR, - ANODE(WORD, "DD4"), - ALINK2(AND, - ANODE(WORD, "AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"))), - ANODE(WORD, "EE5")) - ); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK4(OR, - ANODE(WORD, "DD4"), - ALINK2(AND, ANODE(WORD, "AA1"), ANODE(WORD, "BB2")), - ALINK2(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3")), - ANODE(WORD, "EE5") - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_or_distrib_nest3() -{ - Or* or_right = new Or( - ANODE(WORD, "AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"))); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK3(OR, - ANODE(WORD, "AA1"), - ANODE(WORD, "BB2"), - ANODE(WORD, "CC3") - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_or_distrib_nest4() -{ - Or* or_right = new Or( - ALINK2(OR, - ANODE(WORD, 
"AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"))) - ); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK3(OR, - ANODE(WORD, "AA1"), - ANODE(WORD, "BB2"), - ANODE(WORD, "CC3") - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_or_distrib_nest5() -{ - And* and_right = new And( - ALINK2(OR, - ANODE(WORD, "AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"))) - ); - Atom* computed = and_right->disjoin(); - - Lynk* expected = - ALINK3(OR, - ANODE(WORD, "AA1"), - ANODE(WORD, "BB2"), - ANODE(WORD, "CC3") - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_or_distrib_nest6() -{ - Or* or_right = new Or( - ALINK1(AND, - ALINK2(OR, - ANODE(WORD, "AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"))) - ) - ); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK3(OR, - ANODE(WORD, "AA1"), - ANODE(WORD, "BB2"), - ANODE(WORD, "CC3") - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_or_distrib_nest7() -{ - Or* or_right = new Or( - ALINK3(AND, - ANODE(WORD, "DD4"), - ALINK2(OR, - ANODE(WORD, "AA1"), - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"))), - ANODE(WORD, "EE5")) - ); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK3(OR, - ALINK3(AND, ANODE(WORD, "DD4"), ANODE(WORD, "AA1"), ANODE(WORD, "EE5")), - ALINK3(AND, ANODE(WORD, "DD4"), ANODE(WORD, "BB2"), ANODE(WORD, "EE5")), - ALINK3(AND, ANODE(WORD, "DD4"), ANODE(WORD, "CC3"), ANODE(WORD, "EE5")) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -int ntest_disjoin() -{ - size_t num_failures = 0; - if (!test_and_dnf_single()) num_failures++; - if (!test_and_dnf_double()) num_failures++; - if (!test_and_distrib_left()) num_failures++; - if (!test_and_distrib_right()) num_failures++; - if (!test_and_distrib_middle()) num_failures++; - if (!test_and_distrib_quad()) num_failures++; - if (!test_and_distrib_quad_right()) num_failures++; - if (!test_and_distrib_quad_left()) num_failures++; - - if 
(!test_or_dnf_single()) num_failures++; - if (!test_or_dnf_double()) num_failures++; - if (!test_or_distrib_left()) num_failures++; - if (!test_or_distrib_right()) num_failures++; - - if (!test_or_distrib_nest()) num_failures++; - if (!test_or_distrib_nest2()) num_failures++; - if (!test_or_distrib_nest3()) num_failures++; - if (!test_or_distrib_nest4()) num_failures++; - if (!test_or_distrib_nest5()) num_failures++; - if (!test_or_distrib_nest6()) num_failures++; - if (!test_or_distrib_nest7()) num_failures++; - return num_failures; -} - - -// ================================================================== -// Make sure that the disjoined functions actually work. -// Identical to the above, except this time, there are costs involved. - -bool test_costly_and_dnf_single() -{ - And* and_singleton = new And(ANODEC(WORD, "AA1", 1.5f)); - Atom* computed = and_singleton->disjoin(); - - Atom* expected = ANODEC(WORD, "AA1", 1.5f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_and_dnf_single_ne() -{ - And* and_singleton = new And(ANODEC(WORD, "AA1", 1.5f)); - Atom* computed = and_singleton->disjoin(); - - Atom* expected = ANODEC(WORD, "AA1", 31.6f); - - CHECK_NE(__FUNCTION__, expected, computed); -} - -bool test_costly_and_dnf_single_sum() -{ - And* and_singleton = new And(ANODEC(WORD, "AA1", 1.5f), 0.3f); - Atom* computed = and_singleton->disjoin(); - - Atom* expected = ANODEC(WORD, "AA1", 1.8f); - - CHECK(__FUNCTION__, expected, computed); -} - -// ----------------------------------------------- -bool test_costly_and_dnf_double() -{ - And* and_two = new And(ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), 1.6f); - Atom* computed = and_two->disjoin(); - - Lynk* expected = - ALINK2C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), 1.6f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_and_dnf_double_w() -{ - And* and_two = new And(ANODE(WORD, "AA1"), ANODEC(WORD, "BB2", 2.8f)); - Atom* computed = and_two->disjoin(); - - Lynk* expected = - 
ALINK2(AND, ANODE(WORD, "AA1"), ANODEC(WORD, "BB2", 2.8f)); - - CHECK(__FUNCTION__, expected, computed); -} - -// ----------------------------------------------- -bool test_costly_and_distrib_left() -{ - And* and_right = new And( - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 1.1f), - ANODE(WORD, "RR1")); - Atom* computed = and_right->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK2C(AND, ANODE(WORD, "BB2"), ANODE(WORD, "RR1"), 1.1f), - ALINK2C(AND, ANODE(WORD, "CC3"), ANODE(WORD, "RR1"), 1.1f) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_and_distrib_left_sum() -{ - And* and_right = new And( - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 1.1f), - ANODE(WORD, "RR1"), 0.8f); - Atom* computed = and_right->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK2C(AND, ANODE(WORD, "BB2"), ANODE(WORD, "RR1"), 1.9f), - ALINK2C(AND, ANODE(WORD, "CC3"), ANODE(WORD, "RR1"), 1.9f) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_and_distrib_left_w() -{ - And* and_right = new And( - ALINK2(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3")), - ANODEC(WORD, "RR1", 3.14f)); - Atom* computed = and_right->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK2(AND, ANODE(WORD, "BB2"), ANODEC(WORD, "RR1", 3.14f)), - ALINK2(AND, ANODE(WORD, "CC3"), ANODEC(WORD, "RR1", 3.14f)) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -// ----------------------------------------------- -bool test_costly_and_distrib_right() -{ - And* and_right = new And(ANODE(WORD, "AA1"), - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 0.35)); - Atom* computed = and_right->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK2C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), 0.35), - ALINK2C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3"), 0.35) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_and_distrib_right_sum() -{ - And* and_right = new And(ANODE(WORD, "AA1"), - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 
0.35f), 0.5f); - Atom* computed = and_right->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK2C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), 0.85f), - ALINK2C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3"), 0.85f) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_and_distrib_middle() -{ - And* and_mid = new And(ANODE(WORD, "AA1"), - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 2.1f), - ANODE(WORD, "DD4"), 0.6f); - Atom* computed = and_mid->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK3C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), ANODE(WORD, "DD4"), 2.7f), - ALINK3C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3"), ANODE(WORD, "DD4"), 2.7f) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_and_distrib_quad() -{ - And* and_mid = new And( - ALINK2C(OR, ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), 1.1f), - ALINK2C(OR, ANODE(WORD, "CC3"), ANODE(WORD, "DD4"), 2.2f), 0.4f); - Atom* computed = and_mid->disjoin(); - - Lynk* expected = - ALINK4(OR, - ALINK2C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3"), 3.7f), - ALINK2C(AND, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 3.7f), - ALINK2C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "DD4"), 3.7f), - ALINK2C(AND, ANODE(WORD, "BB2"), ANODE(WORD, "DD4"), 3.7f) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_and_distrib_quad_right() -{ - And* and_mid = new And( - ALINK2C(OR, ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), 0.25f), - ALINK2C(OR, ANODE(WORD, "CC3"), ANODE(WORD, "DD4"), 0.35f), - ANODE(WORD, "EE5"), 0.5f - ); - Atom* computed = and_mid->disjoin(); - - Lynk* expected = - ALINK4(OR, - ALINK3C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3"), ANODE(WORD, "EE5"), 1.1f), - ALINK3C(AND, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), ANODE(WORD, "EE5"), 1.1f), - ALINK3C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "DD4"), ANODE(WORD, "EE5"), 1.1f), - ALINK3C(AND, ANODE(WORD, "BB2"), ANODE(WORD, "DD4"), ANODE(WORD, "EE5"), 1.1f) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool 
test_costly_and_distrib_quad_left() -{ - And* and_mid = new And( - ANODE(WORD, "EE5"), - ALINK2C(OR, ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), 0.11f), - ALINK2C(OR, ANODE(WORD, "CC3"), ANODE(WORD, "DD4"), 0.22f), 0.1f); - Atom* computed = and_mid->disjoin(); - - Lynk* expected = - ALINK4(OR, - ALINK3C(AND, ANODE(WORD, "EE5"), ANODE(WORD, "AA1"), ANODE(WORD, "CC3"), 0.43f), - ALINK3C(AND, ANODE(WORD, "EE5"), ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 0.43f), - ALINK3C(AND, ANODE(WORD, "EE5"), ANODE(WORD, "AA1"), ANODE(WORD, "DD4"), 0.43f), - ALINK3C(AND, ANODE(WORD, "EE5"), ANODE(WORD, "BB2"), ANODE(WORD, "DD4"), 0.43f) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_or_dnf_single() -{ - Or* or_singleton = new Or(ANODE(WORD, "AA1"), 0.75f); - Atom* computed = or_singleton->disjoin(); - - Atom* expected = ANODEC(WORD, "AA1", 0.75f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_or_dnf_double() -{ - Or* or_two = new Or(ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), 0.65f); - Atom* computed = or_two->disjoin(); - - Lynk* expected = - ALINK2C(OR, ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), 0.65f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_or_distrib_left() -{ - Or* or_right = new Or( - ALINK2C(AND, - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 0.1f), - ANODE(WORD, "RR1"), 0.22f), - 0.333f); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK2C(AND, ANODE(WORD, "BB2"), ANODE(WORD, "RR1"), 0.653f), - ALINK2C(AND, ANODE(WORD, "CC3"), ANODE(WORD, "RR1"), 0.653f) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_or_distrib_right() -{ - Or* or_right = new Or( - ALINK2C(AND, - ANODE(WORD, "AA1"), - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 0.111f), 0.222f), - 0.5f); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK2C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), 0.833f), - ALINK2C(AND, ANODE(WORD, "AA1"), ANODE(WORD, 
"CC3"), 0.833f) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -// ----------------------------------------------- -bool test_costly_or_distrib_nest() -{ - Or* or_right = new Or( - ALINK1C(OR, - ALINK2C(AND, - ANODE(WORD, "AA1"), - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 0.1f), - 0.02f), - 0.003f), - 0.0004f); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK2(OR, - ALINK2C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), 0.1234f), - ALINK2C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3"), 0.1234f) - ); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_or_distrib_nest2() -{ - Or* or_right = new Or( - ALINK1C(OR, - ALINK2C(AND, - ANODE(WORD, "AA1"), - ANODEC(WORD, "BB2", 0.1f), - 0.02f), - 0.003f), - 0.0004f); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK2C(AND, ANODE(WORD, "AA1"), ANODEC(WORD, "BB2", 0.1f), 0.0234f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_or_distrib_nest3() -{ - Or* or_right = new Or( - ALINK1C(OR, - ANODEC(WORD, "BB2", 0.1f), - 0.003f), - 0.0004f); - Atom* computed = or_right->disjoin(); - - Atom* expected = ANODEC(WORD, "BB2", 0.1034f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_or_distrib_nest4() -{ - Or* or_right = new Or( - ALINK3C(OR, - ANODE(WORD, "DD4"), - ALINK2C(AND, - ANODE(WORD, "AA1"), - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 0.1f), 0.02f), - ANODE(WORD, "EE5"), 0.003f), - 0.0004f); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK4C(OR, - ANODEC(WORD, "DD4", 0.0f), - ALINK2C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "BB2"), 0.12f), - ALINK2C(AND, ANODE(WORD, "AA1"), ANODE(WORD, "CC3"), 0.12f), - ANODEC(WORD, "EE5", 0.0f), - 0.0034f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_or_distrib_nest5() -{ - Or* or_right = new Or( - ANODE(WORD, "AA1"), - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 0.01), - 0.1f); - Atom* computed = or_right->disjoin(); - - 
Lynk* expected = - ALINK3C(OR, - ANODE(WORD, "AA1"), - ANODEC(WORD, "BB2", 0.01f), - ANODEC(WORD, "CC3", 0.01f), - 0.1f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_or_distrib_nest6() -{ - Or* or_right = new Or( - ALINK2C(OR, - ANODE(WORD, "AA1"), - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 0.001f), - 0.01f), - 0.1f); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK3C(OR, - ANODE(WORD, "AA1"), - ANODEC(WORD, "BB2", 0.001f), - ANODEC(WORD, "CC3", 0.001f), - 0.11f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_or_distrib_nest7() -{ - And* and_right = new And( - ALINK2C(OR, - ANODE(WORD, "AA1"), - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 0.001f), - 0.01f), - 0.1f); - Atom* computed = and_right->disjoin(); - - Lynk* expected = - ALINK3C(OR, - ANODE(WORD, "AA1"), - ANODEC(WORD, "BB2", 0.001f), - ANODEC(WORD, "CC3", 0.001f), - 0.11f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_or_distrib_nest8() -{ - Or* or_right = new Or( - ALINK1C(AND, - ALINK2C(OR, - ANODE(WORD, "AA1"), - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 0.0004f), - 0.003f), - 0.02f), - 0.1f); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK3C(OR, - ANODE(WORD, "AA1"), - ANODEC(WORD, "BB2", 0.0004f), - ANODEC(WORD, "CC3", 0.0004f), - 0.123f); - - CHECK(__FUNCTION__, expected, computed); -} - -bool test_costly_or_distrib_nest9() -{ - Or* or_right = new Or( - ALINK3C(AND, - ANODE(WORD, "DD4"), - ALINK2C(OR, - ANODE(WORD, "AA1"), - ALINK2C(OR, ANODE(WORD, "BB2"), ANODE(WORD, "CC3"), 0.1f), 0.02f), - ANODE(WORD, "EE5"), 0.003f), - 0.0004f); - Atom* computed = or_right->disjoin(); - - Lynk* expected = - ALINK3C(OR, - ALINK3C(AND, ANODE(WORD, "DD4"), ANODE(WORD, "AA1"), ANODE(WORD, "EE5"), 0.0234f), - ALINK3C(AND, ANODE(WORD, "DD4"), ANODEC(WORD, "BB2", 0.1f), ANODE(WORD, "EE5"), 0.0234f), - ALINK3C(AND, ANODE(WORD, "DD4"), ANODEC(WORD, "CC3", 0.1f), ANODE(WORD, "EE5"), 
0.0234f), - 0.0f); - - CHECK(__FUNCTION__, expected, computed); -} - -int ntest_costly_disjoin() -{ - size_t num_failures = 0; - if (!test_costly_and_dnf_single()) num_failures++; - if (!test_costly_and_dnf_single_ne()) num_failures++; - if (!test_costly_and_dnf_single_sum()) num_failures++; - - if (!test_costly_and_dnf_double()) num_failures++; - if (!test_costly_and_dnf_double_w()) num_failures++; - - if (!test_costly_and_distrib_left()) num_failures++; - if (!test_costly_and_distrib_left_sum()) num_failures++; - if (!test_costly_and_distrib_left_w()) num_failures++; - - if (!test_costly_and_distrib_right()) num_failures++; - if (!test_costly_and_distrib_right_sum()) num_failures++; - - if (!test_costly_and_distrib_middle()) num_failures++; - if (!test_costly_and_distrib_quad()) num_failures++; - if (!test_costly_and_distrib_quad_right()) num_failures++; - if (!test_costly_and_distrib_quad_left()) num_failures++; - - if (!test_costly_or_dnf_single()) num_failures++; - if (!test_costly_or_dnf_double()) num_failures++; - if (!test_costly_or_distrib_left()) num_failures++; - if (!test_costly_or_distrib_right()) num_failures++; - - if (!test_costly_or_distrib_nest()) num_failures++; - if (!test_costly_or_distrib_nest2()) num_failures++; - if (!test_costly_or_distrib_nest3()) num_failures++; - if (!test_costly_or_distrib_nest4()) num_failures++; - if (!test_costly_or_distrib_nest5()) num_failures++; - if (!test_costly_or_distrib_nest6()) num_failures++; - if (!test_costly_or_distrib_nest7()) num_failures++; - if (!test_costly_or_distrib_nest8()) num_failures++; - if (!test_costly_or_distrib_nest9()) num_failures++; - return num_failures; -} - -// ================================================================== - -int -main(int argc, char *argv[]) -{ - size_t num_failures = 0; - bool exit_on_fail = true; - - num_failures += ntest_core(); - report(num_failures, exit_on_fail); - - num_failures += ntest_flatten(); - report(num_failures, exit_on_fail); - - num_failures 
+= ntest_disjoin(); - report(num_failures, exit_on_fail); - - num_failures += ntest_costly_disjoin(); - report(num_failures, exit_on_fail); - - exit (0); -} - diff -Nru link-grammar-5.3.16/viterbi/test-env.cc link-grammar-5.5.0/viterbi/test-env.cc --- link-grammar-5.3.16/viterbi/test-env.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/test-env.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,158 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -/// This file provides a unit test for the operation of the viterbi parser. 
- -#include "test-header.h" -#include "environment.h" - -// ================================================================== - -bool test_func() -{ - total_tests++; - - Environment* env = new Environment(); - - Atom* word = ANODE(WORD, "wtf"); - Atom* disj = - ALINK2(AND, - ANODE(CONNECTOR, "Xd-"), - ANODE(CONNECTOR, "MX-") - ); - - env->set_function("con", word, disj); - Atom* got = env->get_function_value("con", word); - - Atom* expected = // same as disj, just different addrs - ALINK2(AND, - ANODE(CONNECTOR, "Xd-"), - ANODE(CONNECTOR, "MX-") - ); - - if (not (expected->operator==(got))) - { - cout << "Error: test failure on test \"test_disjoin_cost\"" << endl; - cout << "=== Expecting:\n" << expected << endl; - cout << "=== Got:\n" << got << endl; - return false; - } - cout << "PASS: test_func" << endl; - return true; -} - -// ================================================================== - -bool test_func_clobber() -{ - total_tests++; - - Environment* env = new Environment(); - - Atom* word = ANODE(WORD, "asdf"); - Atom* disj = - ALINK2(AND, - ANODE(CONNECTOR, "AAA-"), - ANODE(CONNECTOR, "Q-") - ); - - env->set_function("con", word, disj); - - word = ANODE(WORD, "wtf"); - disj = - ALINK2(AND, - ANODE(CONNECTOR, "ZZ-"), - ANODE(CONNECTOR, "WWW-") - ); - - env->set_function("con", word, disj); - - disj = - ALINK2(AND, - ANODE(CONNECTOR, "Xd-"), - ANODE(CONNECTOR, "MX-") - ); - - env->set_function("con", word, disj); - word = ANODE(WORD, "wtf"); - Atom* got = env->get_function_value("con", word); - - Atom* expected = // same as disj, just different addrs - ALINK2(AND, - ANODE(CONNECTOR, "Xd-"), - ANODE(CONNECTOR, "MX-") - ); - - if (not (expected->operator==(got))) - { - cout << "Error: test failure on test \"test_disjoin_cost\"" << endl; - cout << "=== Expecting:\n" << expected << endl; - cout << "=== Got:\n" << got << endl; - return false; - } - cout << "PASS: test_func_clobber" << endl; - return true; -} - - -// 
================================================================== - -bool test_number() -{ - total_tests++; - - Environment* env = new Environment(); - - Atom* word = ANODE(WORD, "asdf"); - env->set_number("f", word, 41.0); - - word = ANODE(WORD, "asdf"); - env->set_number("f", word, 42.0); - - word = ANODE(WORD, "asdf"); - double got = env->get_number("f", word); - - if (not (got == 42.0)) - { - cout << "Error: test failure on test \"test_disjoin_cost\"" << endl; - cout << "=== Expecting: 42.0\n"; - cout << "=== Got:\n" << got << endl; - return false; - } - cout << "PASS: test_number" << endl; - return true; -} - - -int ntest_env() -{ - size_t num_failures = 0; - - if (!test_func()) num_failures++; - if (!test_func_clobber()) num_failures++; - if (!test_number()) num_failures++; - return num_failures; -} - -// ================================================================== - -int -main(int argc, char *argv[]) -{ - size_t num_failures = 0; - bool exit_on_fail = true; - - num_failures += ntest_env(); - report(num_failures, exit_on_fail); - - exit (0); -} - diff -Nru link-grammar-5.3.16/viterbi/test-parser.cc link-grammar-5.5.0/viterbi/test-parser.cc --- link-grammar-5.3.16/viterbi/test-parser.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/test-parser.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,1123 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -/// This file provides a unit test for the operation of the viterbi parser. 
-#include "test-header.h" - -#include -#include - -// ================================================================== -// A simple hello test; several different dictionaries -// should give exactly the same output. The input sentence -// is just one word, it should connect to the left-wall in -// just one way. The result should be just one alternative: -// that alternatives has an empty state, and output with -// just one link. -bool test_hello(const char *id, const char *dict_str, - bool empty_state, bool must_connect) -{ - total_tests++; - - Dictionary dict = dictionary_create_from_utf8(dict_str); - - // print_dictionary_data(dict); - - Parser parser(dict); - parser.streamin("Hello"); - - // This is the expected output, no matter what the - // dictionary may be. Its just one word, connected to the wall. - Lynk* one_word = - ALINK3(LING, - ANODE(LING_TYPE, "Wd"), - ALINK2(WORD_DISJ, - ANODE(WORD, "LEFT-WALL"), - ANODE(CONNECTOR, "Wd+") - ), - ALINK2(WORD_DISJ, - ANODE(WORD, "Hello"), - ANODE(CONNECTOR, "Wd-") - ) - ); - - if (empty_state) - { - // This is the expected set of alternatives: just one alternative, - // a single, empty state, and the output, above. - Lynk* ans = - ALINK1(SET, - ALINK3(STATE_TRIPLE, - ALINK0(SEQ), - ALINK0(SEQ), - ALINK1(SET, one_word) - ) - ); - - Lynk* alts = parser.get_alternatives(); - if (not (ans->operator==(alts))) - { - cout << "Error: test failure on test \"" << id << "\"" << endl; - cout << "=== Expecting:\n" << ans << endl; - cout << "=== Got:\n" << alts << endl; - return false; - } - } - else - { - // This test will have lots of alternatives. One should have - // an empty state. - Lynk* ans = - ALINK3(STATE_TRIPLE, - ALINK0(SEQ), - ALINK0(SEQ), - ALINK1(SET, one_word) - ); - - Lynk* out = new Set(one_word); - - bool pass_test = false; - Lynk* alts = parser.get_alternatives(); - foreach_outgoing(StateTriple*, sp, alts) - { - // At least one alternative should have an empty state. 
- if (ans->operator==(sp)) - pass_test = true; - - // In all cases, the output should be just the one word, - // no matter what the state. - if (must_connect and not sp->get_output()->operator==(out)) - pass_test = false; - } - if (pass_test) - { - cout<<"PASS: test_hello(" << id << ") " << endl; - return true; - } - cout << "Error: test failure on test \"" << id << "\"" << endl; - cout << "=== Expecting:\n" << ans << endl; - cout << "=== Got:\n" << alts << endl; - return false; - } - - cout<<"PASS: test_hello(" << id << ") " << endl; - return true; -} - -bool test_simplest() -{ - return test_hello ("test_simplest", - "LEFT-WALL: Wd+;" - "Hello: Wd-;", - true, true - ); -} - -bool test_simple_left_disj() -{ - return test_hello ("simple left disj", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "Hello: Wd-;", - true, true - ); -} - -bool test_simple_optional_left_cset() -{ - return test_hello ("optional left cset", - "LEFT-WALL: (Wd+ or Wi+ or Wq+) & {CP+} & {Xx+} & {RW+ or Xp+};" - "Hello: Wd-;", - false, true - ); -} - -bool test_simple_right_disj() -{ - return test_hello ("simple right disj", - "LEFT-WALL: Wd+;" - "Hello: Wd- or Wi-;", - true, true - ); -} - -bool test_simple_right_required_cset() -{ - return test_hello ("required right cset", - "LEFT-WALL: Wd+;" - "Hello: Wd- or Xi- or (Xj- & (A+ or B+));", - true, true - ); -} - -bool test_simple_optional() -{ - return test_hello ("optionals in both csets", - "LEFT-WALL: (Wd+ or Wi+ or Wq+) & {CP+} & {Xx+} & {RW+ or Xp+};" - "Hello: Wd- or Xi- or (Xj- & {A+ or B+});", - false, true - ); -} - -bool test_simple_onereq() -{ - return test_hello ("one required link and opt righties (simple)", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "Hello: Wd- & {A+} & {B+} & {C+};", - false, true - ); -} - -bool test_simple_zeroreq() -{ - return test_hello ("zero required links and opt righties (simple)", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "Hello: {Wd-} & {A+} & {B+} & {C+};", - false, false - ); -} - -bool test_simple_onereq_and_left() -{ - 
return test_hello ("one required link and opt lefties (simple)", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "Hello: Wd- & {A-} & {B-} & {C+};", - false, true - ); -} - -int ntest_simple() -{ - size_t num_failures = 0; - - if (!test_simplest()) num_failures++; - if (!test_simple_left_disj()) num_failures++; - if (!test_simple_optional_left_cset()) num_failures++; - if (!test_simple_right_disj()) num_failures++; - if (!test_simple_right_required_cset()) num_failures++; - if (!test_simple_optional()) num_failures++; - if (!test_simple_onereq()) num_failures++; - if (!test_simple_zeroreq()) num_failures++; - if (!test_simple_onereq_and_left()) num_failures++; - return num_failures; -} - -// ================================================================== -// A test of two alternative parses of a sentence with single word in it. -// Expect to get back a set with two alternative parses, each parse is -// assigned a probability of 1/2. - -bool test_alternative(const char *id, const char *dict_str, bool empty_state) -{ - total_tests++; - - Dictionary dict = dictionary_create_from_utf8(dict_str); - - // print_dictionary_data(dict); - - Parser parser(dict); - parser.streamin("Hello"); - - Lynk* alt_out_one = - ALINK3(LING, - ANODE(LING_TYPE, "Wd"), - ALINK2(WORD_DISJ, - ANODE(WORD, "LEFT-WALL"), - ANODE(CONNECTOR, "Wd+") - ), - ALINK2(WORD_DISJ, - ANODE(WORD, "Hello"), - ANODE(CONNECTOR, "Wd-") - ) - ); - - Lynk* alt_out_two = - ALINK3(LING, - ANODE(LING_TYPE, "Wi"), - ALINK2(WORD_DISJ, - ANODE(WORD, "LEFT-WALL"), - ANODE(CONNECTOR, "Wi+") - ), - ALINK2(WORD_DISJ, - ANODE(WORD, "Hello"), - ANODE(CONNECTOR, "Wi-") - ) - ); - - Lynk* alt_pair_one = - ALINK3(STATE_TRIPLE, - ALINK0(SEQ), - ALINK0(SEQ), - ALINK1(SET, alt_out_one) - ); - - Lynk* alt_pair_two = - ALINK3(STATE_TRIPLE, - ALINK0(SEQ), - ALINK0(SEQ), - ALINK1(SET, alt_out_two) - ); - - if (empty_state) - { - // This is the expected set of alternatives: two alternatives, - // each with an empty state, and one of the two 
outputs, above. - Lynk* ans = ALINK2(SET, alt_pair_one, alt_pair_two); - - Lynk* output = parser.get_alternatives(); - if (not (ans->operator==(output))) - { - cout << "Error: test failure on test \"" << id <<"\"" << endl; - cout << "=== Expecting:\n" << ans << endl; - cout << "=== Got:\n" << output << endl; - return false; - } - } - else - { - // The final state here might not be empty. However, both - // of the alternatives should show up somwhere in the output. - - bool found_one = false; - bool found_two = false; - Lynk* alts = parser.get_alternatives(); - foreach_outgoing(StateTriple*, sp, alts) - { - // At least one alternative should have an empty state. - if (alt_pair_one->operator==(sp)) - found_one = true; - - if (alt_pair_two->operator==(sp)) - found_two = true; - } - - // Both should have been found, somewhere. - if (not alt_pair_one or not alt_pair_two) - { - cout << "Error: test failure on test \"" << id << "\"" << endl; - cout << "=== Expecting this alt:\n" << alt_pair_one << endl; - cout << "=== Expecting this alt:\n" << alt_pair_two << endl; - cout << "=== Got:\n" << alts << endl; - return false; - } - } - - cout<<"PASS: test_alternative(" << id << ") " << endl; - return true; -} - -bool test_two_alts() -{ - return test_alternative("two alternatives", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "Hello: Wd- or Wi-;", - true - ); -} - -bool test_two_opts() -{ - return test_alternative("two alts plus opts", - "LEFT-WALL: (Wd+ or Wi+ or Wq+) & {A+};" - "Hello: Wd- or Wi- or (Xj- & {A+ or B+});", - false - ); -} - -bool test_two_one_opts() -{ - return test_alternative("two alt, or one opt", - "LEFT-WALL: (Wd+ or Wi+ or Wq+) & {A+};" - "Hello: Wd- or {Wi-} or (Xj- & {A+ or B+});", - false - ); -} - -bool test_two_all_opts() -{ - return test_alternative("two alts, or all opt", - "LEFT-WALL: (Wd+ or Wi+ or Wq+) & {A+};" - "Hello: {Wd-} or {Wi-} or (Xj- & {A+ or B+});", - false - ); -} - -bool test_two_and_opts() -{ - return test_alternative("two alts, and an opt", 
- "LEFT-WALL: (Wd+ or Wi+ or Wq+) & {A+};" - "Hello: Wd- or (Wi- & {Xj- & {A+ or B+}} & {C+});", - false - ); -} - -bool test_two_and_no_opts() -{ - return test_alternative("two alt, and all opt", - "LEFT-WALL: (Wd+ or Wi+ or Wq+) & {A+};" - "Hello: Wd- or ({Wi-} & {Xj- & {A+ or B+}} & {C+});", - false - ); -} - -bool test_two_and_excess() -{ - return test_alternative("two alt, and excess reqs", - "LEFT-WALL: (Wd+ or Wi+ or Wq+) & {A+};" - "Hello: Wd- or (Wi- & Xj- & {A+ or B+} & {C+}) or Wi-;", - false - ); -} - -int ntest_two() -{ - size_t num_failures = 0; - - if (!test_two_alts()) num_failures++; - if (!test_two_opts()) num_failures++; - if (!test_two_one_opts()) num_failures++; - if (!test_two_all_opts()) num_failures++; - if (!test_two_and_opts()) num_failures++; - if (!test_two_and_no_opts()) num_failures++; - if (!test_two_and_excess()) num_failures++; - - return num_failures; -} - -// ================================================================== - -bool test_simple_state(const char *id, const char *dict_str) -{ - total_tests++; - - Dictionary dict = dictionary_create_from_utf8(dict_str); - - // print_dictionary_data(dict); - - Parser parser(dict); - // Expecting more words to follow, so a non-trivial state. 
- parser.streamin("this"); - - Lynk* ans_state = - ALINK2(SEQ, - ALINK2(WORD_CSET, - ANODE(WORD, "this"), - ANODE(CONNECTOR, "Ss*b+") - ), - ALINK2(WORD_CSET, - ANODE(WORD, "LEFT-WALL"), - ALINK3(OR, - ANODE(CONNECTOR, "Wd+"), - ANODE(CONNECTOR, "Wi+"), - ANODE(CONNECTOR, "Wq+") - ) - ) - ); - - Lynk* ans = - ALINK1(SET, - ALINK3(STATE_TRIPLE, - ALINK0(SEQ), - ans_state, - ALINK0(SET) - ) - ); - - Lynk* state = parser.get_alternatives(); - if (not (ans->operator==(state))) - { - cout << "Error: test failure on test " << id << endl; - cout << "=== Expecting state:\n" << ans << endl; - cout << "=== Got state:\n" << state << endl; - return false; - } - - cout<<"PASS: test_simple_state(" << id << ") " << endl; - return true; -} - -bool test_first_state() -{ - return test_simple_state("first state", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Ss*b+;" - ); -} - -bool test_first_opt_lefty() -{ - return test_simple_state("first state, left-going optional", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Ss*b+ and {Xi-};" - ); -} - -bool test_first_or_lefty() -{ - return test_simple_state("first state, OR left-going", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Ss*b+ or Xi-;" - ); -} - -bool test_first_or_multi_lefty() -{ - return test_simple_state("first state, multi-OR left-going", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Ss*b+ or Xi- or Y- or Z-;" - ); -} - -bool test_first_opt_cpx() -{ - return test_simple_state("first state, complex left-going optional", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Ss*b+ and {Xi- or P- or {Q- & Z+}};" - ); -} - -bool test_first_infer_opt() -{ - return test_simple_state("first state, complex infer optional", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Ss*b+ and (Xi- or P- or {Q- & Z+});" - ); -} - -int ntest_first() -{ - size_t num_failures = 0; - - if (!test_first_state()) num_failures++; - if (!test_first_opt_lefty()) num_failures++; - if (!test_first_or_lefty()) num_failures++; - if (!test_first_or_multi_lefty()) num_failures++; - if 
(!test_first_opt_cpx()) num_failures++; - if (!test_first_infer_opt()) num_failures++; - - return num_failures; -} - -// ================================================================== - -bool test_short_sent(const char *id, const char *dict_str, bool empty_state) -{ - total_tests++; - - Dictionary dict = dictionary_create_from_utf8(dict_str); - // print_dictionary_data(dict); - - Parser parser(dict); - - // Expecting more words to follow, so a non-trivial state. - // In particular, the dictionary will link the left-wall to - // "is", so "this" has to be pushed on stack until the "is" - // shows up. The test_seq_sent() below will link the other - // way around. - parser.streamin("this is"); - - Lynk* alts = parser.get_alternatives(); - - // At least one result should be this state pair. - Lynk* sp = - ALINK3(STATE_TRIPLE, - ALINK0(SEQ), // empty input - ALINK0(SEQ), // empty state - ALINK2(SET, - ALINK3(LING, - ANODE(LING_TYPE, "Ss*b"), - ALINK2(WORD_DISJ, - ANODE(WORD, "this"), - ANODE(CONNECTOR, "Ss*b+")), - ALINK2(WORD_DISJ, - ANODE(WORD, "is.v"), - ANODE(CONNECTOR, "Ss-"))) - , - ALINK3(LING, - ANODE(LING_TYPE, "Wi"), - ALINK2(WORD_DISJ, - ANODE(WORD, "LEFT-WALL"), - ANODE(CONNECTOR, "Wi+")), - ALINK2(WORD_DISJ, - ANODE(WORD, "is.v"), - ANODE(CONNECTOR, "Wi-"))) - )); - - if (empty_state) - { - Lynk* ans = ALINK1(SET, sp); - if (not (ans->operator==(alts))) - { - cout << "Error: test failure on test \"" << id <<"\"" << endl; - cout << "=== Expecting:\n" << ans << endl; - cout << "=== Got:\n" << alts << endl; - return false; - } - } - else - { - // At least one alternative should be the desired state pair. 
- bool found = false; - foreach_outgoing(Atom*, a, alts) - { - if (sp->operator==(a)) - found = true; - } - if (not found) - { - cout << "Error: test failure on test \"" << id <<"\"" << endl; - cout << "=== Expecting one of them to be:\n" << sp << endl; - cout << "=== Got:\n" << alts << endl; - return false; - } - } - - cout<<"PASS: test_short_sent(" << id << ") " << endl; - return true; -} - -bool test_short_this() -{ - return test_short_sent("short sent", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Ss*b+;" - "is.v: Ss- and Wi-;", - true - ); -} - -bool test_short_this_opt() -{ - return test_short_sent("short sent opt", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Ss*b+;" - "is.v: Ss- and Wi- and {O+};", - false - ); -} - -bool test_short_this_obj_opt() -{ - return test_short_sent("short sent with obj", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Ss*b+ or Os-;" - "is.v: Ss- and Wi- and {O+};", - false - ); -} - -bool test_short_this_costly() -{ - return test_short_sent("short sent with costly null", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Ss*b+ or [[[()]]];" - "is.v: Ss- and Wi-;", - true - ); -} - -bool test_short_this_complex() -{ - return test_short_sent("short sent complex", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "" - ": {({@COd-} & (C-)) or ({@CO-} & (Wd- & {CC+})) or [Rn-]};" - ":" - " (Jd- & Dmu- & Os-)" - " or (Jd- & Dmu- & {Wd-} & Ss*b+)" - " or (Ss*b+ & ) or SIs*b- or [[Js-]] or [Os-];" - "" - "this:" - " ;" - "" - "is.v: Ss- and Wi- and {O+};", - false - ); -} - -bool test_short_this_noun_dict() -{ - return test_short_sent("short sent realistic dict entry for noun", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - ": [[[()]]];" - "" - ":" - " ({[B*j+]} & Xd- & (Xc+ or ) & MX-);" - "" - ": RJrc- or RJlc+;" - "" - ": {({@COd-} & (C- or )) or ({@CO-} & (Wd- & {CC+})) or [Rn-]};" - "" - ":" - " (Jd- & Dmu- & Os-)" - " or (Jd- & Dmu- & {Wd-} & Ss*b+)" - " or (Ss*b+ & ) or SIs*b- or [[Js-]] or [Os-]" - " or " - " or ;" - "" - "this:" - " ;" - "" - "is.v: Ss- and Wi- and 
{O+};", - false - ); -} - -int ntest_short() -{ - size_t num_failures = 0; - - if (!test_short_this()) num_failures++; - if (!test_short_this_opt()) num_failures++; - if (!test_short_this_obj_opt()) num_failures++; - if (!test_short_this_costly()) num_failures++; - if (!test_short_this_complex()) num_failures++; - if (!test_short_this_noun_dict()) num_failures++; - - return num_failures; -} - -// ================================================================== - -bool test_seq_sent(const char *id, const char *dict_str, bool empty_state) -{ - total_tests++; - - Dictionary dict = dictionary_create_from_utf8(dict_str); - - // print_dictionary_data(dict); - - Parser parser(dict); - // Expecting more words to follow, so a non-trivial state. - // Unlike test_short_sent() above, here, we link "this" to - // the left wall, followed by "is" to for a sequence. - parser.streamin("this is"); - - Lynk* alts = parser.get_alternatives(); - - // At least one result should be this state pair. - Lynk* sp = - ALINK3(STATE_TRIPLE, - ALINK0(SEQ), // empty input - ALINK0(SEQ), // empty state - ALINK2(SET, - ALINK3(LING, - ANODE(LING_TYPE, "Wd"), - ALINK2(WORD_DISJ, - ANODE(WORD, "LEFT-WALL"), - ANODE(CONNECTOR, "Wd+")), - ALINK2(WORD_DISJ, - ANODE(WORD, "this"), - ANODE(CONNECTOR, "Wd-"))), - ALINK3(LING, - ANODE(LING_TYPE, "Ss*b"), - ALINK2(WORD_DISJ, - ANODE(WORD, "this"), - ANODE(CONNECTOR, "Ss*b+")), - ALINK2(WORD_DISJ, - ANODE(WORD, "is.v"), - ANODE(CONNECTOR, "Ss-"))))); - - if (empty_state) - { - Lynk* ans = ALINK1(SET, sp); - if (not (ans->operator==(alts))) - { - cout << "Error: test failure on test \"" << id <<"\"" << endl; - cout << "=== Expecting:\n" << ans << endl; - cout << "=== Got:\n" << alts << endl; - return false; - } - } - else - { - // At least one alternative should be the desired state pair. 
- bool found = false; - foreach_outgoing(Atom*, a, alts) - { - if (sp->operator==(a)) - found = true; - } - if (not found) - { - cout << "Error: test failure on test \"" << id <<"\"" << endl; - cout << "=== Expecting one of them to be:\n" << sp << endl; - cout << "=== Got:\n" << alts << endl; - return false; - } - } - - cout<<"PASS: test_short_sent(" << id << ") " << endl; - return true; -} - -bool test_seq_this() -{ - return test_seq_sent("short seq sent", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Wd- and Ss*b+;" - "is.v: Ss-;", - true - ); -} - -bool test_seq_this_opt() -{ - return test_seq_sent("short seq sent opt", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Wd- and Ss*b+;" - "is.v: Ss- and {O+};", - false - ); -} - -bool test_seq_this_obj_opt() -{ - return test_seq_sent("short seq sent with obj", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Wd- and (Ss*b+ or Os-);" - "is.v: Ss- and {O+};", - false - ); -} - -bool test_seq_this_costly() -{ - return test_seq_sent("short seq sent with costly null", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "this: Wd- and (Ss*b+ or [[[()]]]);" - "is.v: Ss-;", - true - ); -} - -bool test_seq_this_complex() -{ - return test_seq_sent("short seq sent complex", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - "" - ": {({@COd-} & (C-)) or ({@CO-} & (Wd- & {CC+})) or [Rn-]};" - ":" - " (Jd- & Dmu- & Os-)" - " or (Jd- & Dmu- & {Wd-} & Ss*b+)" - " or (Ss*b+ & ) or SIs*b- or [[Js-]] or [Os-];" - "" - "this:" - " ;" - "" - "is.v: Ss- and {O+};", - false - ); -} - -bool test_seq_this_noun_dict() -{ - return test_seq_sent("short seq sent realistic dict entry for noun", - "LEFT-WALL: Wd+ or Wi+ or Wq+;" - ": [[[()]]];" - "" - ":" - " ({[B*j+]} & Xd- & (Xc+ or ) & MX-);" - "" - ": RJrc- or RJlc+;" - "" - ": {({@COd-} & (C- or )) or ({@CO-} & (Wd- & {CC+})) or [Rn-]};" - "" - ":" - " (Jd- & Dmu- & Os-)" - " or (Jd- & Dmu- & {Wd-} & Ss*b+)" - " or (Ss*b+ & ) or SIs*b- or [[Js-]] or [Os-]" - " or " - " or ;" - "" - "this:" - " ;" - "" - "is.v: Ss- and {O+};", - false - 
); -} - - -bool test_seq_this_verb_dict() -{ - return test_seq_sent("short seq sent realistic dict entry for verb", - "LEFT-WALL:" - " (Wd+ or Wq+ or Ws+ or Wj+ or Wc+ or Wi+ or We+ or Qd+)" - " & {CP+} & {Xx+} & {RW+ or Xp+};" - "" - ": [[[()]]];" - "" - ":" - " ({[B*j+]} & Xd- & (Xc+ or ) & MX-);" - "" - ": RJrc- or RJlc+;" - "" - ": {({@COd-} & (C- or )) or ({@CO-} & (Wd- & {CC+})) or [Rn-]};" - "" - ":" - " (Jd- & Dmu- & Os-)" - " or (Jd- & Dmu- & {Wd-} & Ss*b+)" - " or (Ss*b+ & ) or SIs*b- or [[Js-]] or [Os-]" - " or " - " or ;" - "" - "this:" - " ;" - "" - ": {@E-} & VJrs-;" - ": {@E-} & VJls+;" - "" - ": {@E-} & (Ss- or SFs- or SFu- or (RS- & Bs-));" - "" - ":" - " {@EBm+} & O*t+ & {@MV+};" - "" - ":" - " ({@EBm+} & ((([B**t-] or [K+] or BI+ or OF+ or PF- or" - " (Osi+ & R+ & Bs+) or" - " (Opi+ & R+ & Bp+) or" - " [[()]]) & {@MV+}) or" - " (Pp+ & {THi+ or @MV+}) or" - " THb+ or" - " TO+ or" - " Pa+)) or" - " ({N+} & (AF- or Pv+ or I*v+)) or" - " (({N+} or {Pp+}) & Pg*b+);" - "" - ": or ;" - "" - "is.v:" - " ( & ) or" - " ( & ) or ( & ) or" - " (((Rw- or ({Ic-} & Q-) or [()]) & (SIs+ or SFIs+)) & );" - "", - false - ); -} - -int ntest_short_seq() -{ - size_t num_failures = 0; - - if (!test_seq_this()) num_failures++; - if (!test_seq_this_opt()) num_failures++; - if (!test_seq_this_obj_opt()) num_failures++; - if (!test_seq_this_costly()) num_failures++; - if (!test_seq_this_complex()) num_failures++; - if (!test_seq_this_noun_dict()) num_failures++; - if (!test_seq_this_verb_dict()) num_failures++; - - return num_failures; -} - -// ================================================================== - -bool test_state_sent(const char *id, const char *dict_str) -{ - total_tests++; - - Dictionary dict = dictionary_create_from_utf8(dict_str); - - // print_dictionary_data(dict); - - Parser parser(dict); - // Expecting more words to follow, so a non-trivial state. 
- parser.streamin("this is a test"); - - Lynk* alts = parser.get_alternatives(); - - // We expect no output, and a crazy state: - // The provided dictionary will not allow a linkage to happen; - // this is really just testing the push of stack state. - Lynk* sp = - ALINK3(STATE_TRIPLE, - ALINK0(SEQ), // empyt input - ALINK5(SEQ, - ALINK2(WORD_CSET, - ANODE(WORD, "test.n"), - ALINK2(OR, - ANODE(CONNECTOR, "XXXGIVEN+"), - ANODE(CONNECTOR, "AN+"))) - , - ALINK2(WORD_CSET, - ANODE(WORD, "a"), - ANODE(CONNECTOR, "Ds+")) - , - ALINK2(WORD_CSET, - ANODE(WORD, "is.v"), - ANODE(CONNECTOR, "SIs+")) - , - ALINK2(WORD_CSET, - ANODE(WORD, "this.J2"), - ANODE(CONNECTOR, "JDBKQ+")) - , - ALINK2(WORD_CSET, - ANODE(WORD, "LEFT-WALL"), - ANODE(CONNECTOR, "Wq+")) - ), - ALINK0(SET)); // empty output - - Lynk* ans = ALINK1(SET, sp); - if (not (ans->operator==(alts))) - { - cout << "Error: test failure on test \"" << id <<"\"" << endl; - cout << "=== Expecting:\n" << ans << endl; - cout << "=== Got:\n" << alts << endl; - return false; - } - cout<<"PASS: test_state_sent(" << id << ") " << endl; - return true; -} - -bool test_state_order() -{ - return test_state_sent("short state sent", - "LEFT-WALL: Wq+;" - "this.J2: JDBKQ+;" - "is.v: SIs+;" - "a: Ds+;" - "test.n: XXXGIVEN+ or AN+;" - ); -} - -bool test_state_order_left() -{ - return test_state_sent("short state sent leftwards", - "LEFT-WALL: Wq+;" - "this.J2: JDBKQ+ or JAAA-;" - "is.v: SIs+ or KBB-;" - "a: Ds+ & {Junk-} ;" - "test.n: XXXGIVEN+ or BOGUS- or (AN+ & {GLOP-});" - ); -} - -int ntest_short_state() -{ - size_t num_failures = 0; - - if (!test_state_order()) num_failures++; - if (!test_state_order_left()) num_failures++; - return num_failures; -} - -// ================================================================== - -bool test_right_wall(const char *id, const char *dict_str, bool empty_state) -{ - total_tests++; - - Dictionary dict = dictionary_create_from_utf8(dict_str); - - // print_dictionary_data(dict); - Parser 
parser(dict); - // Expecting more words to follow, so a non-trivial state. - parser.streamin("this is ."); - - Lynk* alts = parser.get_alternatives(); - - // We expect empty final state. - Lynk* sp = - ALINK3(STATE_TRIPLE, - ALINK0(SEQ), // empty input - ALINK0(SEQ), // empty state - ALINK3(SET, - ALINK3(LING, - ANODE(LING_TYPE, "Wd"), - ALINK2(WORD_DISJ, - ANODE(WORD, "LEFT-WALL"), - ANODE(CONNECTOR, "Wd+")), - ALINK2(WORD_DISJ, - ANODE(WORD, "this"), - ANODE(CONNECTOR, "Wd-"))) - , - ALINK3(LING, - ANODE(LING_TYPE, "Ss*b"), - ALINK2(WORD_DISJ, - ANODE(WORD, "this"), - ANODE(CONNECTOR, "Ss*b+")), - ALINK2(WORD_DISJ, - ANODE(WORD, "is.v"), - ANODE(CONNECTOR, "Ss-"))) - , - ALINK3(LING, - ANODE(LING_TYPE, "Xp"), - ALINK2(WORD_DISJ, - ANODE(WORD, "LEFT-WALL"), - ANODE(CONNECTOR, "Xp+")), - ALINK2(WORD_DISJ, - ANODE(WORD, "."), - ANODE(CONNECTOR, "Xp-"))) - )); - - - Lynk* ans = ALINK1(SET, sp); - if (not (ans->operator==(alts))) - { - cout << "Error: test failure on test \"" << id <<"\"" << endl; - cout << "=== Expecting:\n" << ans << endl; - cout << "=== Got:\n" << alts << endl; - return false; - } - cout<<"PASS: test_right_wall(" << id << ") " << endl; - return true; -} - -bool test_period() -{ - return test_right_wall("period", - "LEFT-WALL: (Wd+ or Wi+ or Wq+) & {Xp+};" - "this: Wd- and Ss*b+;" - "is.v: Ss-;" - "\".\": Xp-;", - false - ); -} - -int ntest_right_wall() -{ - size_t num_failures = 0; - - if (!test_period()) num_failures++; - return num_failures; -} - -// ================================================================== - -int -main(int argc, char *argv[]) -{ - size_t num_failures = 0; - bool exit_on_fail = true; - - num_failures += ntest_simple(); - report(num_failures, exit_on_fail); - - num_failures += ntest_two(); - report(num_failures, exit_on_fail); - - num_failures += ntest_first(); - report(num_failures, exit_on_fail); - - num_failures += ntest_short(); - report(num_failures, exit_on_fail); - - num_failures += ntest_short_seq(); - 
report(num_failures, exit_on_fail); - - num_failures += ntest_short_state(); - report(num_failures, exit_on_fail); - - num_failures += ntest_right_wall(); - report(num_failures, exit_on_fail); - - exit (0); -} - diff -Nru link-grammar-5.3.16/viterbi/upcast.cc link-grammar-5.5.0/viterbi/upcast.cc --- link-grammar-5.3.16/viterbi/upcast.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/upcast.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,129 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include - -#include "compile.h" -#include "compile-base.h" -#include "utilities.h" // needed for assert - -using namespace link_grammar::viterbi; - -namespace atombase { - -// ============================================================ - -Atom* Atom::upcaster() -{ - if (!this) return this; - - const Node* n = dynamic_cast(this); - const Link* l = dynamic_cast(this); - - switch (get_type()) - { - // Links - case AND: - if (dynamic_cast(this)) return this; - return new And(l->get_outgoing_set(), _tv); - case OR: - if (dynamic_cast(this)) return this; - return new Or(l->get_outgoing_set(), _tv); - case SEQ: - if (dynamic_cast(this)) return this; - return new Seq(l->get_outgoing_set(), _tv); - case SET: - if (dynamic_cast(this)) return this; - return new Set(l->get_outgoing_set(), _tv); - - // Nodes - case NUMBER: - if (dynamic_cast(this)) return this; - return new Number(atof(n->get_name().c_str()), _tv); - - case CONNECTOR: - if (dynamic_cast(this)) return this; - return 
new Connector(n->get_name(), _tv); - - case WORD: - if (dynamic_cast(this)) return this; - return new Word(n->get_name(), _tv); - - default: - assert(0, "Atom::upcaster(): implement me!"); - } -} - -// ============================================================ - -Link* Link::append(Atom* a) const -{ - OutList ol = get_outgoing_set(); - ol.push_back(a); - - switch (get_type()) - { - // Links - case AND: - return new And(ol, _tv); - case OR: - return new Or(ol, _tv); - case SEQ: - return new Seq(ol, _tv); - case SET: - return new Set(ol, _tv); - - default: - std::cerr << "unhandled atom type "<< type_name(get_type()) << std::endl; - assert(0, "atombase::Link::append: implement me!"); - } -} - -// ============================================================ - -/// Replace stary with nowy. -Link* Link::replace(Atom* novi, Atom* ctari) const -{ - OutList ol = get_outgoing_set(); - - // Loo for stary, replace with nowy if found. - size_t sz = ol.size(); - size_t i = 0; - for (; i */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. 
*/ -/* */ -/*************************************************************************/ - -#include "../link-grammar/link-features.h" - -LINK_BEGIN_DECLS -void viterbi_parse(const char * sentence, Dictionary dict); -LINK_END_DECLS - diff -Nru link-grammar-5.3.16/viterbi/word-monad.cc link-grammar-5.5.0/viterbi/word-monad.cc --- link-grammar-5.3.16/viterbi/word-monad.cc 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/word-monad.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,320 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. */ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#include - -#include -#include -#include - -#include "utilities.h" // From base link-grammar - -#include "atom.h" -#include "compress.h" -#include "connect.h" -#include "word-monad.h" - -using namespace std; - -// #define DBG(X) X; -#define DBG(X) - -namespace link_grammar { -namespace viterbi { - -/** - * This class is vaguely monad-like. The constructor takes some - * arguments. It returns a "function" called operator() which - * acts on state, and returns different state. - * - * The "core" function wrapped by the monad is reassemble(). - * Everything else is a 'wrapper' around this, handling the various - * special cases that appear in the state. - * - * It is both a C++ functor (because it defines operator()) and it is - * also a hypergraph functor (because it maps one hypergraph into - * another). It has an adjoint functor, which can look at the parse - * state, and rip out the right-most word, to restore the original - * state. 
(This adjoint is not currently implemented; its not - * obviously useful. A different functor, which looks at state, and - * generates a string of words, left-to-right, might be useful for - * speech generation.) - */ - -/** - * constructor: argument is a connector set (typically, for a single - * word) that will be used to try connections. This connector set is - * assumed to be to the right of the argument to the try_connect() - * method. - * - * To be precise, the right_wconset is presumed to be of the form: - * WORD_CSET : - * WORD : jabberwoky.n - * OR : - * CONNECTOR : Wd- - * CONNECTOR : Abc+ etc... - * - * In particular, it is assumed to be in DNF (disjunctive normal form). - */ -WordMonad::WordMonad(WordCset* right_wconset) - : _right_cset(right_wconset) -{ - assert(_right_cset, "Unexpected NULL dictionary entry!"); - DBG(cout << "--------------- WordMonad ctor right=\n" << _right_cset << endl); -} - -/// Unite two states into one. -/// -/// This is needed to implement zippering properly: The old set of -/// states is the collection of as-yet unconnected connectors; the -/// new set of states is the collection that remains after connecting. -/// We have to peel off and discard some certain number of the old -/// states (as these are now connected), and append in their place -/// the new states. We also typically peel off one new one, as that -/// one will be used for trying new onnections. -static StateTriple* unite(StateTriple* old_sp, size_t old_peel_off, - StateTriple* new_sp, size_t new_peel_off) -{ - OutList united_states; - Seq* old_state = old_sp->get_state(); - Seq* new_state = new_sp->get_state(); - - // The no-links-cross rule means that the first state with - // unconnected right-going links must necessarily be at the - // head of the state vector. That is, new comes before old. 
- const OutList& no = new_state->get_outgoing_set(); - united_states.insert(united_states.end(), - no.begin() + new_peel_off, no.end()); - - const OutList& oo = old_state->get_outgoing_set(); - united_states.insert(united_states.end(), - oo.begin() + old_peel_off, oo.end()); - - // Unite the outputs too ... - // This is easy, just concatenate old and append new. - OutList united_outputs; - Set* old_output = old_sp->get_output(); - Set* new_output = new_sp->get_output(); - - // I don't think the output order matters much, but appending - // new output to old seems reasonable. - const OutList& ooo = old_output->get_outgoing_set(); - united_outputs.insert(united_outputs.end(), ooo.begin(), ooo.end()); - - const OutList& noo = new_output->get_outgoing_set(); - united_outputs.insert(united_outputs.end(), noo.begin(), noo.end()); - -// XXX clearly failing to deal with input ... - return new StateTriple(new Seq(), new Seq(united_states), new Set(united_outputs)); -} - -/** - * Try connecting this connector set, from the left, to the right cset. - * This returns a set of alternative connections, as state pairs: each - * alternative will consist of new state, and the links that were issued. - */ -static Set* next_connect(WordCset* left_cset, WordCset* right_cset) -{ - assert(left_cset, "State word-connectorset is null"); - Atom* left_a = left_cset->get_cset(); - - DBG(cout << "Enter next_connect(), left cset " << left_a << endl); - - // Wrap bare connector with OR; this simplifie the nested loop below. 
- Or* left_dnf = NULL; - AtomType lt = left_a->get_type(); - if (CONNECTOR == lt or AND == lt) - left_dnf = new Or(left_a); - else - left_dnf = upcast(left_a); - assert(left_dnf != NULL, "Left disjuncts not in DNF"); - - - Atom *right_a = right_cset->get_cset(); - DBG(cout << "in next_connect(), right cset "<< right_a <get_type(); - if (CONNECTOR == rt or AND == rt) - right_dnf = new Or(right_a); - else - right_dnf = upcast(right_a); - assert(right_dnf != NULL, "Right disjuncts not in DNF"); - - - // At this point, both left_dnf and right_dnf should be in - // disjunctive normal form. Perform a nested loop over each - // of them, connecting each to each. - - // "alternatives" records the various different successful ways - // that connectors can be mated. Its a list of state pairs. - OutList alternatives; - foreach_outgoing(Atom*, ldj, left_dnf) - { - foreach_outgoing(Atom*, rdj, right_dnf) - { - Connect cnct(left_cset, right_cset); - StateTriple* sp = cnct.try_alternative(ldj, rdj); - if (sp) - alternatives.push_back(sp); - } - } - - return compress_alternatives(new Set(alternatives)); -} - -/** - * Try connecting this connector set sequence, from the left, to what - * was passed in ctor. It is preseumed that left_sp is a single parse - * state: it should contain no left-pointing connectors whatsoever. This - * routine will attempt to attach the right-pointing connectors to the - * left-pointers passed in the ctor. A connection is considered to be - * successful if *all* left-pointers in the ctor were attached (except - * possibly for optionals). The returned value is a set of all possible - * alternative state pairs for which there is a connnection. - * - * WordMonadors must be satisfied sequentially: that is, the first connector - * set in the sequence must be fully satisfied before a connection is made - * to the second one in the sequence, etc. (counting from zero). 
- */ -static Set* try_connect_one(StateTriple* left_sp, WordCset* right_cset) -{ - // Zipper up the zipper. - // The left state can the thought of as a strand of zipper teeth, - // or half of a strand of DNA, if you wish. Each of the zipper teeth - // are right-pointing connectors. These must be mated with the - // left-pointing connectors from the right cset. The reason its - // zipper-like is that the right cset might connect to multiple csets - // on the left. The connectins must be made in order, and so we loop - // through the left state, first trying to satisfy all connectors in - // the first cset, then the second, and so on, until all of the - // left-pointing connectors in the right cset have been connected, - // or they're optional, or there is a failure to connect. - Seq* left_state = left_sp->get_state(); - Atom* a = left_state->get_outgoing_atom(0); - WordCset* lwc = dynamic_cast(a); - Set* alternatives = next_connect(lwc, right_cset); - - size_t lsz = left_state->get_arity(); - size_t lnext = 1; - - // OK, so do any of the alternatives include state with - // left-pointing connectors? If not, then we are done. If so, - // then these need to be mated to the next word cset on the left. - // If they can't be mated, then fail, and we are done. - OutList filtered_alts; - foreach_outgoing(StateTriple*, new_sp, alternatives) - { - Seq* new_state = new_sp->get_state(); - - if (0 < new_state->get_arity()) - { - Atom* a = new_state->get_outgoing_atom(0); - WordCset* new_cset = dynamic_cast(a); - if (new_cset->has_left_pointers()) - { - // The left-pointers are either mandatory or optional. - // If they're mandatory and there is no state to zipper with, - // then its a parse fail. Otherwise recurse. -// XXX check for optional... 
- if (lnext < lsz) - { - // Atom* a = left_state->get_outgoing_atom(lnext); - -// cout <<"rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrecurse"<get_state(), new Set()); - DBG(cout << "United states:" << united_sp << endl); - Set* new_alts = try_connect_one(usp, new_cset); -// cout << "woot got this:" << new_alts<get_state(); - if (0 == state->get_arity()) - { - // If there is nothing to connect to, then we presume that we're - // dealing with a sentence boundary. Seentence boundaries cannot - // have any left-pointers in them. - if (_right_cset->has_left_pointers()) - continue; - Set* next_alts = new Set(new StateTriple(new Seq(), - new Seq(_right_cset), new Set())); - new_alts = new_alts->sum(next_alts); - continue; - } - // Each state sequence consists of a sequence of right-pointing - // links. These must be sequentially satisfied: This is the - // viterbi equivalent of "planar graph" or "no-crossing-links" - // in the classical link-grammar parser. That is, a new word - // must link to the first sequence element that has dangling - // right-pointing connectors. - Set* next_alts = try_connect_one(sp, _right_cset); - new_alts = new_alts->sum(next_alts); - } - - // set_clean_state(new_state_set); - return new_alts; -} - -} // namespace viterbi -} // namespace link-grammar diff -Nru link-grammar-5.3.16/viterbi/word-monad.h link-grammar-5.5.0/viterbi/word-monad.h --- link-grammar-5.3.16/viterbi/word-monad.h 2014-11-15 21:14:03.000000000 +0000 +++ link-grammar-5.5.0/viterbi/word-monad.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,36 +0,0 @@ -/*************************************************************************/ -/* Copyright (c) 2012, 2013 Linas Vepstas */ -/* All rights reserved */ -/* */ -/* Use of the Viterbi parsing system is subject to the terms of the */ -/* license set forth in the LICENSE file included with this software. 
*/ -/* This license allows free redistribution and use in source and binary */ -/* forms, with or without modification, subject to certain conditions. */ -/* */ -/*************************************************************************/ - -#ifndef _LG_VITERBI_WORD_MONAD_H -#define _LG_VITERBI_WORD_MONAD_H - -#include "atom.h" -#include "compile.h" -#include "garbage.h" - -namespace link_grammar { -namespace viterbi { - -class WordMonad : public gc -{ - public: - WordMonad(WordCset*); - Set* operator()(Set*); - - private: - WordCset* _right_cset; -}; - - -} // namespace viterbi -} // namespace link-grammar - -#endif // _LG_VITERBI_WORD_MONAD_H diff -Nru link-grammar-5.3.16/ylwrap link-grammar-5.5.0/ylwrap --- link-grammar-5.3.16/ylwrap 1970-01-01 00:00:00.000000000 +0000 +++ link-grammar-5.5.0/ylwrap 2017-05-27 12:05:34.000000000 +0000 @@ -0,0 +1,247 @@ +#! /bin/sh +# ylwrap - wrapper for lex/yacc invocations. + +scriptversion=2013-01-12.17; # UTC + +# Copyright (C) 1996-2014 Free Software Foundation, Inc. +# +# Written by Tom Tromey . +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. 
+ +# This file is maintained in Automake, please report +# bugs to or send patches to +# . + +get_dirname () +{ + case $1 in + */*|*\\*) printf '%s\n' "$1" | sed -e 's|\([\\/]\)[^\\/]*$|\1|';; + # Otherwise, we want the empty string (not "."). + esac +} + +# guard FILE +# ---------- +# The CPP macro used to guard inclusion of FILE. +guard () +{ + printf '%s\n' "$1" \ + | sed \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ + -e 's/[^ABCDEFGHIJKLMNOPQRSTUVWXYZ]/_/g' \ + -e 's/__*/_/g' +} + +# quote_for_sed [STRING] +# ---------------------- +# Return STRING (or stdin) quoted to be used as a sed pattern. +quote_for_sed () +{ + case $# in + 0) cat;; + 1) printf '%s\n' "$1";; + esac \ + | sed -e 's|[][\\.*]|\\&|g' +} + +case "$1" in + '') + echo "$0: No files given. Try '$0 --help' for more information." 1>&2 + exit 1 + ;; + --basedir) + basedir=$2 + shift 2 + ;; + -h|--h*) + cat <<\EOF +Usage: ylwrap [--help|--version] INPUT [OUTPUT DESIRED]... -- PROGRAM [ARGS]... + +Wrapper for lex/yacc invocations, renaming files as desired. + + INPUT is the input file + OUTPUT is one file PROG generates + DESIRED is the file we actually want instead of OUTPUT + PROGRAM is program to run + ARGS are passed to PROG + +Any number of OUTPUT,DESIRED pairs may be used. + +Report bugs to . +EOF + exit $? + ;; + -v|--v*) + echo "ylwrap $scriptversion" + exit $? + ;; +esac + + +# The input. +input=$1 +shift +# We'll later need for a correct munging of "#line" directives. +input_sub_rx=`get_dirname "$input" | quote_for_sed` +case $input in + [\\/]* | ?:[\\/]*) + # Absolute path; do nothing. + ;; + *) + # Relative path. Make it absolute. + input=`pwd`/$input + ;; +esac +input_rx=`get_dirname "$input" | quote_for_sed` + +# Since DOS filename conventions don't allow two dots, +# the DOS version of Bison writes out y_tab.c instead of y.tab.c +# and y_tab.h instead of y.tab.h. Test to see if this is the case. 
+y_tab_nodot=false +if test -f y_tab.c || test -f y_tab.h; then + y_tab_nodot=true +fi + +# The parser itself, the first file, is the destination of the .y.c +# rule in the Makefile. +parser=$1 + +# A sed program to s/FROM/TO/g for all the FROM/TO so that, for +# instance, we rename #include "y.tab.h" into #include "parse.h" +# during the conversion from y.tab.c to parse.c. +sed_fix_filenames= + +# Also rename header guards, as Bison 2.7 for instance uses its header +# guard in its implementation file. +sed_fix_header_guards= + +while test $# -ne 0; do + if test x"$1" = x"--"; then + shift + break + fi + from=$1 + # Handle y_tab.c and y_tab.h output by DOS + if $y_tab_nodot; then + case $from in + "y.tab.c") from=y_tab.c;; + "y.tab.h") from=y_tab.h;; + esac + fi + shift + to=$1 + shift + sed_fix_filenames="${sed_fix_filenames}s|"`quote_for_sed "$from"`"|$to|g;" + sed_fix_header_guards="${sed_fix_header_guards}s|"`guard "$from"`"|"`guard "$to"`"|g;" +done + +# The program to run. +prog=$1 +shift +# Make any relative path in $prog absolute. +case $prog in + [\\/]* | ?:[\\/]*) ;; + *[\\/]*) prog=`pwd`/$prog ;; +esac + +dirname=ylwrap$$ +do_exit="cd '`pwd`' && rm -rf $dirname > /dev/null 2>&1;"' (exit $ret); exit $ret' +trap "ret=129; $do_exit" 1 +trap "ret=130; $do_exit" 2 +trap "ret=141; $do_exit" 13 +trap "ret=143; $do_exit" 15 +mkdir $dirname || exit 1 + +cd $dirname + +case $# in + 0) "$prog" "$input" ;; + *) "$prog" "$@" "$input" ;; +esac +ret=$? + +if test $ret -eq 0; then + for from in * + do + to=`printf '%s\n' "$from" | sed "$sed_fix_filenames"` + if test -f "$from"; then + # If $2 is an absolute path name, then just use that, + # otherwise prepend '../'. + case $to in + [\\/]* | ?:[\\/]*) target=$to;; + *) target=../$to;; + esac + + # Do not overwrite unchanged header files to avoid useless + # recompilations. Always update the parser itself: it is the + # destination of the .y.c rule in the Makefile. 
Divert the + # output of all other files to a temporary file so we can + # compare them to existing versions. + if test $from != $parser; then + realtarget=$target + target=tmp-`printf '%s\n' "$target" | sed 's|.*[\\/]||g'` + fi + + # Munge "#line" or "#" directives. Don't let the resulting + # debug information point at an absolute srcdir. Use the real + # output file name, not yy.lex.c for instance. Adjust the + # include guards too. + sed -e "/^#/!b" \ + -e "s|$input_rx|$input_sub_rx|" \ + -e "$sed_fix_filenames" \ + -e "$sed_fix_header_guards" \ + "$from" >"$target" || ret=$? + + # Check whether files must be updated. + if test "$from" != "$parser"; then + if test -f "$realtarget" && cmp -s "$realtarget" "$target"; then + echo "$to is unchanged" + rm -f "$target" + else + echo "updating $to" + mv -f "$target" "$realtarget" + fi + fi + else + # A missing file is only an error for the parser. This is a + # blatant hack to let us support using "yacc -d". If -d is not + # specified, don't fail when the header file is "missing". + if test "$from" = "$parser"; then + ret=1 + fi + fi + done +fi + +# Remove the directory. +cd .. +rm -rf $dirname + +exit $ret + +# Local Variables: +# mode: shell-script +# sh-indentation: 2 +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: