diff -Nru apertium-3.5.1/apertium/apertium.1 apertium-3.5.2/apertium/apertium.1
--- apertium-3.5.1/apertium/apertium.1 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/apertium.1 2018-09-11 10:38:25.000000000 +0000
@@ -10,10 +10,10 @@
.B apertium
[\-d datadir] [\-f format] [\-u] [\-a] {language-pair} [infile [outfile]]
.SH DESCRIPTION
-.BR apertium
+.BR apertium
is the application that most people will be using as it simplifies the
use of apertium/lt-toolbox tools for machine translation
-purposes.
+purposes.
.PP
This tool tries to ease the use of \fIlt-toolbox\fR (which contains
all the lexical processing modules and tools) and \fIapertium\fR
@@ -90,11 +90,11 @@
.B -m memory.tmx
use a translation memory to recycle translations
.PP
-.B -o direction
+.B -o direction
translation direction using the translation memory, by default 'direction' is used instead
.PP
.B -l
-lists the available translation directions and exits direction
+lists the available translation directions and exits direction
typically, LANG1-LANG2, but see modes.xml in language data
.PP
.B infile
diff -Nru apertium-3.5.1/apertium/apertium-deshtml.1 apertium-3.5.2/apertium/apertium-deshtml.1
--- apertium-3.5.1/apertium/apertium-deshtml.1 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/apertium-deshtml.1 2018-09-11 10:38:25.000000000 +0000
@@ -1,7 +1,7 @@
.TH apertium-deshtml 1 2006-03-21 "" ""
.SH NAME
apertium-deshtml \- This application is part of (
-.B apertium
+.B apertium
)
.PP
This tool is part of the apertium open-source machine translation
@@ -12,8 +12,8 @@
[ [
+
+
diff -Nru apertium-3.5.1/apertium/lextor.cc apertium-3.5.2/apertium/lextor.cc
--- apertium-3.5.1/apertium/lextor.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/lextor.cc 2018-09-11 10:38:25.000000000 +0000
@@ -38,33 +38,33 @@
lextor_data=NULL;
tlmodel=NULL;
}
-
+
LexTor::LexTor(const LexTor& lt) :
fstpbil(0)
{
lextor_data=lt.lextor_data;
tlmodel=lt.tlmodel;
}
-
+
LexTor::~LexTor() {
}
-void
+void
LexTor::set_lextor_data(LexTorData* ltd) {
lextor_data=ltd;
}
-void
+void
LexTor::set_tlmodel(LexTorData* tlm) {
tlmodel=tlm;
}
-void
+void
LexTor::set_bildic(FSTProcessor *fstp) {
fstpbil=fstp;
}
-void
+void
LexTor::trainwrd(wistream& is, int left, int right, double weigth_exponent) {
if (lextor_data==NULL) {
wcerr<get_word_string()
<reduce(ltword->get_word_string())<reduce(ltword->get_word_string());
- if (!lextor_data->is_stopword(reduced_word)) {
+ if (!lextor_data->is_stopword(reduced_word)) {
if (buffer.size()>=buffer_max_size) {
buffer.pop_front();
}
@@ -278,12 +278,12 @@
//We translate each word in the context
//Note: Words in the context can also be ambiguous (with more than one lexical choice)
//In that case the count will come from all the possible
- //translations
+ //translations
vector > translation_buffer(buffer_max_size);
vector reduced_buffer(buffer_max_size);
for (int i=0; i<(int)buffer_max_size; i++) {
- reduced_buffer[i]=lextor_data->reduce(buffer[i].get_word_string());
+ reduced_buffer[i]=lextor_data->reduce(buffer[i].get_word_string());
}
if(debug) {
@@ -372,7 +372,7 @@
}
double aux_vote=0;
- //aux_vote=tlwordmodel.vote_from_word(lexchoice_translation[*itlch],
+ //aux_vote=tlwordmodel.vote_from_word(lexchoice_translation[*itlch],
// translation_buffer[i][j])*translation_weighs[j];
if (tlwordmodel.get_wordcount(lexchoice_translation[*itlch])>0) {
aux_vote=(tlwordmodel.vote_from_word(lexchoice_translation[*itlch],translation_buffer[i][j])/
@@ -415,7 +415,7 @@
double local_lexsumsum=0.0;
for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) {
int distance=(-1)*left;
- for (int i=0; i<(int)buffer_max_size; i++) {
+ for (int i=0; i<(int)buffer_max_size; i++) {
if ((i!=word_index)&&(reduced_buffer[i]!=(*itword))) {
if (local_context[*itlch][reduced_buffer[i]]>0) {
double cc=local_context[*itlch][reduced_buffer[i]]/sumvotes_context[reduced_buffer[i]];
@@ -451,7 +451,7 @@
wcerr<set_cooccurrence_context(lch, context_v);
@@ -504,7 +504,7 @@
//wcerr<::iterator itlcs;
@@ -516,10 +516,10 @@
}
- wcerr<reduce(ltword->get_word_string())<n_lexical_choices()<is_stopword(lextor_data->reduce(ltword->get_word_string()))) {
- if (window.size()>=(unsigned)(left+1+right))
+ if (!lextor_data->is_stopword(lextor_data->reduce(ltword->get_word_string()))) {
+ if (window.size()>=(unsigned)(left+1+right))
window.pop_front();
-
+
window.push_back(*ltword);
if (ltword->n_lexical_choices()>1) {
@@ -555,11 +555,11 @@
if (retain>1)
buffer.push_back(*ltword);
} else {
- if (retain>0)
+ if (retain>0)
buffer.push_back(*ltword);
else {
wcout<get_lexical_choice(-1,true);
- if (lteval)
+ if (lteval)
lteval->evalword(*ltword, -1, lextor_data);
}
}
@@ -579,15 +579,15 @@
for(int i=0; i<(int)buffer.size(); i++)
wcerr<evalword(window[left], winner, lextor_data);
-
+
//For debug
/*
cout<0) {
while ((buffer.size()>0)&&(buffer[0].n_lexical_choices()==1)) {
wcout<evalword(buffer[0], -1, lextor_data);
buffer.pop_front();
}
if ((buffer.size()>0)&&(buffer[0].n_lexical_choices()>1))
- buffer.pop_front();
+ buffer.pop_front();
retain--;
}
- }
+ }
} else { //It's a stopword
- if (retain>0)
+ if (retain>0)
buffer.push_back(*ltword);
else {
wcout<get_lexical_choice(-1,true);
- if (lteval)
+ if (lteval)
lteval->evalword(*ltword, -1, lextor_data);
}
}
@@ -651,12 +651,12 @@
if(retain>0) {
while ((buffer.size()>0)&&(buffer[0].n_lexical_choices()==1)) {
wcout<evalword(buffer[0], -1, lextor_data);
buffer.pop_front();
}
if ((buffer.size()>0)&&(buffer[0].n_lexical_choices()>1))
- buffer.pop_front();
+ buffer.pop_front();
retain--;
}
@@ -668,7 +668,7 @@
//wcerr<& window, int word_index, double weigth_exponent) {
//return estimate_winner_lch_cosine(window, word_index, weigth_exponent);
return estimate_winner_lch_voting(window, word_index, weigth_exponent);
@@ -677,7 +677,7 @@
//return -1;
}
-int
+int
LexTor::estimate_winner_lch_voting(deque& window, int word_index, double weigth_exponent) {
vector lexchoices_count(window[word_index].n_lexical_choices());
@@ -715,7 +715,7 @@
}
int distance=(-1)*(word_index);
- for(int j=0; j<(int)window.size(); j++) {
+ for(int j=0; j<(int)window.size(); j++) {
//For all words in the context window
if(j!=word_index) {
COUNT_DATA_TYPE vote=0;
@@ -748,7 +748,7 @@
if ((lexchoices_count[i]>0) && (lexchoices_count[i]>winner_vote)) {
winner_vote=lexchoices_count[i];
winner=i;
- }
+ }
/*
else if ((lexchoices_count[i]>0) && (lexchoices_count[i]==winner_vote)) {
//Take the most probable one, the one with the highest sum
@@ -759,14 +759,14 @@
}
*/
}
-
+
if (debug) {
wcerr<& window, int word_index, double weigth_exponent) {
int winner=-1;
double greatest_sum=-1;
@@ -788,13 +788,13 @@
if (greatest_sum==0)
winner=-1;
- if (debug)
+ if (debug)
wcerr<& window, int word_index, double weigth_exponent) {
map vcontext;
@@ -855,26 +855,26 @@
if (diff_angle<=angleth)
winner=-1;
- if (debug)
+ if (debug)
wcerr<& window, int word_index, double weigth_exponent) {
if (tlmodel==NULL) {
wcerr< lexchoices_count(window[word_index].n_lexical_choices());
vector > translation_window (window.size());
vector reduced_window(window.size());
- for (unsigned i=0; ireduce(window[i].get_word_string());
-
+ for (unsigned i=0; ireduce(window[i].get_word_string());
+
if(debug) {
wcerr<vote_from_word(translation_window[word_index][i],translation_window[k][j])<get_wordcount(translation_window[k][j])<get_wordcount(translation_window[k][j])>0) {
aux_vote=(tlmodel->vote_from_word(translation_window[word_index][i],translation_window[k][j])/
tlmodel->get_wordcount(translation_window[k][j]))*translation_weighs[j];
- }
+ }
target_vote+=aux_vote;
if(debug) {
@@ -977,7 +977,7 @@
if(debug) {
- for(int i=0; i0) && (lexchoices_count[i]>winner_vote)) {
winner_vote=lexchoices_count[i];
winner=i;
- }
+ }
}
- if (debug)
+ if (debug)
wcerr<& vcontext, const wstring& reduced_lexchoice) {
map::iterator itc;
@@ -1018,7 +1018,7 @@
//We get the module of the lexchoice vector, ||lexchoice vector||
double module_lexchoice_vector=lextor_data->get_module_lexchoice_vector(reduced_lexchoice);
- if (module_vcontext==0) {
+ if (module_vcontext==0) {
wcerr< (&prob), sizeof(double));
EndianDoubleUtil::write(os, sum);
int nwritten_words=0;
- for(it_w_lch_set=it_lch_set->second.begin();
- it_w_lch_set!=it_lch_set->second.end();
+ for(it_w_lch_set=it_lch_set->second.begin();
+ it_w_lch_set!=it_lch_set->second.end();
it_w_lch_set++) {
WORD_DATA_TYPE word=it_w_lch_set->first;
COUNT_DATA_TYPE count=it_w_lch_set->second;
@@ -257,7 +257,7 @@
}
}
-void
+void
LexTorData::read_stopwords(wistream& is) {
while (!is.eof()) {
wstring w;
@@ -270,9 +270,9 @@
}
n_stopwords=stopwords.size();
wcerr<::iterator it;
int nlexchoices=0;
@@ -312,19 +312,19 @@
return words;
}
-set
+set
LexTorData::get_lexical_choices(const wstring& word) {
return lexical_choices[StringUtils::tolower(word)];
}
-void
+void
LexTorData::set_nwords_per_set(int i){
n_words_per_set=i;
wcerr< >& context) {
wcerr<=p) {
wcerr< >
+vector >
LexTorData::get_cooccurrence_vector(const string& lexical_choice) {
vector > v;
WORD_DATA_TYPE ind_lexchoice=word2index[StringUtils::tolower(lexical_choice)];
map::iterator it;
- for(it=lexchoice_set[ind_lexchoice].begin(); it!= lexchoice_set[ind_lexchoice].end(); it++)
+ for(it=lexchoice_set[ind_lexchoice].begin(); it!= lexchoice_set[ind_lexchoice].end(); it++)
v.push_back(*it);
-
+
return v;
}
*/
-double
+double
LexTorData::get_module_lexchoice_vector(const wstring& lexical_choice) {
WORD_DATA_TYPE ind_lexchoice=word2index[StringUtils::tolower(lexical_choice)];
map::iterator it;
double module=0;
- for(it=lexchoice_set[ind_lexchoice].begin(); it!= lexchoice_set[ind_lexchoice].end(); it++)
+ for(it=lexchoice_set[ind_lexchoice].begin(); it!= lexchoice_set[ind_lexchoice].end(); it++)
module+=(it->second)*(it->second);
module=sqrt(module);
@@ -485,7 +485,7 @@
return module;
}
-double
+double
LexTorData::cosine(const wstring& reduced_lexch1, const wstring& reduced_lexch2) {
WORD_DATA_TYPE ind_lexchoice1=word2index[StringUtils::tolower(reduced_lexch1)];
WORD_DATA_TYPE ind_lexchoice2=word2index[StringUtils::tolower(reduced_lexch2)];
diff -Nru apertium-3.5.1/apertium/lextor_data.h apertium-3.5.2/apertium/lextor_data.h
--- apertium-3.5.1/apertium/lextor_data.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/lextor_data.h 2018-09-11 10:38:25.000000000 +0000
@@ -38,7 +38,7 @@
using namespace std;
/** Class LexTorData. (Lexical Selector Data class)
- */
+ */
class LexTorData{
private:
@@ -62,7 +62,7 @@
//For a given lexical choice it contains the sum of all co-appearing words
map lexchoice_sum;
- //For a given lexical choice it contains its probability
+ //For a given lexical choice it contains its probability
//map lexchoice_prob;
//Set of stopwords
@@ -73,16 +73,16 @@
//For a given word it contains its set of lexical-choices (when available)
map > lexical_choices;
-
+
set reduced_lexical_choices;
void new_word_register(const wstring& w);
public:
LexTorData();
-
+
LexTorData(const LexTorData& ltd);
-
+
~LexTorData();
COUNT_DATA_TYPE vote_from_word(const wstring& lexical_choice, const wstring& word);
@@ -110,7 +110,7 @@
void set_nwords_per_set(int i);
- void set_cooccurrence_context(const wstring& lexical_choice,
+ void set_cooccurrence_context(const wstring& lexical_choice,
const vector >& context);
//vector >
@@ -123,7 +123,7 @@
set get_lexical_choices(const wstring& word);
- //Used to ensure that none of the stopwords are in the set
+ //Used to ensure that none of the stopwords are in the set
//of words from which co-occurrence models are being estimated
void ensure_stopwords_ok();
diff -Nru apertium-3.5.1/apertium/lextor_eval.cc apertium-3.5.2/apertium/lextor_eval.cc
--- apertium-3.5.1/apertium/lextor_eval.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/lextor_eval.cc 2018-09-11 10:38:25.000000000 +0000
@@ -57,7 +57,7 @@
LexTorEval::~LexTorEval() {
}
-void
+void
LexTorEval::print_evaluation() {
wcerr<reduce(ltword.get_lexical_choice(winner,false));
wstring word=lextor_data->reduce(ltword.get_word_string());
@@ -98,9 +98,9 @@
// return;
//}
- if (wref.find(L">__IGNORE") != wstring::npos)
+ if (wref.find(L">__IGNORE") != wstring::npos)
ignore=true;
-
+
if (!ignore) {
nwords+=1.0;
reduced_wref=lextor_data->reduce(wref);
diff -Nru apertium-3.5.1/apertium/lextor_eval.h apertium-3.5.2/apertium/lextor_eval.h
--- apertium-3.5.1/apertium/lextor_eval.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/lextor_eval.h 2018-09-11 10:38:25.000000000 +0000
@@ -49,8 +49,8 @@
wistream* refer;
set words2ignore;
-public:
-
+public:
+
LexTorEval(wistream *iref);
~LexTorEval();
diff -Nru apertium-3.5.1/apertium/lextor.h apertium-3.5.2/apertium/lextor.h
--- apertium-3.5.1/apertium/lextor.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/lextor.h 2018-09-11 10:38:25.000000000 +0000
@@ -30,7 +30,7 @@
using namespace std;
/** Class LexTor (Lexical Selector class)
- */
+ */
class LexTor {
private:
@@ -53,9 +53,9 @@
static double angleth;
LexTor();
-
+
LexTor(const LexTor& lt);
-
+
~LexTor();
void set_lextor_data(LexTorData* ltd);
@@ -67,20 +67,20 @@
void trainwrd(wistream& wis, int left, int right, double weigth_exponent=0);
- void trainlch(wistream& wis, int left, int right, LexTorData& wordmodel,
+ void trainlch(wistream& wis, int left, int right, LexTorData& wordmodel,
FSTProcessor& dic, FSTProcessor& bildic, double weigth_exponent=0);
- void lexical_selector(wistream& wis, FSTProcessor &fstp, int left, int right,
+ void lexical_selector(wistream& wis, FSTProcessor &fstp, int left, int right,
double weigth_exponent=0, LexTorEval* lteval=NULL);
/** NOTE on the weigth_exponent parameter: This parameter is used to
change the influence of surrounding words on the decision to
take on an ambiguous word (word with more than one lexical
choice). For example, if a decision is being take on word w_i,
- the the weigth of the surrounding words is:
- Score(w_i-2) = count(w_i-2)/pow(2,weigth_exponent),
- Score(w_i-1) = count(w_i-1)/pow(1,weigth_exponent),
- Score(w_i+1) = count(w_i+1)/pow(1,weigth_exponent),
+ the weight of the surrounding words is:
+ Score(w_i-2) = count(w_i-2)/pow(2,weigth_exponent),
+ Score(w_i-1) = count(w_i-1)/pow(1,weigth_exponent),
+ Score(w_i+1) = count(w_i+1)/pow(1,weigth_exponent),
Score(w_i+2) = count(w_i+2)/pow(2,weigth_exponent).
*/
};
diff -Nru apertium-3.5.1/apertium/lextor_word.cc apertium-3.5.2/apertium/lextor_word.cc
--- apertium-3.5.1/apertium/lextor_word.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/lextor_word.cc 2018-09-11 10:38:25.000000000 +0000
@@ -25,7 +25,7 @@
word = L"";
default_choice = 0;
}
-
+
LexTorWord::LexTorWord(const LexTorWord& ltw) {
word=ltw.word;
ignored_string=ltw.ignored_string;
@@ -38,26 +38,26 @@
ignored_string=L"";
extract_lexical_choices(fstp);
}
-
+
LexTorWord::~LexTorWord() {
}
-wstring
+wstring
LexTorWord::get_word_string() {
return word;
}
-int
+int
LexTorWord::n_lexical_choices() {
return lexical_choices.size();
}
-wstring
+wstring
LexTorWord::get_lexical_choice(int choice, bool include_ignored) {
if (word == L"") {
if (include_ignored)
return ignored_string;
- else
+ else
return L"";
}
@@ -79,7 +79,7 @@
return lexical_choices[choice];
}
-wstring
+wstring
LexTorWord::translate(FSTProcessor& bildic, int lexchoice) {
wstring t;
@@ -103,7 +103,7 @@
return t;
}
-void
+void
LexTorWord::extract_lexical_choices(FSTProcessor *fstp) {
lexical_choices=StringUtils::split_wstring(fstp->biltrans(word,false), L"/");
@@ -125,7 +125,7 @@
}
}
-LexTorWord*
+LexTorWord*
LexTorWord::next_word(wistream& is, FSTProcessor *fstp) {
LexTorWord w;
wchar_t c, prev_c=L' ';
@@ -145,7 +145,7 @@
if(fstp!=NULL)
w.extract_lexical_choices(fstp);
return new LexTorWord(w);
- } else
+ } else
return NULL;
}
}
diff -Nru apertium-3.5.1/apertium/lextor_word.h apertium-3.5.2/apertium/lextor_word.h
--- apertium-3.5.1/apertium/lextor_word.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/lextor_word.h 2018-09-11 10:38:25.000000000 +0000
@@ -30,7 +30,7 @@
using namespace std;
/** Class LexTorWord. (Lexical Selector Word)
- */
+ */
class LexTorWord{
private:
@@ -38,16 +38,16 @@
wstring ignored_string;
vector lexical_choices;
int default_choice;
-
+
void extract_lexical_choices(FSTProcessor *fstp);
public:
LexTorWord();
-
+
LexTorWord(const LexTorWord& ltw);
LexTorWord(const wstring& str, FSTProcessor *fstp);
-
+
~LexTorWord();
/** Return the lexical choice at position 'choice', if 'choice' is not
@@ -55,7 +55,7 @@
*/
wstring get_lexical_choice(int choice=-1, bool include_ignored=true);
- /** Returns the number of lexical choices for this word
+ /** Returns the number of lexical choices for this word
*/
int n_lexical_choices();
diff -Nru apertium-3.5.1/apertium/lswpost.cc apertium-3.5.2/apertium/lswpost.cc
--- apertium-3.5.1/apertium/lswpost.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/lswpost.cc 2018-09-11 10:38:25.000000000 +0000
@@ -86,7 +86,7 @@
LSWPoST::LSWPoST(TaggerDataLSW t) {
tdlsw = t;
- eos = (tdlsw.getTagIndex())[L"TAG_SENT"];
+ eos = (tdlsw.getTagIndex())[L"TAG_SENT"];
}
LSWPoST::~LSWPoST() {}
@@ -94,9 +94,9 @@
LSWPoST::LSWPoST(TaggerDataLSW *tdlsw) : tdlsw(*tdlsw) {}
void
-LSWPoST::set_eos(TTag t) {
- eos = t;
-}
+LSWPoST::set_eos(TTag t) {
+ eos = t;
+}
void
LSWPoST::init_probabilities(MorphoStream &morpho_stream) {
@@ -108,7 +108,7 @@
set::iterator iter_left, iter_mid, iter_right;
vector > > para_matrix(N, vector >(N, vector(N, 0)));
int num_valid_seq = 0;
-
+
word = new TaggerWord(); // word for tags left
word->add_tag(eos, L"sent", tdlsw.getPreferRules());
tags_left = word->get_tags(); // tags left
@@ -329,13 +329,13 @@
}
}
-void
+void
LSWPoST::tagger(MorphoStream &morpho_stream, FILE *Output, const bool &First) {
TaggerWord *word_left = NULL, *word_mid = NULL, *word_right = NULL;
set tags_left, tags_mid, tags_right;
set::iterator iter_left, iter_mid, iter_right;
- morpho_stream.setNullFlush(null_flush);
-
+ morpho_stream.setNullFlush(null_flush);
+
word_left = new TaggerWord(); // word left
word_left->add_tag(eos, L"sent", tdlsw.getPreferRules());
word_left->set_show_sf(show_sf);
@@ -387,7 +387,7 @@
fflush(Output);
morpho_stream.setEndOfFile(false);
}
-
+
delete word_left;
word_left = word_mid;
tags_left = tags_mid;
diff -Nru apertium-3.5.1/apertium/Makefile.am apertium-3.5.2/apertium/Makefile.am
--- apertium-3.5.1/apertium/Makefile.am 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/Makefile.am 2018-09-11 10:38:25.000000000 +0000
@@ -527,13 +527,13 @@
@echo "Creating apertium-gen-deformat script"
@echo "#!$(BASH)" > $@
@cat deformat-header.sh >> $@
- @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/format.dtd --noout \$$FILE1 && \\" >> $@
+ @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/format.dtd --noout \"\$$FILE1\" && \\" >> $@
@if [ `basename $(XSLTPROC)` == xsltproc ]; \
- then echo "$(XSLTPROC) --stringparam mode \$$MODE \"$(apertiumdir)\"/deformat.xsl \$$FILE1 >/tmp/\$$\$$.deformat.l && \\"; \
- else echo "$(XSLTPROC) \"$(apertiumdir)\"/deformat.xsl \$$FILE1 \"\\\$$mode=\$$MODE\" >/tmp/\$$\$$.deformat.l && \\"; \
+ then echo "$(XSLTPROC) --stringparam mode \$$MODE \"$(apertiumdir)\"/deformat.xsl \"\$$FILE1\" >/tmp/\$$\$$.deformat.l && \\"; \
+ else echo "$(XSLTPROC) \"$(apertiumdir)\"/deformat.xsl \"\$$FILE1\" \"\\\$$mode=\$$MODE\" >/tmp/\$$\$$.deformat.l && \\"; \
fi >> $@
@echo "$(FLEX) \$$FLEXOPTS -o/tmp/\$$\$$.lex.cc /tmp/\$$\$$.deformat.l && \\" >> $@
- @echo "$(CXX) -DGENFORMAT $(CXXFLAGS) -w $(APERTIUM_CFLAGS) -I $(apertiuminclude) -o \$$FILE2 /tmp/\$$\$$.lex.cc $(APERTIUM_LIBS) 2>/dev/null && \\" >> $@
+ @echo "$(CXX) -DGENFORMAT $(CXXFLAGS) -w $(APERTIUM_CFLAGS) -I $(apertiuminclude) -o \"\$$FILE2\" /tmp/\$$\$$.lex.cc $(APERTIUM_LIBS) 2>/dev/null && \\" >> $@
@echo "rm /tmp/\$$\$$.deformat.l /tmp/\$$\$$.lex.cc" >> $@
@chmod a+x $@
@@ -541,10 +541,10 @@
@echo "Creating apertium-gen-reformat script"
@echo "#!$(BASH)" > $@
@cat gen-header.sh >> $@
- @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/format.dtd --noout \$$FILE1 && \\" >> $@
- @echo "$(XSLTPROC) \"$(apertiumdir)\"/reformat.xsl \$$FILE1 >/tmp/\$$\$$.reformat.l && \\" >> $@
+ @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/format.dtd --noout \"\$$FILE1\" && \\" >> $@
+ @echo "$(XSLTPROC) \"$(apertiumdir)\"/reformat.xsl \"\$$FILE1\" >/tmp/\$$\$$.reformat.l && \\" >> $@
@echo "$(FLEX) \$$FLEXOPTS -o/tmp/\$$\$$.lex.cc /tmp/\$$\$$.reformat.l && \\" >> $@
- @echo "$(CXX) -DGENFORMAT $(CXXFLAGS) -w $(APERTIUM_CFLAGS) -I $(apertiuminclude) -o \$$FILE2 /tmp/\$$\$$.lex.cc $(APERTIUM_LIBS) 2>/dev/null &&\\" >> $@
+ @echo "$(CXX) -DGENFORMAT $(CXXFLAGS) -w $(APERTIUM_CFLAGS) -I $(apertiuminclude) -o \"\$$FILE2\" /tmp/\$$\$$.lex.cc $(APERTIUM_LIBS) 2>/dev/null &&\\" >> $@
@echo "rm /tmp/\$$\$$.reformat.l /tmp/\$$\$$.lex.cc" >> $@
@chmod a+x $@
@@ -566,7 +566,6 @@
@echo "Creating apertium script"
@echo "#!$(BASH)" > $@
@echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@
- @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@
@echo "DEFAULT_DIRECTORY=\"$(prefix)/share/apertium\"" >>$@
@cat apertium-header.sh >>$@
@chmod a+x $@
@@ -575,8 +574,6 @@
@echo "Creating apertium-unformat script"
@echo "#!$(BASH)" > $@
@echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@
- @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@
- @echo "DEFAULT_DIRECTORY=\"$(prefix)/share/apertium\"" >>$@
@cat apertium-unformat-header.sh >>$@
@chmod a+x $@
diff -Nru apertium-3.5.1/apertium/mediawiki-format.xml apertium-3.5.2/apertium/mediawiki-format.xml
--- apertium-3.5.1/apertium/mediawiki-format.xml 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/mediawiki-format.xml 2018-09-11 10:38:25.000000000 +0000
@@ -74,11 +74,11 @@
-
+
-
+
@@ -86,11 +86,11 @@
-
+
-
+
@@ -121,11 +121,11 @@
-
+
-
+
@@ -144,15 +144,15 @@
-
+
-
+
-
+
diff -Nru apertium-3.5.1/apertium/modes.rnc apertium-3.5.2/apertium/modes.rnc
--- apertium-3.5.1/apertium/modes.rnc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/modes.rnc 2018-09-11 10:38:25.000000000 +0000
@@ -1,18 +1,18 @@
# Copyright (C) 2005-2016 Universitat d'Alacant / Universidad de Alicante
-#
+#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
-#
+#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
-#
+#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see .
-#
+#
# DTD for the modes.xml file
modes = element modes { attlist.modes, mode+ }
diff -Nru apertium-3.5.1/apertium/modes.rng apertium-3.5.2/apertium/modes.rng
--- apertium-3.5.1/apertium/modes.rng 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/modes.rng 2018-09-11 10:38:25.000000000 +0000
@@ -1,20 +1,20 @@
diff -Nru apertium-3.5.1/apertium/morpho_stream.h apertium-3.5.2/apertium/morpho_stream.h
--- apertium-3.5.1/apertium/morpho_stream.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/morpho_stream.h 2018-09-11 10:38:25.000000000 +0000
@@ -14,10 +14,10 @@
* You should have received a copy of the GNU General Public License
* along with this program; if not, see .
*/
-/**
+/**
* Word class and MorphoStream class definitions
*
- * @author Felipe Sánchez-Martínez
+ * @author Felipe Sánchez-Martínez
*/
#ifndef __MORPHOSTREAM_H
@@ -39,31 +39,31 @@
using namespace std;
-/** Class MorphoStream.
- * This class processes the output of class yyFlexLexer (lex.yy.cc), and
- * builds the TaggerWord objects managed by the tagger
+/** Class MorphoStream.
+ * This class processes the output of class yyFlexLexer (lex.yy.cc), and
+ * builds the TaggerWord objects managed by the tagger
*/
class MorphoStream {
public:
virtual ~MorphoStream();
/** Get next word in the input stream
- * @return A pointer to the next word in the input stream
+ * @return A pointer to the next word in the input stream
*/
virtual TaggerWord* get_next_word() = 0;
-
- /**
+
+ /**
* Set up the flag to detect '\0' characters
* @param nf the null_flush value
*/
virtual void setNullFlush(bool nf) = 0;
-
+
/**
- * Return true if the last reading is end of file of '\0' when null_flush
+ * Return true if the last reading is end of file or '\0' when null_flush
* is true
* @returns the value of end_of_file
*/
virtual bool getEndOfFile(void) = 0;
-
+
/**
* Sets a new value for the end_of_file_flag
* @param eof the new value for end_of_file
diff -Nru apertium-3.5.1/apertium/new2old.xsl apertium-3.5.2/apertium/new2old.xsl
--- apertium-3.5.1/apertium/new2old.xsl 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/new2old.xsl 2018-09-11 10:38:25.000000000 +0000
@@ -140,7 +140,7 @@
-
+
diff -Nru apertium-3.5.1/apertium/odt-format.xml apertium-3.5.2/apertium/odt-format.xml
--- apertium-3.5.1/apertium/odt-format.xml 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/odt-format.xml 2018-09-11 10:38:25.000000000 +0000
@@ -19,13 +19,13 @@
-
-
+
+
-
-
+
+
@@ -48,7 +48,7 @@
-
+
diff -Nru apertium-3.5.1/apertium/perceptron_spec.cc apertium-3.5.2/apertium/perceptron_spec.cc
--- apertium-3.5.1/apertium/perceptron_spec.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/perceptron_spec.cc 2018-09-11 10:38:25.000000000 +0000
@@ -108,22 +108,22 @@
LexicalUnit PerceptronSpec::token_wordoids_underflow;
LexicalUnit PerceptronSpec::token_wordoids_overflow;
-void
+void
PerceptronSpec::get_features(
const TaggedSentence &tagged, const Sentence &untagged,
int token_idx, int wordoid_idx,
- UnaryFeatureVec &feat_vec_out) const
+ UnaryFeatureVec &feat_vec_out) const
{
size_t i;
global_results.clear();
- if (global_pred.size() > 0)
+ if (global_pred.size() > 0)
{
Machine machine(
*this, global_pred, 0, false,
tagged, untagged, token_idx, wordoid_idx);
StackValue result = machine.getValue();
assert(result.type == BVAL);
- if (!result.boolVal())
+ if (!result.boolVal())
{
return;
}
@@ -660,7 +660,7 @@
void
PerceptronSpec::Machine::getFeature(
- UnaryFeatureVec &feat_vec_out)
+ UnaryFeatureVec &feat_vec_out)
{
for (; bytecode_iter != feat.end(); bytecode_iter++) {
Opcode op = (Bytecode){.intbyte=static_cast(*bytecode_iter)}.op;
@@ -722,10 +722,10 @@
PerceptronSpec::StackValue
PerceptronSpec::Machine::getValue()
{
- for (; bytecode_iter != feat.end(); bytecode_iter++)
+ for (; bytecode_iter != feat.end(); bytecode_iter++)
{
Opcode op = (Bytecode){.intbyte=static_cast(*bytecode_iter)}.op;
- if (execCommonOp(op))
+ if (execCommonOp(op))
{
continue;
}
diff -Nru apertium-3.5.1/apertium/postchunk.cc apertium-3.5.2/apertium/postchunk.cc
--- apertium-3.5.1/apertium/postchunk.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/postchunk.cc 2018-09-11 10:38:25.000000000 +0000
@@ -43,7 +43,7 @@
{
xmlFreeDoc(doc);
doc = NULL;
- }
+ }
}
Postchunk::Postchunk() :
@@ -71,7 +71,7 @@
destroy();
}
-void
+void
Postchunk::readData(FILE *in)
{
alphabet.read(in);
@@ -80,18 +80,18 @@
Transducer t;
t.read(in, alphabet.size());
-
- map finals;
-
+
+ map finals;
+
// finals
for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++)
{
int key = Compression::multibyte_read(in);
finals[key] = Compression::multibyte_read(in);
- }
-
+ }
+
me = new MatchExe(t, finals);
-
+
// attr_items
bool recompile_attrs = Compression::string_read(in) != string(pcre_version());
for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++)
@@ -128,15 +128,15 @@
wstring const cad_v = Compression::wstring_read(in);
lists[cad_k].insert(UtfConverter::toUtf8(cad_v));
listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v)));
- }
- }
+ }
+ }
}
void
Postchunk::read(string const &transferfile, string const &datafile)
{
readPostchunk(transferfile);
-
+
// datafile
FILE *in = fopen(datafile.c_str(), "rb");
if(!in)
@@ -153,15 +153,15 @@
Postchunk::readPostchunk(string const &in)
{
doc = xmlReadFile(in.c_str(), NULL, 0);
-
+
if(doc == NULL)
{
wcerr << "Error: Could not parse file '" << in << "'." << endl;
exit(EXIT_FAILURE);
}
-
+
root_element = xmlDocGetRootElement(doc);
-
+
// search for macros & rules
for(xmlNode *i = root_element->children; i != NULL; i = i->next)
{
@@ -175,7 +175,7 @@
{
collectRules(i);
}
- }
+ }
}
}
@@ -231,7 +231,7 @@
}
-string
+string
Postchunk::evalString(xmlNode *element)
{
map::iterator it;
@@ -247,17 +247,17 @@
return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]);
}
break;
-
+
case ti_lu_count:
return StringUtils::itoa_string(tmpword.size());
case ti_var:
return variables[ti.getContent()];
-
+
case ti_lit_tag:
case ti_lit:
return ti.getContent();
-
+
case ti_b:
if(ti.getPos() >= 0 && checkIndex(element, ti.getPos(), lblank))
{
@@ -267,7 +267,7 @@
return " ";
}
break;
-
+
case ti_get_case_from:
if(checkIndex(element, ti.getPos(), lword))
{
@@ -275,14 +275,14 @@
evalString((xmlNode *) ti.getPointer()));
}
break;
-
+
case ti_case_of_tl:
if(checkIndex(element, ti.getPos(), lword))
{
return caseOf(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]));
}
break;
-
+
default:
return "";
}
@@ -310,8 +310,8 @@
}
else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag"))
{
- evalStringCache[element] = TransferInstr(ti_lit_tag,
- tags((const char *) element->properties->children->content), 0);
+ evalStringCache[element] = TransferInstr(ti_lit_tag,
+ tags((const char *) element->properties->children->content), 0);
}
else if(!xmlStrcmp(element->name, (const xmlChar *) "lit"))
{
@@ -368,11 +368,11 @@
pos = atoi((const char *) i->children->content);
}
}
-
+
evalStringCache[element] = TransferInstr(ti_case_of_tl, (const char *) part, pos);
}
else if(!xmlStrcmp(element->name, (const xmlChar *) "concat"))
- {
+ {
string value;
for(xmlNode *i = element->children; i != NULL; i = i->next)
{
@@ -393,7 +393,7 @@
myword.append(evalString(i));
}
}
-
+
if(myword != "")
{
return "^"+myword+"$";
@@ -406,15 +406,15 @@
else if(!xmlStrcmp(element->name, (const xmlChar *) "mlu"))
{
string value;
-
+
bool first_time = true;
-
+
for(xmlNode *i = element->children; i != NULL; i = i->next)
{
if(i->type == XML_ELEMENT_NODE)
{
string myword;
-
+
for(xmlNode *j = i->children; j != NULL; j = j->next)
{
if(j->type == XML_ELEMENT_NODE)
@@ -422,7 +422,7 @@
myword.append(evalString(j));
}
}
-
+
if(!first_time)
{
if(myword != "" && myword[0] != '#') //'+#' problem
@@ -437,7 +437,7 @@
first_time = false;
}
}
-
+
value.append(myword);
}
}
@@ -451,7 +451,7 @@
return "";
}
}
-
+
else
{
wcerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl;
@@ -501,7 +501,7 @@
myword.append(evalString(k));
}
}
-
+
if(!first_time)
{
if(myword != "")
@@ -516,7 +516,7 @@
first_time = false;
}
}
- fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output);
+ fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output);
}
}
fputwc_unlocked(L'$', output);
@@ -609,11 +609,11 @@
case ti_var:
variables[ti.getContent()] = evalString(rightSide);
return;
-
+
case ti_clip_tl:
word[ti.getPos()]->setChunkPart(attr_items[ti.getContent()], evalString(rightSide));
- return;
-
+ return;
+
default:
return;
}
@@ -640,11 +640,11 @@
pos = atoi((const char *) i->children->content);
}
}
-
- word[pos]->setChunkPart(attr_items[(const char *) part],
+
+ word[pos]->setChunkPart(attr_items[(const char *) part],
evalString(rightSide));
- evalStringCache[leftSide] = TransferInstr(ti_clip_tl, (const char *) part,
+ evalStringCache[leftSide] = TransferInstr(ti_clip_tl, (const char *) part,
pos, NULL);
}
}
@@ -657,7 +657,7 @@
{
if(!xmlStrcmp(i->name, (const xmlChar *) "n"))
{
- name = (char *) i->children->content;
+ name = (char *) i->children->content;
break;
}
}
@@ -709,7 +709,7 @@
}
}
- string const result = copycase(evalString(rightSide),
+ string const result = copycase(evalString(rightSide),
word[pos]->chunkPart(attr_items[(const char *) part]));
word[pos]->setChunkPart(attr_items[(const char *) part], result);
@@ -737,7 +737,7 @@
break;
}
}
-
+
if (npar <= 0)
{
throw "Postchunk::processCallMacro() assumes npar > 0, but got npar <= 0";
@@ -746,7 +746,7 @@
InterchunkWord **myword = NULL;
if(npar > 0)
{
- myword = new InterchunkWord *[npar+1];
+ myword = new InterchunkWord *[npar+1];
}
string **myblank = NULL;
if(npar > 0)
@@ -755,7 +755,7 @@
}
myword[0] = word[0];
-
+
bool indexesOK = true;
int idx = 1;
int lastpos = 0;
@@ -773,7 +773,7 @@
{
myblank[idx-1] = blank[lastpos];
}
-
+
idx++;
lastpos = pos;
}
@@ -782,7 +782,7 @@
swap(myword, word);
swap(myblank, blank);
swap(npar, lword);
-
+
if(indexesOK) {
for(xmlNode *i = macro->children; i != NULL; i = i->next)
{
@@ -814,7 +814,7 @@
if(!xmlStrcmp(i->name, (const xmlChar *) "when"))
{
bool picked_option = false;
-
+
for(xmlNode *j = i->children; j != NULL; j = j->next)
{
if(j->type == XML_ELEMENT_NODE)
@@ -839,7 +839,7 @@
if(picked_option)
{
return;
- }
+ }
}
else if(!xmlStrcmp(i->name, (const xmlChar *) "otherwise"))
{
@@ -867,7 +867,7 @@
return processBeginsWith(localroot);
}
else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with-list"))
- {
+ {
return processBeginsWithList(localroot);
}
else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with"))
@@ -893,7 +893,7 @@
else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not"))
{
return processNot(localroot);
- }
+ }
else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in"))
{
return processIn(localroot);
@@ -911,7 +911,7 @@
for(xmlNode *i = localroot->children; i != NULL; i = i->next)
{
if(i->type == XML_ELEMENT_NODE)
- {
+ {
if(value == NULL)
{
value = i;
@@ -928,7 +928,7 @@
if(localroot->properties != NULL)
{
- if(!xmlStrcmp(localroot->properties->children->content,
+ if(!xmlStrcmp(localroot->properties->children->content,
(const xmlChar *) "yes"))
{
set &myset = listslow[(const char *) idlist];
@@ -963,7 +963,7 @@
{
return processLogical(i);
}
- }
+ }
return false;
}
@@ -1053,7 +1053,7 @@
Postchunk::beginsWith(string const &s1, string const &s2) const
{
int const limit = s2.size(), constraint = s1.size();
-
+
if(constraint < limit)
{
return false;
@@ -1073,7 +1073,7 @@
Postchunk::endsWith(string const &s1, string const &s2) const
{
int const limit = s2.size(), constraint = s1.size();
-
+
if(constraint < limit)
{
return false;
@@ -1193,7 +1193,7 @@
string needle = evalString(first);
set::iterator it, limit;
- if(localroot->properties == NULL ||
+ if(localroot->properties == NULL ||
xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes"))
{
it = lists[(const char *) idlist].begin();
@@ -1205,7 +1205,7 @@
it = listslow[(const char *) idlist].begin();
limit = listslow[(const char *) idlist].end();
}
-
+
for(; it != limit; it++)
{
if(beginsWith(needle, *it))
@@ -1241,7 +1241,7 @@
string needle = evalString(first);
set::iterator it, limit;
- if(localroot->properties == NULL ||
+ if(localroot->properties == NULL ||
xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes"))
{
it = lists[(const char *) idlist].begin();
@@ -1253,7 +1253,7 @@
it = listslow[(const char *) idlist].begin();
limit = listslow[(const char *) idlist].end();
}
-
+
for(; it != limit; it++)
{
if(endsWith(needle, *it))
@@ -1323,16 +1323,16 @@
{
result = StringUtils::toupper(t_word);
}
-
+
if(firstupper)
{
result[0] = towupper(result[0]);
}
-
+
return UtfConverter::toUtf8(result);
}
-string
+string
Postchunk::caseOf(string const &str)
{
wstring const s = UtfConverter::fromUtf8(str);
@@ -1369,7 +1369,7 @@
}
}
-wstring
+wstring
Postchunk::caseOf(wstring const &str)
{
if(str.size() > 1)
@@ -1426,7 +1426,7 @@
result += str[i];
}
}
-
+
result += '>';
return result;
@@ -1462,7 +1462,7 @@
return input_buffer.add(TransferToken(content, tt_eof));
}
if(val == L'\\')
- {
+ {
content += L'\\';
content += wchar_t(fgetwc_unlocked(in));
}
@@ -1503,11 +1503,11 @@
{
int val3 = wchar_t(fgetwc_unlocked(in));
ungetwc(val3, in);
-
+
content += L'}';
if(val3 == L'$')
{
- break;
+ break;
}
}
else
@@ -1556,7 +1556,7 @@
{
null_flush = false;
internal_null_flush = true;
-
+
while(!feof(in))
{
postchunk(in, out);
@@ -1567,10 +1567,10 @@
wcerr << L"Could not flush output " << errno << endl;
}
}
-
+
internal_null_flush = false;
null_flush = true;
-}
+}
void
Postchunk::postchunk(FILE *in, FILE *out)
@@ -1579,12 +1579,12 @@
{
postchunk_wrapper_null_flush(in, out);
}
-
+
int last = 0;
output = out;
ms.init(me->getInitial());
-
+
while(true)
{
if(ms.size() == 0)
@@ -1601,7 +1601,7 @@
unchunk(*tmpword[0], output);
tmpword.clear();
input_buffer.setPos(last);
- input_buffer.next();
+ input_buffer.next();
last = input_buffer.getPos();
ms.init(me->getInitial());
}
@@ -1617,7 +1617,7 @@
int val = ms.classifyFinals(me->getFinals());
if(val != -1)
{
- lastrule = rule_map[val-1];
+ lastrule = rule_map[val-1];
last = input_buffer.getPos();
if(trace)
@@ -1636,7 +1636,7 @@
}
TransferToken &current = readToken(in);
-
+
switch(current.getType())
{
case tt_word:
@@ -1699,7 +1699,7 @@
{
blank[i-2] = new string(UtfConverter::toUtf8(*tmpblank[i-1]));
}
-
+
word[i] = new InterchunkWord(UtfConverter::toUtf8(*tmpword[i-1]));
}
@@ -1770,11 +1770,11 @@
}
}
break;*/
-
+
case L'{': // ignore the unmodifiable part of the chunk
ms.step(L'$');
return;
-
+
default:
ms.step(towlower(word_str[i]), any_char);
break;
@@ -1878,11 +1878,11 @@
wstring case_info = caseOf(pseudolemma(chunk));
bool uppercase_all = false;
bool uppercase_first = false;
-
+
if(case_info == L"AA")
{
uppercase_all = true;
- }
+ }
else if(case_info == L"Aa")
{
uppercase_first = true;
@@ -1910,7 +1910,7 @@
if(iswdigit(chunk[i+1]))
{
// replace tag
- unsigned long value = wcstoul(chunk.c_str()+i+1,
+ unsigned long value = wcstoul(chunk.c_str()+i+1,
NULL, 0) - 1;
//atoi(chunk.c_str()+i+1)-1;
if(vectags.size() > value)
@@ -1986,16 +1986,16 @@
bool uppercase_all = false;
bool uppercase_first = false;
bool lastblank = true;
-
+
if(case_info == L"AA")
{
uppercase_all = true;
- }
+ }
else if(case_info == L"Aa")
{
uppercase_first = true;
}
-
+
for(int i = beginChunk(chunk), limit = endChunk(chunk); i < limit; i++)
{
if(chunk[i] == L'^')
@@ -2007,7 +2007,7 @@
lastblank = false;
wstring *myword = new wstring();
wstring &ref = *myword;
-
+
while(chunk[++i] != L'$')
{
if(chunk[i] == L'\\')
@@ -2020,7 +2020,7 @@
if(iswdigit(chunk[i+1]))
{
// replace tag
- unsigned long value = wcstoul(chunk.c_str()+i+1,
+ unsigned long value = wcstoul(chunk.c_str()+i+1,
NULL, 0) - 1;
if(vectags.size() > value)
{
@@ -2064,7 +2064,7 @@
}
else if(chunk[i] == L'[')
{
- if (!(lastblank && blanks.back()))
+ if (!(lastblank && blanks.back()))
{
blanks.push_back(new wstring());
}
diff -Nru apertium-3.5.1/apertium/postchunk.dtd apertium-3.5.2/apertium/postchunk.dtd
--- apertium-3.5.1/apertium/postchunk.dtd 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/postchunk.dtd 2018-09-11 10:38:25.000000000 +0000
@@ -1,25 +1,25 @@
-
+ Draft of DTD for the structural transfer rule files
+
+ Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada,
+ 2005.07.29.
+-->
@@ -28,13 +28,13 @@
-
-
@@ -42,43 +42,43 @@
-
-
-
-
+
-
-
-
-
@@ -87,7 +87,7 @@
-
@@ -111,12 +111,12 @@
-
-
@@ -151,128 +151,128 @@
-->
-
-
-
-
-
-
-
-
-
-
-
+
-
-
+
-
-
+
-
-
+
-
-
+
-
-
+
-
-
-
-
+-->
-
-
-
-
@@ -379,15 +379,15 @@
-
-
+
@@ -396,13 +396,13 @@
+-->
diff -Nru apertium-3.5.1/apertium/postchunk.h apertium-3.5.2/apertium/postchunk.h
--- apertium-3.5.1/apertium/postchunk.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/postchunk.h 2018-09-11 10:38:25.000000000 +0000
@@ -39,7 +39,7 @@
class Postchunk
{
private:
-
+
Alphabet alphabet;
MatchExe *me;
MatchState ms;
@@ -65,7 +65,7 @@
xmlNode *lastrule;
unsigned int nwords;
-
+
map evalStringCache;
bool inword;
@@ -118,18 +118,18 @@
static vector getVecTags(wstring const &chunk);
static int beginChunk(wstring const &chunk);
static int endChunk(wstring const &chunk);
- static void splitWordsAndBlanks(wstring const &chunk,
+ static void splitWordsAndBlanks(wstring const &chunk,
vector &words,
vector &blanks);
static wstring pseudolemma(wstring const &chunk);
static wstring wordzero(wstring const &chunk);
- bool checkIndex(xmlNode *element, int index, int limit);
+ bool checkIndex(xmlNode *element, int index, int limit);
void postchunk_wrapper_null_flush(FILE *in, FILE *out);
public:
Postchunk();
~Postchunk();
-
+
void read(string const &transferfile, string const &datafile);
void postchunk(FILE *in, FILE *out);
bool getNullFlush(void);
diff -Nru apertium-3.5.1/apertium/postchunk.rnc apertium-3.5.2/apertium/postchunk.rnc
--- apertium-3.5.1/apertium/postchunk.rnc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/postchunk.rnc 2018-09-11 10:38:25.000000000 +0000
@@ -1,22 +1,22 @@
# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
-#
+#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
-#
+#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
-#
+#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
-#
-# Draft of DTD for the structural transfer rule files
-#
-# Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada,
-# 2005.07.29.
+#
+# Draft of DTD for the structural transfer rule files
+#
+# Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada,
+# 2005.07.29.
condition =
and
@@ -56,7 +56,7 @@
}
attlist.postchunk &= empty
# 'postchunk' is the root element containing the whole structural
-# postchunk rule file.
+# postchunk rule file.
section-def-cats =
element section-def-cats { attlist.section-def-cats, def-cat+ }
attlist.section-def-cats &= empty
@@ -70,19 +70,19 @@
# category items and has a unique name 'n', which is mandatory
cat-item = element cat-item { attlist.cat-item, empty }
attlist.cat-item &= attribute name { text }
-# In addition, a required attribute, "name", is used to specify
+# In addition, a required attribute, "name", is used to specify
# wich chunk name is detected by this cat-item
section-def-attrs =
element section-def-attrs { attlist.section-def-attrs, def-attr+ }
attlist.section-def-attrs &= empty
# The 'def-attrs' section defines the attributes that will be
-# identified in matched lexical forms
+# identified in matched lexical forms
def-attr = element def-attr { attlist.def-attr, attr-item+ }
attlist.def-attr &=
attribute n { xsd:ID },
attribute c { text }?
# Each def-attr defines one attribute in terms of a list of
-# attribute items and has a mandatory unique name n
+# attribute items and has a mandatory unique name n
attr-item = element attr-item { attlist.attr-item, empty }
attlist.attr-item &=
attribute tags { text }?,
@@ -111,18 +111,18 @@
attlist.def-list &=
attribute n { xsd:ID },
attribute c { text }?
-# The 'def-list' element defines a named list to search with the 'in'
+# The 'def-list' element defines a named list to search with the 'in'
# element. Attribute 'n' sets the name of the list
list-item = element list-item { attlist.list-item, empty }
attlist.list-item &=
attribute v { text },
attribute c { text }?
-# Attribute 'v' of 'list-item' element contains the value to be added to
-# the list being defined
+# Attribute 'v' of 'list-item' element contains the value to be added to
+# the list being defined
section-def-macros =
element section-def-macros { attlist.section-def-macros, def-macro+ }
attlist.section-def-macros &= empty
-#
+#
# The 'def-macros' section defines macros containing portions of
# code frequently used in the action part of rules
#
@@ -132,9 +132,9 @@
attribute npar { text },
attribute c { text }?
# Macro definition:
-#
+#
# A macro has a mandatory name (the value of 'n'), a number of parameters
-# (the value of 'npar') and a body containing arguments and statements.
+# (the value of 'npar') and a body containing arguments and statements.
section-rules = element section-rules { attlist.section-rules, rule+ }
attlist.section-rules &= empty
# The rules section contains a sequence of one or more rules
@@ -148,10 +148,10 @@
pattern = element pattern { attlist.pattern, pattern-item }
attlist.pattern &= empty
# The pattern is specified in terms of pattern items, each one
-# representing a lexical form in the matched pattern
+# representing a lexical form in the matched pattern
pattern-item = element pattern-item { attlist.pattern-item, empty }
attlist.pattern-item &= attribute n { xsd:IDREF }
-# Each attribute to be activated is referred to by its name in the def-cats section
+# Each attribute to be activated is referred to by its name in the def-cats section
action = element action { attlist.action, sentence* }
attlist.action &= attribute c { text }?
# Encloses the procedural part of a rule
@@ -159,31 +159,31 @@
attlist.choose &= attribute c { text }?
# The choose statement is a selection statement (similar to a case
# statement) composed of one or more tested cases and an optional
-# otherwise
+# otherwise
when = element when { attlist.when, test, sentence* }
attlist.when &= attribute c { text }?
-# Each tested case is a block of zero or more statements
+# Each tested case is a block of zero or more statements
otherwise = element otherwise { attlist.otherwise, sentence+ }
attlist.otherwise &= attribute c { text }?
-# The otherwise case is also a block of one or more statements
+# The otherwise case is also a block of one or more statements
test = element test { attlist.test, condition }
attlist.test &= attribute c { text }?
# The test in a tested case may be a conjunction, a disjunction, or
# a negation of simpler tests, as well as a simple equality test
and = element and { attlist.and, condition, condition+ }
attlist.and &= empty
-# Each conjuntion test contains two or more simpler tests
+# Each conjuntion test contains two or more simpler tests
or = element or { attlist.or, condition, condition+ }
attlist.or &= empty
-# Each disjunction test contains two or more simpler tests
+# Each disjunction test contains two or more simpler tests
not = element not { attlist.not, condition }
attlist.not &= empty
-# The negation of a simpler test is a test itself
+# The negation of a simpler test is a test itself
equal = element equal { attlist.equal, value, value }
attlist.equal &= attribute caseless { "no" | "yes" }?
# The simplest test is an equality test. The right part and the
# left part of the equality may both be a clip (see below), a
-# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
# a variable ('var') defined in the def-vars section. When the attribute
# 'caseless' is set to 'yes', the comparison is made without attending
# to the case.
@@ -191,7 +191,7 @@
attlist.begins-with &= attribute caseless { "no" | "yes" }?
# Tests if the left part contains the right part at the beginning.
# Both parts of the test may both be a clip (see below), a
-# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
# a variable ('var') defined in the def-vars section. When the attribute
# 'caseless' is set to 'yes', the comparison is made without attending
# to the case.
@@ -199,7 +199,7 @@
attlist.ends-with &= attribute caseless { "no" | "yes" }?
# Tests if the left part contains the right part at the end.
# Both parts of the test may both be a clip (see below), a
-# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
# a variable ('var') defined in the def-vars section. When the attribute
# 'caseless' is set to 'yes', the comparison is made without attending
# to the case.
@@ -208,7 +208,7 @@
attlist.begins-with-list &= attribute caseless { "no" | "yes" }?
# Tests if the left part contains the right part at the beginning.
# First parts of the test may be a clip (see below), a
-# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
# a variable ('var') defined in the def-vars section. The second part
# must be always a list. When the attribute
# 'caseless' is set to 'yes', the comparison is made without attending
@@ -218,7 +218,7 @@
attlist.ends-with-list &= attribute caseless { "no" | "yes" }?
# Tests if the left part contains the right part at the end.
# First parts of the test may be a clip (see below), a
-# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
# a variable ('var') defined in the def-vars section. The second part
# must be always a list. When the attribute
# 'caseless' is set to 'yes', the comparison is made without attending
@@ -230,7 +230,7 @@
attlist.contains-substring &= attribute caseless { "no" | "yes" }?
# Tests if the left part contains the right part.
# Both parts of the test may both be a clip (see below), a
-# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
# a variable ('var') defined in the def-vars section. When the attribute
# 'caseless' is set to 'yes', the comparison is made without attending
# to the case.
@@ -245,14 +245,14 @@
let = element let { attlist.let, container, value }
attlist.let &= empty
# An assignment statement ('let') assigns the value of a clip (see
-# below), a literal string ('lit'), a literal tag('lit-tag') or the
-# value of a global variable ('var') to either a global variable ('var')
+# below), a literal string ('lit'), a literal tag('lit-tag') or the
+# value of a global variable ('var') to either a global variable ('var')
# or a clip
append = element append { attlist.append, value+ }
attlist.append &= attribute n { xsd:IDREF }
# This instruction appends the value of a clip (see
-# below), a literal string ('lit'), a literal tag('lit-tag') or the
-# value of a global variable ('var') to either a global variable ('var')
+# below), a literal string ('lit'), a literal tag('lit-tag') or the
+# value of a global variable ('var') to either a global variable ('var')
# or a clip, identified by the "n" attribute
out = element out { attlist.out, (b | lu | mlu | var)+ }
attlist.out &= attribute c { text }?
@@ -260,7 +260,7 @@
modify-case =
element modify-case { attlist.modify-case, container, stringvalue }
attlist.modify-case &= empty
-# The first argument of 'modify-case' copy the case of the second
+# The first argument of 'modify-case' copy the case of the second
# argument.
call-macro = element call-macro { attlist.call-macro, with-param* }
attlist.call-macro &= attribute n { xsd:IDREF }
@@ -274,7 +274,7 @@
# it may be used with arguments 1 and 2 in a noun-adjective rule,
# with arguments 2, 3 and 1 in a determiner-noun-adjective rule, with
# arguments 1 and 3 in a noun-adverb-adjective rule, and with
-# arguments 2 and 1 in an adjective-noun rule
+# arguments 2 and 1 in an adjective-noun rule
clip = element clip { attlist.clip, empty }
attlist.clip &=
attribute pos { text },
@@ -282,10 +282,10 @@
attribute c { text }?
# A 'clip' is a substring of a source-language or target-language
# lexical form, extracted according to an attribute:
-#
+#
# * 'pos' is an index (1, 2, 3...) used to select a lexical form
# inside the rule;
-#
+#
# * the value of 'part' is the name of an attribute defined in
# def-attrs, but may take also the values 'lem' (referring to
# the lemma of the lexical form), 'lemh' (lemma head), 'lemq'
@@ -305,7 +305,7 @@
# of the variable. When it is in an 'out', a 'test', or the right
# part of a 'let', it represents the value of the variable; when in
# the left part of a 'let' it represents the reference of the
-# variable.
+# variable.
get-case-from =
element get-case-from { attlist.get-case-from, (clip | lit | var) }
attlist.get-case-from &= attribute pos { text }
@@ -315,13 +315,13 @@
attlist.case-of &=
attribute pos { text },
attribute part { text }
-# A 'case-of' is a value representing the case of a "clip". This value
+# A 'case-of' is a value representing the case of a "clip". This value
# will be "aa" (all lowercase), "Aa" (first uppercase) and "AA",
# (all uppercase).
-#
+#
# * 'pos' is an index (1, 2, 3...) used to select a lexical form
# inside the rule;
-#
+#
# * the value of 'part' is the name of an attribute defined in
# def-attrs, but may take also the values 'lem' (referring to
# the lemma of the lexical form), 'lemh' (lemma head), 'lemq'
diff -Nru apertium-3.5.1/apertium/postchunk.rng apertium-3.5.2/apertium/postchunk.rng
--- apertium-3.5.1/apertium/postchunk.rng 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/postchunk.rng 2018-09-11 10:38:25.000000000 +0000
@@ -1,24 +1,24 @@
@@ -96,7 +96,7 @@
@@ -143,7 +143,7 @@
@@ -159,7 +159,7 @@
@@ -179,7 +179,7 @@
@@ -266,7 +266,7 @@
@@ -282,8 +282,8 @@
@@ -297,10 +297,10 @@
@@ -323,9 +323,9 @@
@@ -373,7 +373,7 @@
@@ -420,7 +420,7 @@
@@ -522,7 +522,7 @@
@@ -721,8 +721,8 @@
@@ -755,7 +755,7 @@
@@ -791,7 +791,7 @@
it may be used with arguments 1 and 2 in a noun-adjective rule,
with arguments 2, 3 and 1 in a determiner-noun-adjective rule, with
arguments 1 and 3 in a noun-adverb-adjective rule, and with
- arguments 2 and 1 in an adjective-noun rule
+ arguments 2 and 1 in an adjective-noun rule
-->
@@ -809,15 +809,15 @@
@@ -861,7 +861,7 @@
of the variable. When it is in an 'out', a 'test', or the right
part of a 'let', it represents the value of the variable; when in
the left part of a 'let' it represents the reference of the
- variable.
+ variable.
-->
@@ -891,13 +891,13 @@
@@ -13,165 +13,165 @@
-
+
-
+
-
-
+
+
-
+
-
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
+
-
+
-
+
-
+
-
-
+
+
-
-
+
+
-
+
-
+
-
-
+
+
-
-
+
+
-
-
+
+
-
+
-
+
-
+
-
+
-
+
-
+
-
-
+
+
-
+
-
+
-
+
-
+
-
-
+
+
-
+
-
+
-
-
+
+
-
+
-
-
-
-
+
+
+
+
-
+
@@ -189,7 +189,7 @@
-
+
@@ -273,7 +273,7 @@
-
+
diff -Nru apertium-3.5.1/apertium/rtf-format-cp1251.xml apertium-3.5.2/apertium/rtf-format-cp1251.xml
--- apertium-3.5.1/apertium/rtf-format-cp1251.xml 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/rtf-format-cp1251.xml 2018-09-11 10:38:25.000000000 +0000
@@ -5,7 +5,7 @@
-
+
@@ -13,165 +13,165 @@
-
+
-
+
-
-
+
+
-
+
-
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
+
-
+
-
+
-
+
-
-
+
+
-
-
+
+
-
+
-
+
-
-
+
+
-
-
+
+
-
-
+
+
-
+
-
+
-
+
-
+
-
+
-
+
-
-
+
+
-
+
-
+
-
+
-
+
-
-
+
+
-
+
-
+
-
-
+
+
-
+
-
-
-
-
+
+
+
+
-
+
@@ -189,7 +189,7 @@
-
+
@@ -284,7 +284,7 @@
-
+
diff -Nru apertium-3.5.1/apertium/rtf-format.xml apertium-3.5.2/apertium/rtf-format.xml
--- apertium-3.5.1/apertium/rtf-format.xml 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/rtf-format.xml 2018-09-11 10:38:25.000000000 +0000
@@ -5,7 +5,7 @@
-
+
@@ -13,165 +13,165 @@
-
+
-
+
-
-
+
+
-
+
-
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
-
+
-
+
-
+
-
+
-
-
+
+
-
-
+
+
-
+
-
+
-
-
+
+
-
-
+
+
-
-
+
+
-
+
-
+
-
+
-
+
-
+
-
+
-
-
+
+
-
+
-
+
-
+
-
+
-
-
+
+
-
+
-
+
-
-
+
+
-
+
-
-
-
-
+
+
+
+
-
+
@@ -185,19 +185,19 @@
-
+
-
+
-
+
-
+
-
+
@@ -211,8 +211,8 @@
-
-
+
+
@@ -467,7 +467,7 @@
-
+
diff -Nru apertium-3.5.1/apertium/sentence_stream.cc apertium-3.5.2/apertium/sentence_stream.cc
--- apertium-3.5.1/apertium/sentence_stream.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/sentence_stream.cc 2018-09-11 10:38:25.000000000 +0000
@@ -89,6 +89,9 @@
bool skip_on_error, bool sent_seg)
: sent_seg(sent_seg), skipped(0)
{
+ std::random_device rd;
+ rnd.seed(rd());
+
TrainingSentence *training_sentence;
bool was_sentence_end = true;
unsigned int tagged_line = 0;
@@ -182,7 +185,7 @@
void TrainingCorpus::shuffle()
{
- random_shuffle(sentences.begin(), sentences.end());
+ std::shuffle(sentences.begin(), sentences.end(), rnd);
}
}
diff -Nru apertium-3.5.1/apertium/sentence_stream.h apertium-3.5.2/apertium/sentence_stream.h
--- apertium-3.5.1/apertium/sentence_stream.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/sentence_stream.h 2018-09-11 10:38:25.000000000 +0000
@@ -2,6 +2,7 @@
#define _SENTENCE_STREAM_H
#include
+#include <random>
#include
#include
#include
@@ -42,6 +43,7 @@
bool contToEndOfSent(Stream &stream, StreamedType token,
unsigned int &line);
bool sent_seg;
+ std::mt19937 rnd;
public:
unsigned int skipped;
TrainingCorpus(Stream &tagged, Stream &untagged, bool skip_on_error, bool sent_seg);
diff -Nru apertium-3.5.1/apertium/stream.cc apertium-3.5.2/apertium/stream.cc
--- apertium-3.5.1/apertium/stream.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/stream.cc 2018-09-11 10:38:25.000000000 +0000
@@ -303,7 +303,7 @@
continue;
case L'/':
case L'#':
- //std::wcerr << L"[306] Character: " << Character_ << L"||| Lemma: " << Lemma << std::endl ;
+ //std::wcerr << L"[306] Character: " << Character_ << L"||| Lemma: " << Lemma << std::endl ;
case L'+':
if (ThePreviousCase->isPreviousCharacter) {
std::wstringstream Message;
@@ -388,7 +388,7 @@
std::abort();
case L'#':
- //std::wcerr << L"[391] Character: " << Character_ << L"||| Lemma: " << Lemma << std::endl ;
+ //std::wcerr << L"[391] Character: " << Character_ << L"||| Lemma: " << Lemma << std::endl ;
if (ThePreviousCase) {
switch (ThePreviousCase->ThePreviousCase) {
case L'[':
@@ -437,7 +437,7 @@
ThePreviousCase = PreviousCaseType(Character_);
push_back_Character(TheStreamedType, Lemma, Character_);
- //std::wcerr << L"[440] Character: " << Character_ << L"||| Lemma: " << Lemma << std::endl ;
+ //std::wcerr << L"[440] Character: " << Character_ << L"||| Lemma: " << Lemma << std::endl ;
continue;
}
diff -Nru apertium-3.5.1/apertium/string_utils.cc apertium-3.5.2/apertium/string_utils.cc
--- apertium-3.5.1/apertium/string_utils.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/string_utils.cc 2018-09-11 10:38:25.000000000 +0000
@@ -26,9 +26,9 @@
#endif
//Delete white spaces from the end and the begining of the string
-wstring
-StringUtils::trim(wstring const &str)
-{
+wstring
+StringUtils::trim(wstring const &str)
+{
if(str == L"")
{
return L"";
@@ -50,12 +50,12 @@
{
end++;
}
-
+
return str.substr(begin, end-begin);
}
vector
-StringUtils::split_wstring(wstring const &input, wstring const &delimiter)
+StringUtils::split_wstring(wstring const &input, wstring const &delimiter)
{
unsigned pos;
int new_pos;
@@ -80,7 +80,7 @@
return result;
}
-wstring
+wstring
StringUtils::vector2wstring(vector const &v)
{
wstring s = L"";
@@ -93,7 +93,7 @@
return s;
}
-wstring
+wstring
StringUtils::substitute(wstring const &source, wstring const &olds, wstring const &news) {
wstring s = source;
diff -Nru apertium-3.5.1/apertium/string_utils.h apertium-3.5.2/apertium/string_utils.h
--- apertium-3.5.1/apertium/string_utils.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/string_utils.h 2018-09-11 10:38:25.000000000 +0000
@@ -36,7 +36,7 @@
class StringUtils {
public:
-
+
static wstring trim(wstring const &str);
static vector split_wstring(wstring const &input, wstring const &delimiter);
@@ -47,9 +47,9 @@
static wstring substitute(const wstring &source, const wstring &olds, const wstring &news);
static wstring itoa(int n);
-
+
static string itoa_string(int n);
-
+
static wstring ftoa(double f);
static wstring tolower(wstring const &s);
diff -Nru apertium-3.5.1/apertium/tagger_data.cc apertium-3.5.2/apertium/tagger_data.cc
--- apertium-3.5.1/apertium/tagger_data.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tagger_data.cc 2018-09-11 10:38:25.000000000 +0000
@@ -31,7 +31,7 @@
enforce_rules = o.enforce_rules;
prefer_rules = o.prefer_rules;
constants = o.constants;
- output = o.output;
+ output = o.output;
plist = o.plist;
}
@@ -92,7 +92,7 @@
TaggerData::setForbidRules(vector &fr)
{
forbid_rules = fr;
-}
+}
map &
TaggerData::getTagIndex()
@@ -111,7 +111,7 @@
{
tag_index = ti;
}
-
+
vector &
TaggerData::getArrayTags()
{
@@ -192,7 +192,7 @@
void
TaggerData::setConstants(ConstantManager const &c)
-{
+{
constants = c;
}
diff -Nru apertium-3.5.1/apertium/tagger_data.h apertium-3.5.2/apertium/tagger_data.h
--- apertium-3.5.1/apertium/tagger_data.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tagger_data.h 2018-09-11 10:38:25.000000000 +0000
@@ -44,14 +44,14 @@
Collection output;
PatternList plist;
vector discard;
-
+
void copy(TaggerData const &o);
public:
TaggerData();
virtual ~TaggerData();
TaggerData(TaggerData const &o);
TaggerData & operator =(TaggerData const &o);
-
+
set & getOpenClass();
const set & getOpenClass() const;
void setOpenClass(set const &oc);
@@ -59,11 +59,11 @@
vector & getForbidRules();
const vector & getForbidRules() const;
void setForbidRules(vector &fr);
-
+
map & getTagIndex();
const map & getTagIndex() const;
void setTagIndex(map const &ti);
-
+
vector & getArrayTags();
const vector & getArrayTags() const;
void setArrayTags(vector const &at);
@@ -75,7 +75,7 @@
vector & getPreferRules();
const vector & getPreferRules() const;
void setPreferRules(vector const &pr);
-
+
vector & getDiscardRules();
const vector & getDiscardRules() const;
void setDiscardRules(vector const &dr);
@@ -83,11 +83,11 @@
ConstantManager & getConstants();
const ConstantManager & getConstants() const;
void setConstants(ConstantManager const &c);
-
+
virtual Collection & getOutput();
const virtual Collection & getOutput() const;
void setOutput(Collection const &c);
-
+
void setPatternList(PatternList const &pl);
PatternList & getPatternList();
const PatternList & getPatternList() const;
diff -Nru apertium-3.5.1/apertium/tagger_data_hmm.cc apertium-3.5.2/apertium/tagger_data_hmm.cc
--- apertium-3.5.1/apertium/tagger_data_hmm.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tagger_data_hmm.cc 2018-09-11 10:38:25.000000000 +0000
@@ -80,7 +80,7 @@
b = NULL;
N = 0;
M = 0;
-
+
TaggerData::copy(o);
}
@@ -95,15 +95,15 @@
}
return *this;
}
-
+
void
-TaggerDataHMM::setProbabilities(int const myN, int const myM,
+TaggerDataHMM::setProbabilities(int const myN, int const myM,
double **myA, double **myB)
{
this->destroy();
N = myN;
M = myM;
-
+
if(N != 0 && M != 0)
{
// NxN matrix
@@ -114,12 +114,12 @@
if(myA != NULL)
{
for(int j = 0; j != N; j++) // ToDo: N should be M? Check use of N and M in this function
- {
+ {
a[i][j] = myA[i][j];
}
}
}
-
+
// NxM matrix
b = new double * [N];
for(int i = 0; i != N; i++)
@@ -138,24 +138,24 @@
{
a = NULL;
b = NULL;
- }
+ }
}
-double **
+double **
TaggerDataHMM::getA()
{
return a;
}
-double **
+double **
TaggerDataHMM::getB()
{
return b;
}
-int
+int
TaggerDataHMM::getN()
-{
+{
return N;
}
@@ -177,7 +177,7 @@
val += Compression::multibyte_read(in);
open_class.insert(val);
}
-
+
// forbid_rules
for(int i = Compression::multibyte_read(in); i != 0; i--)
{
@@ -187,21 +187,21 @@
forbid_rules.push_back(aux);
}
-
+
// array_tags
for(int i = Compression::multibyte_read(in); i != 0; i--)
{
array_tags.push_back(Compression::wstring_read(in));
}
-
+
// tag_index
for(int i = Compression::multibyte_read(in); i != 0; i--)
{
- wstring tmp = Compression::wstring_read(in);
+ wstring tmp = Compression::wstring_read(in);
tag_index[tmp] = Compression::multibyte_read(in);
}
- // enforce_rules
+ // enforce_rules
for(int i = Compression::multibyte_read(in); i != 0; i--)
{
TEnforceAfterRule aux;
@@ -223,13 +223,13 @@
constants.read(in);
// output
- output.read(in);
+ output.read(in);
// dimensions
N = Compression::multibyte_read(in);
M = Compression::multibyte_read(in);
-
+
a = new double * [N];
b = new double * [N];
for(int i = 0; i != N; i++)
@@ -237,7 +237,7 @@
a[i] = new double[N];
b[i] = new double[M];
}
-
+
// read a
for(int i = 0; i != N; i++)
{
@@ -268,16 +268,16 @@
// read pattern list
plist.read(in);
-
+
// read discards on ambiguity
discard.clear();
- unsigned int limit = Compression::multibyte_read(in);
+ unsigned int limit = Compression::multibyte_read(in);
if(feof(in))
{
return;
}
-
+
for(unsigned int i = 0; i < limit; i++)
{
discard.push_back(Compression::wstring_read(in));
@@ -287,17 +287,17 @@
void
TaggerDataHMM::write(FILE *out)
{
-
+
// open_class
- Compression::multibyte_write(open_class.size(), out);
+ Compression::multibyte_write(open_class.size(), out);
int val = 0;
for(set::const_iterator it = open_class.begin(), limit = open_class.end();
it != limit; it++)
{
- Compression::multibyte_write(*it-val, out);
+ Compression::multibyte_write(*it-val, out);
val = *it;
}
-
+
// forbid_rules
Compression::multibyte_write(forbid_rules.size(), out);
for(unsigned int i = 0, limit = forbid_rules.size(); i != limit; i++)
@@ -305,7 +305,7 @@
Compression::multibyte_write(forbid_rules[i].tagi, out);
Compression::multibyte_write(forbid_rules[i].tagj, out);
}
-
+
// array_tags
Compression::multibyte_write(array_tags.size(), out);
for(unsigned int i = 0, limit = array_tags.size(); i != limit; i++)
@@ -321,7 +321,7 @@
Compression::wstring_write(it->first, out);
Compression::multibyte_write(it->second, out);
}
-
+
// enforce_rules
Compression::multibyte_write(enforce_rules.size(), out);
for(unsigned int i = 0, limit = enforce_rules.size(); i != limit; i++)
@@ -340,9 +340,9 @@
{
Compression::wstring_write(prefer_rules[i], out);
}
-
+
// constants
- constants.write(out);
+ constants.write(out);
// output
output.write(out);
@@ -359,7 +359,7 @@
}
// b matrix, writing only useful values
-
+
int nval = 0;
for(int i = 0; i != N; i++)
{
@@ -384,13 +384,13 @@
EndianDoubleUtil::write(out, b[i][j]);
}
}
- }
-
+ }
+
// write pattern list
plist.write(out);
// write discard list
-
+
if(discard.size() != 0)
{
Compression::multibyte_write(discard.size(), out);
@@ -398,6 +398,6 @@
{
Compression::wstring_write(discard[i], out);
}
- }
+ }
}
diff -Nru apertium-3.5.1/apertium/tagger_data_hmm.h apertium-3.5.2/apertium/tagger_data_hmm.h
--- apertium-3.5.1/apertium/tagger_data_hmm.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tagger_data_hmm.h 2018-09-11 10:38:25.000000000 +0000
@@ -34,15 +34,15 @@
TaggerDataHMM(TaggerDataHMM const &o);
TaggerDataHMM(TaggerData const &o);
TaggerDataHMM & operator =(TaggerDataHMM const &o);
-
- virtual void setProbabilities(int const myN, int const myM,
+
+ virtual void setProbabilities(int const myN, int const myM,
double **myA = NULL, double **myB = NULL);
virtual double ** getA();
virtual double ** getB();
virtual int getN();
virtual int getM();
-
+
virtual void read(FILE *in);
virtual void write(FILE *out);
};
diff -Nru apertium-3.5.1/apertium/tagger_data_lsw.cc apertium-3.5.2/apertium/tagger_data_lsw.cc
--- apertium-3.5.1/apertium/tagger_data_lsw.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tagger_data_lsw.cc 2018-09-11 10:38:25.000000000 +0000
@@ -35,7 +35,7 @@
delete [] d;
}
d = NULL;
-
+
N = 0;
}
@@ -104,9 +104,9 @@
return d;
}
-int
+int
TaggerDataLSW::getN()
-{
+{
return N;
}
@@ -122,7 +122,7 @@
val += Compression::multibyte_read(in);
open_class.insert(val);
}
-
+
// forbid_rules
for(int i = Compression::multibyte_read(in); i != 0; i--)
{
@@ -132,21 +132,21 @@
forbid_rules.push_back(aux);
}
-
+
// array_tags
for(int i = Compression::multibyte_read(in); i != 0; i--)
{
array_tags.push_back(Compression::wstring_read(in));
}
-
+
// tag_index
for(int i = Compression::multibyte_read(in); i != 0; i--)
{
- wstring tmp = Compression::wstring_read(in);
+ wstring tmp = Compression::wstring_read(in);
tag_index[tmp] = Compression::multibyte_read(in);
}
- // enforce_rules
+ // enforce_rules
for(int i = Compression::multibyte_read(in); i != 0; i--)
{
TEnforceAfterRule aux;
@@ -168,7 +168,7 @@
constants.read(in);
// output
- output.read(in);
+ output.read(in);
// dimensions
N = Compression::multibyte_read(in);
@@ -197,19 +197,19 @@
int k = Compression::multibyte_read(in);
d[i][j][k] = EndianDoubleUtil::read(in);
}
-
+
// read pattern list
plist.read(in);
-
+
// read discards on ambiguity
discard.clear();
- unsigned int limit = Compression::multibyte_read(in);
+ unsigned int limit = Compression::multibyte_read(in);
if(feof(in))
{
return;
}
-
+
for(unsigned int i = 0; i < limit; i++)
{
discard.push_back(Compression::wstring_read(in));
@@ -219,17 +219,17 @@
void
TaggerDataLSW::write(FILE *out)
{
-
+
// open_class
- Compression::multibyte_write(open_class.size(), out);
+ Compression::multibyte_write(open_class.size(), out);
int val = 0;
for(set::const_iterator it = open_class.begin(), limit = open_class.end();
it != limit; it++)
{
- Compression::multibyte_write(*it-val, out);
+ Compression::multibyte_write(*it-val, out);
val = *it;
}
-
+
// forbid_rules
Compression::multibyte_write(forbid_rules.size(), out);
for(unsigned int i = 0, limit = forbid_rules.size(); i != limit; i++)
@@ -237,7 +237,7 @@
Compression::multibyte_write(forbid_rules[i].tagi, out);
Compression::multibyte_write(forbid_rules[i].tagj, out);
}
-
+
// array_tags
Compression::multibyte_write(array_tags.size(), out);
for(unsigned int i = 0, limit = array_tags.size(); i != limit; i++)
@@ -253,7 +253,7 @@
Compression::wstring_write(it->first, out);
Compression::multibyte_write(it->second, out);
}
-
+
// enforce_rules
Compression::multibyte_write(enforce_rules.size(), out);
for(unsigned int i = 0, limit = enforce_rules.size(); i != limit; i++)
@@ -272,9 +272,9 @@
{
Compression::wstring_write(prefer_rules[i], out);
}
-
+
// constants
- constants.write(out);
+ constants.write(out);
// output
output.write(out);
@@ -306,12 +306,12 @@
}
}
}
-
+
// write pattern list
plist.write(out);
// write discard list
-
+
if(discard.size() != 0)
{
Compression::multibyte_write(discard.size(), out);
@@ -319,6 +319,6 @@
{
Compression::wstring_write(discard[i], out);
}
- }
+ }
}
diff -Nru apertium-3.5.1/apertium/tagger_data_lsw.h apertium-3.5.2/apertium/tagger_data_lsw.h
--- apertium-3.5.1/apertium/tagger_data_lsw.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tagger_data_lsw.h 2018-09-11 10:38:25.000000000 +0000
@@ -24,7 +24,7 @@
private:
int N;
double ***d;
-
+
void destroy();
public:
@@ -33,12 +33,12 @@
TaggerDataLSW(TaggerDataLSW const &o);
TaggerDataLSW(TaggerData const &o);
TaggerDataLSW & operator =(TaggerDataLSW const &o);
-
+
void setProbabilities(int const myN, double ***myD = NULL);
virtual double *** getD();
virtual int getN();
-
+
void read(FILE *in);
void write(FILE *out);
};
diff -Nru apertium-3.5.1/apertium/tagger.dtd apertium-3.5.2/apertium/tagger.dtd
--- apertium-3.5.1/apertium/tagger.dtd 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tagger.dtd 2018-09-11 10:38:25.000000000 +0000
@@ -1,23 +1,23 @@
-
@@ -30,9 +30,9 @@
-
@@ -41,8 +41,8 @@
c CDATA #IMPLIED
closed CDATA #IMPLIED>
-
@@ -72,7 +72,7 @@
@@ -81,23 +81,23 @@
c CDATA #IMPLIED>
-
-
@@ -111,8 +111,8 @@
-
@@ -120,23 +120,23 @@
-
-
@@ -151,7 +151,7 @@
-
diff -Nru apertium-3.5.1/apertium/tagger.rnc apertium-3.5.2/apertium/tagger.rnc
--- apertium-3.5.1/apertium/tagger.rnc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tagger.rnc 2018-09-11 10:38:25.000000000 +0000
@@ -1,20 +1,20 @@
# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
-#
+#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
-#
+#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
-#
+#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see .
-#
+#
# DTD for the tagset and the rules to enforce the state to state
-# transition probabilities used by the part-of-speech tagger.
+# transition probabilities used by the part-of-speech tagger.
# 2005.07.29.
tagger =
@@ -31,15 +31,15 @@
# language specified through the mandatory attribute 'name'
tagset = element tagset { attlist.tagset, def-label+, def-mult* }
attlist.tagset &= empty
-# The 'tagset' section defines the correspondance between simple
-# or multiple morphological categories defining a lexical form and the coarser
+# The 'tagset' section defines the correspondance between simple
+# or multiple morphological categories defining a lexical form and the coarser
# ones with which the part-of-speech tagger works
def-label = element def-label { attlist.def-label, tags-item+ }
attlist.def-label &=
attribute name { text },
attribute c { text }?,
attribute closed { text }?
-# Each 'def-label' defines one coarse tag in terms of a list of fine tags
+# Each 'def-label' defines one coarse tag in terms of a list of fine tags
# and has a mandatory unique name. The optional attribute 'closed="true"' may be used
# to specify if the defined fine tags belong to a closed list.
# c is for comments and is ignored
@@ -48,37 +48,37 @@
attribute tags { text },
attribute lemma { text }?
# Each 'tags-item' may be a dot-separated subsequence of the morphological tags
-# corresponding to a coarse tag optionally in association with a given lemma
+# corresponding to a coarse tag optionally in association with a given lemma
def-mult = element def-mult { attlist.def-mult, sequence+ }
attlist.def-mult &=
attribute name { text },
attribute c { text }?,
attribute closed { text }?
# Each 'def-mult' defines one coarse tag in terms of a sequence of coarse
-# tags previously defined as 'def-labels' or a sequence of fine tags. A mandatory
-# name is required for each 'def-mult' which may also has an optional attribute
+# tags previously defined as 'def-labels' or a sequence of fine tags. A mandatory
+# name is required for each 'def-mult' which may also has an optional attribute
# 'closed="true"' if it belongs to a closed list
# c is for comments and is ignored
sequence =
element sequence { attlist.sequence, (tags-item | label-item)+ }
attlist.sequence &= empty
-# Element 'sequence' encloses a set of tags or labels which defines
+# Element 'sequence' encloses a set of tags or labels which defines
# a unit with more than one label
label-item = element label-item { attlist.label-item, empty }
attlist.label-item &=
attribute label { text },
attribute c { text }?
-# Each 'label' of the 'label-item' correspond to a coarse tag previously
+# Each 'label' of the 'label-item' correspond to a coarse tag previously
# defined as a 'def-label' by a name.
# c is for comments and is ignored
forbid = element forbid { attlist.forbid, label-sequence+ }
attlist.forbid &= empty
-# Element 'forbid' contains sequences of morphological categories that are not
+# Element 'forbid' contains sequences of morphological categories that are not
# allowed in a given language
label-sequence =
element label-sequence { attlist.label-sequence, label-item+ }
attlist.label-sequence &= attribute c { text }?
-# Each 'label-sequence' is restricted to two 'label-items'
+# Each 'label-sequence' is restricted to two 'label-items'
# c is for comments and is ignored
enforce-rules =
element enforce-rules { attlist.enforce-rules, enforce-after+ }
@@ -89,22 +89,22 @@
attlist.enforce-after &=
attribute label { text },
attribute c { text }?
-# Each 'enforce-after' encloses the set of coarse tags ('label-set') that must follow
+# Each 'enforce-after' encloses the set of coarse tags ('label-set') that must follow
# the one defined in 'label', as a mandatory attribute
# c is for comments and is ignored
label-set = element label-set { attlist.label-set, label-item+ }
attlist.label-set &= attribute c { text }?
-# The set of 'label-items' enforced after a 'label' are enclosed inside element 'label-set'
+# The set of 'label-items' enforced after a 'label' are enclosed inside element 'label-set'
# c is for comments and is ignored
preferences = element preferences { attlist.preferences, prefer+ }
attlist.preferences &= empty
-# Element 'preferences' allows to decide amongst two or more fine tag sequences
-# which are grouped in the same coarse tag.
+# Element 'preferences' allows to decide amongst two or more fine tag sequences
+# which are grouped in the same coarse tag.
prefer = element prefer { attlist.prefer, empty }
attlist.prefer &=
attribute tags { text },
attribute c { text }?
-# Each 'prefer' element has a mandatory attribute 'tags' made of a sequence of fine tags
+# Each 'prefer' element has a mandatory attribute 'tags' made of a sequence of fine tags
# c is for comments and is ignored
discard-on-ambiguity =
element discard-on-ambiguity {
@@ -118,5 +118,5 @@
attribute tags { text },
attribute c { text }?
start = tagger
-# Each 'discard' element has a mandatory attribute 'tags' made of a sequence of fine tags
+# Each 'discard' element has a mandatory attribute 'tags' made of a sequence of fine tags
# c is for comments and is ignored
diff -Nru apertium-3.5.1/apertium/tagger.rng apertium-3.5.2/apertium/tagger.rng
--- apertium-3.5.1/apertium/tagger.rng 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tagger.rng 2018-09-11 10:38:25.000000000 +0000
@@ -1,22 +1,22 @@
@@ -60,8 +60,8 @@
@@ -82,7 +82,7 @@
@@ -122,8 +122,8 @@
@@ -142,7 +142,7 @@
@@ -158,7 +158,7 @@
@@ -174,7 +174,7 @@
@@ -191,7 +191,7 @@
@@ -219,7 +219,7 @@
@@ -237,7 +237,7 @@
@@ -252,8 +252,8 @@
@@ -268,7 +268,7 @@
@@ -305,6 +305,6 @@
diff -Nru apertium-3.5.1/apertium/tagger_utils.cc apertium-3.5.2/apertium/tagger_utils.cc
--- apertium-3.5.1/apertium/tagger_utils.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tagger_utils.cc 2018-09-11 10:38:25.000000000 +0000
@@ -44,12 +44,12 @@
exit(1);
}
-void tagger_utils::file_name_error (string const &s) {
+void tagger_utils::file_name_error (string const &s) {
wcerr << "Error: " << s << endl;
exit(1);
}
-char * tagger_utils::itoa(int i) {
+char * tagger_utils::itoa(int i) {
static char buf[512];
sprintf(buf,"%d",i);
return buf;
@@ -65,62 +65,62 @@
v[i].clear();
}
-int tagger_utils::ntokens_multiword(wstring const &s)
+int tagger_utils::ntokens_multiword(wstring const &s)
{
wchar_t *news = new wchar_t[s.size()+1];
wcscpy(news, s.c_str());
news[s.size()] = 0;
wcerr << news << endl;
-
+
wchar_t const *delim = L"_";
wchar_t *ptr;
int n=0;
-
+
if (wcstok(news, delim, &ptr))
- n++;
+ n++;
while (wcstok(NULL, delim, &ptr))
n++;
-
+
delete[] news;
-
- return n;
+
+ return n;
}
-
+
int tagger_utils::nguiones_fs(wstring const & s) {
wchar_t *news = new wchar_t[s.size()+1];
wcscpy(news, s.c_str());
news[s.size()] = 0;
- wcerr << news << endl;
+ wcerr << news << endl;
wchar_t const *delim = L"-";
wchar_t *ptr;
int n=0;
-
+
if (wcstok(news, delim, &ptr))
- n++;
+ n++;
while (wcstok(NULL, delim, &ptr))
n++;
-
+
delete[] news;
-
- return n;
-}
-wstring tagger_utils::trim(wstring s)
+ return n;
+}
+
+wstring tagger_utils::trim(wstring s)
{
if (s.length()==0)
return L"";
-
+
for (unsigned int i=0; i<(s.length()-1); i++) {
if ((s.at(i)==L' ')&&(s.at(i+1)==L' ')) {
s.erase(i,1);
i--;
}
}
-
+
if ((s.length()>0)&&(s.at(s.length()-1)==L' '))
s.erase(s.length()-1,1);
if ((s.length()>0)&&(s.at(0)==L' '))
- s.erase(0,1);
+ s.erase(0,1);
return s;
}
@@ -251,7 +251,7 @@
ostream& operator<< (ostream& os, const map & f){
typename map ::const_iterator it;
os<first<<' '<second;
return os;
}
@@ -260,7 +260,7 @@
istream& operator>> (istream& is, map & f) {
int n, i, k;
f.clear();
- is>>n;
+ is>>n;
for (k=0; k>i; // warning: does not work if both
is>>f[i]; // lines merged in a single one
diff -Nru apertium-3.5.1/apertium/tagger_utils.h apertium-3.5.2/apertium/tagger_utils.h
--- apertium-3.5.1/apertium/tagger_utils.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tagger_utils.h 2018-09-11 10:38:25.000000000 +0000
@@ -64,7 +64,7 @@
/** Return the number of tokens in the multiword unit
*/
int ntokens_multiword(wstring const &s);
-
+
/** Devuelve el nº de guiones que contiene la cadena pasada como argumento
*/
int nguiones_fs(wstring const &cadena);
diff -Nru apertium-3.5.1/apertium/tagger_word.cc apertium-3.5.2/apertium/tagger_word.cc
--- apertium-3.5.1/apertium/tagger_word.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tagger_word.cc 2018-09-11 10:38:25.000000000 +0000
@@ -66,7 +66,7 @@
superficial_form = sf;
}
-wstring&
+wstring&
TaggerWord::get_superficial_form() {
return superficial_form;
}
@@ -81,7 +81,7 @@
{
string utfpattern = UtfConverter::toUtf8(pattern);
string regexp = "";
-
+
while(true)
{
size_t pos = utfpattern.find("<*>");
@@ -112,7 +112,7 @@
//Take a look at the prefer rules
for(int i=0; i < (int) prefer_rules.size(); i++)
{
- if (match(lf, prefer_rules[i]))
+ if (match(lf, prefer_rules[i]))
{
lexical_forms[t]=lf;
break;
@@ -136,16 +136,16 @@
TaggerWord::get_string_tags() {
wstring st;
set::iterator itag = tags.begin();
-
- st=L"{";
+
+ st=L"{";
for(itag=tags.begin(); itag!=tags.end(); itag++) {
if (itag!=tags.begin())
st+=L',';
st+=array_tags[*itag];
}
- st += L'}';
-
- return st;
+ st += L'}';
+
+ return st;
}
wstring
@@ -154,7 +154,7 @@
if (show_ignored_string)
ret.append(ignored_string);
-
+
if(t==TAG_kEOF)
return ret;
@@ -170,7 +170,7 @@
if(get_show_sf()){ // append the superficial form
ret.append(superficial_form);
- ret+=L'/';
+ ret+=L'/';
}
}
@@ -188,12 +188,12 @@
} else {
ret.append(lexical_forms[t]);
}
-
+
if (ret != ignored_string) {
if (plus_cut)
ret+=L'+';
else {
- ret += L'$';
+ ret += L'$';
}
}
@@ -203,20 +203,20 @@
// //o '/'. De no hacerlo en la traducción aparecerán dos palabras sin blanco alguno.
// ret+=" "; //Quizá este no sea el sitio apropiado para hacer esto, lo suyo sería un módulo
// //antes del tagger o del anmor.
-
+
return ret;
}
-wstring
+wstring
TaggerWord::get_all_chosen_tag_first(TTag &t, int const TAG_kEOF) {
wstring ret=L"";
if (show_ignored_string)
ret.append(ignored_string);
-
+
if(t==TAG_kEOF)
return ret;
-
+
if (!previous_plus_cut)
{
if(TaggerWord::generate_marks && isAmbiguous())
@@ -228,9 +228,9 @@
ret += L'^';
}
}
-
+
ret.append(superficial_form);
-
+
if (lexical_forms.size()==0) { // This is an UNKNOWN WORD
ret+=L"/*";
ret.append(superficial_form);
@@ -247,7 +247,7 @@
}
}
}
-
+
if (ret != ignored_string) {
if (plus_cut)
ret+=L"+";
@@ -255,7 +255,7 @@
ret+=L"$";
}
}
-
+
return ret;
}
@@ -263,10 +263,10 @@
wstring
TaggerWord::get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF) {
wstring ret;
-
+
if(t==TAG_kEOF)
return ret;
-
+
if (lexical_forms.size()==0) { //This is an unknown word
ret.append(L"*^");
ret.append(superficial_form);
@@ -277,12 +277,12 @@
ret += L'^';
ret.append(lexical_forms[t]);
}
-
+
if (ret.length() != 0) {
if (plus_cut)
ret+=L'+';
else {
- ret +=L'$';
+ ret +=L'$';
}
}
@@ -294,7 +294,7 @@
ignored_string.append(s);
}
-void
+void
TaggerWord::set_plus_cut(const bool &c) {
plus_cut=c;
}
@@ -310,7 +310,7 @@
return os;
}
-void
+void
TaggerWord::setArrayTags(vector const &at)
{
array_tags = at;
@@ -368,7 +368,7 @@
{
newsettag.insert(it->first);
}
-
+
if(lexical_forms.size() == 1)
{
newsettag.insert(lexical_forms.begin()->first);
@@ -377,7 +377,7 @@
it++;
}
if(tags.size() != newsettag.size())
- {
+ {
this->tags = newsettag;
}
}
diff -Nru apertium-3.5.1/apertium/tagger_word.h apertium-3.5.2/apertium/tagger_word.h
--- apertium-3.5.1/apertium/tagger_word.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tagger_word.h 2018-09-11 10:38:25.000000000 +0000
@@ -33,16 +33,16 @@
* It stores the superficial form and all possible tags that it can receive.
* It has the fine tags delivered by the morphological analyzer and the coarse
* ones used by the PoS tagger.
- */
+ */
class TaggerWord{
private:
- wstring superficial_form;
-
+ wstring superficial_form;
+
set tags; //Set of all possible tags
- map lexical_forms; //For a given coarse tag it stores the fine tag
+ map lexical_forms; //For a given coarse tag it stores the fine tag
//delevered by the morphological analyzer
wstring ignored_string;
-
+
bool plus_cut; //Flag to distinguish the way in which the word was ended.
//If it was done by '$' its value should be false
//If it was done by '+' its value should be true
@@ -59,31 +59,31 @@
static bool show_ignored_string;
- /**
- * Constructor
+ /**
+ * Constructor
*/
TaggerWord(bool prev_plus_cut=false);
-
- /**
+
+ /**
* Copy constructor
*/
TaggerWord(const TaggerWord &w);
-
- /**
- * Destructor
+
+ /**
+ * Destructor
*/
virtual ~TaggerWord();
-
+
/** Set the superficial form of the word.
* @param s the superficial form
*/
void set_superficial_form(const wstring &s);
-
+
/** Get the superficial form of the word
*
*/
wstring& get_superficial_form();
-
+
/** Add a new tag to the set of all possible tags of the word.
* @param t the coarse tag
* @param lf the lexical form (fine tag)
@@ -92,35 +92,35 @@
/** Get the set of tags of this word.
* @return set of tags.
- */
+ */
virtual set& get_tags();
-
+
/** Get a wstring with the set of tags
*/
virtual wstring get_string_tags();
-
+
/** Get the lexical form (fine tag) for a given tag (coarse one)
* @param t the tag
* @return the lexical form of tag t
*/
- virtual wstring get_lexical_form(TTag &t, int const TAG_kEOF);
+ virtual wstring get_lexical_form(TTag &t, int const TAG_kEOF);
wstring get_all_chosen_tag_first(TTag &t, int const TAG_kEOF);
-
+
/** Get the lexical form (fine tag) for a given tag (coarse one)
* @param t the tag
* @return the lexical form of tag t without other text that
* is ignored.
- */
- wstring get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF);
-
+ */
+ wstring get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF);
+
/** Add text to the ignored string
*
- */
+ */
void add_ignored_string(wstring const &s);
-
+
/** Set the flag plus_cut to a certain value. If this flag is set to true means
- * that there were a '+' between this word and the next one
+ * that there were a '+' between this word and the next one
*/
void set_plus_cut(const bool &c);
@@ -132,20 +132,20 @@
/** Get the value of the plus_cut flag */
bool get_plus_cut();
-
+
/** Output operator
*/
friend wostream& operator<< (wostream& os, TaggerWord &w);
-
+
static void setArrayTags(vector const &at);
void print();
-
+
void outputOriginal(FILE *output);
-
- bool isAmbiguous() const; // CAUTION: unknown words are not considered to
+
+ bool isAmbiguous() const; // CAUTION: unknown words are not considered to
// be ambiguous by this method
-
+
void discardOnAmbiguity(wstring const &tags);
};
diff -Nru apertium-3.5.1/apertium/tmx_aligner_tool.cc apertium-3.5.2/apertium/tmx_aligner_tool.cc
--- apertium-3.5.1/apertium/tmx_aligner_tool.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tmx_aligner_tool.cc 2018-09-11 10:38:25.000000000 +0000
@@ -449,7 +449,7 @@
( dictionary, huSentenceListPretty, enSentenceList, alignParameters, std::cout );
// std::wcerr << "Quality " << globalQuality << std::endl ;
-
+
}
else
{
@@ -556,7 +556,7 @@
args.read( argC, argV, remains );
AlignParameters alignParameters;
-
+
if (args.getSwitchCompact("text"))
{
alignParameters.justSentenceIds = false;
@@ -666,7 +666,7 @@
{
const char* batchFilename = remains[1] ;
std::ifstream bis(batchFilename);
-
+
while (bis.good()&&!bis.eof())
{
std::string line;
diff -Nru apertium-3.5.1/apertium/tmx_aligner_tool.h apertium-3.5.2/apertium/tmx_aligner_tool.h
--- apertium-3.5.1/apertium/tmx_aligner_tool.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tmx_aligner_tool.h 2018-09-11 10:38:25.000000000 +0000
@@ -33,7 +33,7 @@
namespace TMXAligner{
void alignerToolWithFilenames(const DictionaryItems& dictionary,
- const std::string& huFilename,
+ const std::string& huFilename,
const std::string& enFilename,
const AlignParameters& alignParameters,
const std::string& outputFilename = "" );
diff -Nru apertium-3.5.1/apertium/tmx_alignment.h apertium-3.5.2/apertium/tmx_alignment.h
--- apertium-3.5.1/apertium/tmx_alignment.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tmx_alignment.h 2018-09-11 10:38:25.000000000 +0000
@@ -56,7 +56,7 @@
// OBSOLETE:
// Has the exactly same relation to BisentenceList as
-// a TrailValues has to a Trail. But note that these
+// a TrailValues has to a Trail. But note that these
// scores mark the confidence in a bisentence. This is
// very different from the confidence in a rundle.
// typedef std::vector BisentenceValues;
diff -Nru apertium-3.5.1/apertium/tmx_align_parameters.h apertium-3.5.2/apertium/tmx_align_parameters.h
--- apertium-3.5.1/apertium/tmx_align_parameters.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tmx_align_parameters.h 2018-09-11 10:38:25.000000000 +0000
@@ -32,10 +32,10 @@
std::string handAlignFilename;
bool utfCharCountingMode;
-
+
std::string autoDictionaryDumpFilename; // Empty string means do not dump.
-AlignParameters() : justSentenceIds(true),
+AlignParameters() : justSentenceIds(true),
justBisentences(false), cautiousMode(false),
realignType(NoRealign),
qualityThreshold(-100000),
diff -Nru apertium-3.5.1/apertium/tmx_book_to_matrix.cc apertium-3.5.2/apertium/tmx_book_to_matrix.cc
--- apertium-3.5.1/apertium/tmx_book_to_matrix.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tmx_book_to_matrix.cc 2018-09-11 10:38:25.000000000 +0000
@@ -154,7 +154,7 @@
// But we make silly mistakes because we give higher scores to some invalid 1-to-1 segments like this:
// Kocogtam. -Like I said, I was out jogging-- -ObviousIy, you weren't jogging.
// Remember the day that they threw you out?
- //
+ //
// Hopefully Gale-Church scoring compensates for this. Sometimes does not compensate enough.
score /= ( (hu.size()=(unsigned char)192))
{
++length;
diff -Nru apertium-3.5.1/apertium/tmx_book_to_matrix.h apertium-3.5.2/apertium/tmx_book_to_matrix.h
--- apertium-3.5.1/apertium/tmx_book_to_matrix.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tmx_book_to_matrix.h 2018-09-11 10:38:25.000000000 +0000
@@ -36,7 +36,7 @@
double scoreByTranslation( const Phrase& hu, const Phrase& en, const TransLex& transLex );
// This is much-much slower, but instead of identity, uses a many-to-many dictionary.
-// For performance reasons, by convention does not calculate the similarity if the
+// For performance reasons, by convention does not calculate the similarity if the
// alignMatrix element contains outsideOfRadiusValue, a big negative number.
void sentenceListsToAlignMatrixTranslation(
const SentenceList& huSentenceListPretty, const SentenceList& enSentenceList,
diff -Nru apertium-3.5.1/apertium/tmx_builder.cc apertium-3.5.2/apertium/tmx_builder.cc
--- apertium-3.5.1/apertium/tmx_builder.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tmx_builder.cc 2018-09-11 10:38:25.000000000 +0000
@@ -48,14 +48,14 @@
lang2 = l2;
// default values of the parameters
-
+
max_edit = 50;
diagonal_width = 10;
window_size = 100;
step = 75;
percent=0.85;
edit_distance_percent=0.30;
-
+
freference = NULL;
}
@@ -63,11 +63,11 @@
{
}
-wstring
+wstring
TMXBuilder::restOfBlank(FILE *input)
{
wstring result = L"[";
-
+
while(true)
{
wint_t val = fgetwc(input);
@@ -86,25 +86,25 @@
}
result += static_cast(val);
break;
-
+
case L']':
result += L']';
return result;
-
+
default:
result += static_cast(val);
break;
}
}
-
+
return L"";
}
-wstring
+wstring
TMXBuilder::nextBlank(FILE *input)
{
wstring result = L"";
-
+
while(true)
{
wint_t val = fgetwc(input);
@@ -118,11 +118,11 @@
fgetwc(input);
break;
case L'[':
-
+
result = restOfBlank(input);
return result;
}
- }
+ }
}
bool
@@ -130,7 +130,7 @@
{
wstring s1 = nextBlank(f1), s2 = nextBlank(f2);
if(!lazy)
- {
+ {
while(!feof(f1) && !feof(f2))
{
if(s1 != s2)
@@ -140,7 +140,7 @@
s1 = nextBlank(f1);
s2 = nextBlank(f2);
}
- }
+ }
else
{
while(!feof(f1) && !feof(f2))
@@ -151,7 +151,7 @@
}
s1 = nextBlank(f1);
s2 = nextBlank(f2);
- }
+ }
}
return true;
}
@@ -183,7 +183,7 @@
fclose(f1);
return false;
}
-
+
bool retval = compatible(f1, f2, lazy);
fclose(f1);
@@ -196,7 +196,7 @@
{
wstring current_tu = L"";
wstring tmp;
-
+
while(true)
{
wint_t symbol = fgetwc_unlocked(input);
@@ -230,27 +230,27 @@
default:
current_tu += static_cast(symbol);
break;
-
+
case L'[':
tmp = restOfBlank(input);
if(tmp.substr(0,2) == L"[ ")
{
current_tu.append(L" ");
- }
+ }
current_tu.append(L"");
if(tmp.substr(tmp.size()-2, 2) == L" ]")
{
current_tu.append(L" ");
- }
+ }
break;
-
+
case L'.':
current_tu += L'.';
symbol = fgetwc_unlocked(input);
if(symbol != L'[' && !iswspace(symbol))
{
- if(!feof(input))
+ if(!feof(input))
{
ungetwc(symbol, input);
}
@@ -271,14 +271,14 @@
return current_tu.substr(0, idx+1);*/
}
break;
-
+
case L'?':
case L'!':
current_tu += static_cast(symbol);
return current_tu;
}
}
-
+
return current_tu;
}
@@ -286,7 +286,7 @@
TMXBuilder::xmlize(wstring const &str)
{
wstring result = L"";
-
+
for(size_t i = 0, limit = str.size(); i < limit; i++)
{
switch(str[i])
@@ -303,23 +303,23 @@
result.append(L"<");
}
break;
-
+
case L'>':
result.append(L">");
break;
-
+
case L'&':
result.append(L"&");
break;
-
+
default:
result += str[i];
break;
}
}
-
+
// remove leading 's
-
+
bool cambio = true;
while(cambio == true)
{
@@ -336,7 +336,7 @@
}
}
// remove trailing 's
-
+
cambio = true;
while(cambio == true)
{
@@ -352,10 +352,10 @@
cambio = true;
}
}
-
+
// remove trailing punctuation
-
+
for(unsigned int i = result.size()-1; result.size() > 0 && i > 0; i--)
{
if(!isRemovablePunct(result[i]))
@@ -371,10 +371,10 @@
}
return result;
-}
+}
-void
-TMXBuilder::generate(string const &file1, string const &file2,
+void
+TMXBuilder::generate(string const &file1, string const &file2,
string const &outfile)
{
FILE *output = stdout;
@@ -412,8 +412,8 @@
#ifdef _MSC_VER
_setmode(_fileno(f1), _O_U8TEXT);
_setmode(_fileno(f2), _O_U8TEXT);
-#endif
-
+#endif
+
generateTMX(f1, f2, output);
}
@@ -421,12 +421,12 @@
TMXBuilder::reverseList(vector const &v)
{
vector retval(v.size());
-
+
for(int j = v.size() - 1, i = 0; j >=0; j--, i++)
{
retval[i] = v[j];
}
-
+
return retval;
}
@@ -434,30 +434,30 @@
TMXBuilder::sentenceList(FILE *file)
{
vector retval;
-
+
while(true)
{
wstring f = nextTU(file);
if(feof(file))
{
break;
- }
+ }
retval.push_back(f);
}
-
+
return retval;
-}
+}
vector
TMXBuilder::extractFragment(vector const &text, unsigned int base, unsigned int width)
{
vector result;
-
+
for(unsigned int i = base; i < (base + width) && i < text.size(); i++)
{
result.push_back(text[i]);
}
-
+
return result;
}
@@ -483,7 +483,7 @@
{
return 3;
}
-}
+}
void
TMXBuilder::generateTMX(FILE *f1, FILE *f2, FILE *output)
@@ -528,7 +528,7 @@
if(secure_zone && similar(tu1, tu2))
{
printTU(output, tu1, tu2);
- }
+ }
}
void
@@ -559,7 +559,7 @@
TMXAligner::DictionaryItems dict;
AlignParameters ap;
-
+
ap.justSentenceIds = false;
ap.utfCharCountingMode = false;
ap.realignType=AlignParameters::NoRealign;
@@ -576,7 +576,7 @@
{
break;
}
-
+
if(val == L'\t')
{
conta++;
@@ -596,7 +596,7 @@
partes[conta] += val;
}
}
-
+
unlink(left.c_str());
unlink(right.c_str());
unlink(out.c_str());
@@ -612,10 +612,10 @@
if(freference != NULL)
{
lista3 = reverseList(sentenceList(freference));
- }
+ }
while(true)
- {
+ {
vector l1 = extractFragment(lista1, base_i, window_size);
vector l2 = extractFragment(lista2, base_j, window_size) , l3;
@@ -638,12 +638,12 @@
unsigned int const ncols = l2.size() + 1;
unsigned int i = nrows - 1;
unsigned int j = ncols - 1;
-
-
+
+
// printTable(table, nrows, ncols);
-
+
bool newBase = false;
-
+
while(true)
{
@@ -655,7 +655,7 @@
case 1:
i--;
j--;
-
+
if(l3.size() == 0)
{
if((newBase || l1.size() < step) && similar(l1[i], l2[j]))
@@ -669,16 +669,16 @@
{
printTU(output, l1[i], l2[j]);
}
- }
+ }
break;
-
- case 2:
+
+ case 2:
i--;
if(i > 2 && argmin(table[(i-1)*ncols + j-1],
- table[(i-1)*ncols + j],
- table[i*ncols + j-1]) == 3 &&
+ table[(i-1)*ncols + j],
+ table[i*ncols + j-1]) == 3 &&
argmin(table[(i-1)*ncols + j-2],
- table[(i-1)*ncols + j-1],
+ table[(i-1)*ncols + j-1],
table[i*ncols + j-2]) != 1)
{
if(l3.size() == 0)
@@ -694,19 +694,19 @@
{
printTU(output, l1[i], l2[j]);
}
- }
- }
+ }
+ }
// wcerr << L"[" << i << L" " << j << L"]" << endl;
break;
-
+
case 3:
j--;
if(j > 2 && argmin(table[(i-1)*ncols + j-1],
- table[(i-1)*ncols + j],
- table[i*ncols + j-1]) == 1 &&
+ table[(i-1)*ncols + j],
+ table[i*ncols + j-1]) == 1 &&
argmin(table[(i-1)*ncols + j-2],
- table[(i-1)*ncols + j-1],
+ table[(i-1)*ncols + j-1],
table[i*ncols + j-2]) != 3)
{
if(l3.size() == 0)
@@ -722,32 +722,32 @@
{
printTU(output, l1[i], l2[j]);
}
- }
- }
+ }
+ }
break;
-
+
default:
// error
break;
}
-
+
if(i == step && !newBase)
{
base_i += i;
base_j += j;
newBase = true;
}
-
+
if(i == 0 || j == 0)
{
break;
}
}
-
- delete[] table;
-
+
+ delete[] table;
+
if(l1.size() < window_size)
{
break;
@@ -755,39 +755,39 @@
}*/
}
-int
+int
TMXBuilder::weight(wstring const &s)
{
return s.size()*2; // just the size of the string
}
-int *
-TMXBuilder::levenshteinTable(vector &l1, vector &l2,
+int *
+TMXBuilder::levenshteinTable(vector &l1, vector &l2,
unsigned int diagonal_width, unsigned int max_edit)
-{
+{
unsigned int const nrows = l1.size() + 1;
unsigned int const ncols = l2.size() + 1;
-
+
int *table = new int[nrows * ncols];
-
+
table[0] = 0;
-
+
for(unsigned int i = 1; i < nrows; i++)
{
table[i*ncols] = table[(i-1)*ncols] + weight(l1[i-1]);
}
-
+
for(unsigned int j = 1; j < ncols; j++)
{
table[j] = table[j-1] + weight(l2[j-1]);
}
-
+
for(unsigned int i = 1; i < nrows; i++)
{
for(unsigned int j = 1; j < ncols; j++)
{
int ed = 0;
-
+
if(i > (j + diagonal_width))
{
ed = table[i*ncols]+table[j];
@@ -800,20 +800,20 @@
{
ed = editDistance(l1[i-1], l2[j-1], max_edit);
}
-
+
table[i*ncols+j] = min3(table[(i-1)*ncols + j-1] + ed,
table[(i-1)*ncols + j] + weight(l2[j-1]),
table[i*ncols + j-1] + weight(l1[i-1]));
}
}
-
+
return table;
}
wstring
TMXBuilder::filter(wstring const &tu)
{
- bool has_text = false;
+ bool has_text = false;
unsigned int count_blank = 0;
for(unsigned int i = 0, limit = tu.size(); i != limit; i++)
@@ -821,19 +821,19 @@
if(iswalpha(tu[i]))
{
has_text = true;
- }
+ }
else if(has_text && iswspace(tu[i]))
{
count_blank++;
}
- }
+ }
if(!has_text || count_blank <= 2 || tu.size() == 0)
{
return L"";
}
- return xmlize(tu);
+ return xmlize(tu);
}
void
@@ -845,15 +845,15 @@
if(tu1_filtered != L"" && tu2_filtered != L"")
{
- fprintf(output, "\n %s\n",
- UtfConverter::toUtf8(lang1).c_str(),
+ fprintf(output, "\n %s\n",
+ UtfConverter::toUtf8(lang1).c_str(),
UtfConverter::toUtf8(tu1_filtered).c_str());
-
+
fprintf(output, " %s\n\n",
- UtfConverter::toUtf8(lang2).c_str(),
- UtfConverter::toUtf8(tu2_filtered).c_str());
+ UtfConverter::toUtf8(lang2).c_str(),
+ UtfConverter::toUtf8(tu2_filtered).c_str());
}
-}
+}
int
TMXBuilder::min3(int i1, int i2, int i3)
@@ -897,11 +897,11 @@
{
int const nrows = min2(s1.size() + 1, max_edit);
int const ncols = min2(s2.size() + 1, max_edit);
-
+
int *table = new int[nrows*ncols];
-
+
table[0] = 0;
-
+
for(int i = 1; i < nrows; i++)
{
table[i*ncols] = i;
@@ -911,7 +911,7 @@
{
table[j] = j;
}
-
+
for(int i = 1; i < nrows; i++)
{
for(int j = 1; j < ncols; j++)
@@ -921,7 +921,7 @@
{
coste = 1;
}
-
+
table[i*ncols+j] = min3(table[(i-1)*ncols+(j-1)]+coste,
table[(i-1)*ncols+j] + 2,
table[i*ncols+(j-1)] + 2);
@@ -984,7 +984,7 @@
TMXBuilder::similar(wstring const &s1, wstring const &s2)
{
unsigned int l1 = s1.size();
- unsigned int l2 = s2.size();
+ unsigned int l2 = s2.size();
if((l1 <= low_limit) && (l2 <= low_limit))
{
@@ -997,7 +997,7 @@
int ed = editDistance(s1, s2, maxlength);
if(double(ed) < edit_distance_percent*double(maxlength))
- {
+ {
return double(minlength)/double(maxlength) > percent;
}
else
@@ -1023,5 +1023,5 @@
{
_setmode(_fileno(freference), _O_U8TEXT);
}
-#endif
+#endif
}
diff -Nru apertium-3.5.1/apertium/tmx_builder.h apertium-3.5.2/apertium/tmx_builder.h
--- apertium-3.5.1/apertium/tmx_builder.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tmx_builder.h 2018-09-11 10:38:25.000000000 +0000
@@ -47,17 +47,17 @@
static vector reverseList(vector const &v);
static vector sentenceList(FILE *file);
static int argmin(int nw, int n, int w);
- static int * levenshteinTable(vector &l1, vector &l2,
+ static int * levenshteinTable(vector &l1, vector &l2,
unsigned int diagonal_width, unsigned int max_edit);
void printTU(FILE *output, wstring const &tu1, wstring const &tu2) const;
static wstring filter(wstring const &s);
- static int weight(wstring const &s);
+ static int weight(wstring const &s);
static void printTable(int *table, unsigned int nrows, unsigned int ncols);
static int editDistance(wstring const &s1, wstring const &s2, unsigned int max_edit);
static int min3(int i1, int i2, int i3);
static int min2(int i1, int i2);
void printTUCond(FILE *output, wstring const &s1, wstring const &s2, bool secure_zone);
- static vector extractFragment(vector const &text, unsigned int base,
+ static vector extractFragment(vector const &text, unsigned int base,
unsigned int width);
static bool isRemovablePunct(wchar_t const &c);
@@ -68,9 +68,9 @@
TMXBuilder(wstring const &l1, wstring const &l2);
~TMXBuilder();
static bool check(string const &file1, string const &file2, bool lazy = false);
- void generate(string const &file1, string const &file2,
+ void generate(string const &file1, string const &file2,
string const &outfile="");
-
+
void setMaxEdit(int me);
void setDiagonalWidth(int dw);
void setWindowSize(int ws);
diff -Nru apertium-3.5.1/apertium/tmx_dictionary.cc apertium-3.5.2/apertium/tmx_dictionary.cc
--- apertium-3.5.1/apertium/tmx_dictionary.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tmx_dictionary.cc 2018-09-11 10:38:25.000000000 +0000
@@ -240,7 +240,7 @@
{
is.ignore();
}
-
+
if (is.peek()=='\n')
{
is.ignore();
@@ -467,7 +467,7 @@
std::set stopwords;
cStyleStringsToStringSet( huStopwordsC, stopwords );
-
+
for ( size_t i=0; i
class DicTree
{
@@ -53,11 +53,11 @@
// This structure stores a very sparse set-system of words.
// (A dictionary of complex expressions.)
-//
+//
// It supports the following query:
-// It receives a set of words S. It gives back the sets
+// It receives a set of words S. It gives back the sets
// of the set system that are contained in this set S.
-//
+//
// For it to be effective, we must be careful during the building phase:
// words in vector 'words' must be ordered by INCREASING frequency. Rare words first.
@@ -185,7 +185,7 @@
for ( typename Pebbles::const_iterator jt=pebbles.begin(); jt!=pebbles.end(); ++jt )
{
const DicTree* subTree = (*jt)->lookup(word) ;
-
+
if (!subTree)
continue;
diff -Nru apertium-3.5.1/apertium/tmx_quasi_diagonal.h apertium-3.5.2/apertium/tmx_quasi_diagonal.h
--- apertium-3.5.1/apertium/tmx_quasi_diagonal.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tmx_quasi_diagonal.h 2018-09-11 10:38:25.000000000 +0000
@@ -135,7 +135,7 @@
{
return rows[y];
}
-
+
T& cell( int y, int x )
{
if ((y<0)||(y>=height))
diff -Nru apertium-3.5.1/apertium/tmx_trail_postprocessors.cc apertium-3.5.2/apertium/tmx_trail_postprocessors.cc
--- apertium-3.5.1/apertium/tmx_trail_postprocessors.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tmx_trail_postprocessors.cc 2018-09-11 10:38:25.000000000 +0000
@@ -32,7 +32,7 @@
dynMatrix[trail[j+1].first][trail[j+1].second] ;
}
-BisentenceListScores::BisentenceListScores( const BisentenceList& bisentenceList_, const AlignMatrix& dynMatrix_ )
+BisentenceListScores::BisentenceListScores( const BisentenceList& bisentenceList_, const AlignMatrix& dynMatrix_ )
: bisentenceList(bisentenceList_), dynMatrix(dynMatrix_) {}
double BisentenceListScores::operator()( int j ) const
@@ -52,7 +52,7 @@
// Division is by the maximum of the Hungarian and English intervals.
// This is a somewhat arbritary decision, and goes very badly with the
// scoring of the knight's moves. But we really have no better choice.
-//
+//
// Also, the method applies some very ugly hacks to avoid the effect of
// paragraph-delimiters. It strips both intervals of
s, and
// modifies the dynMatrix-based score assuming that all
s got paired.
@@ -62,7 +62,7 @@
int huDiff = end.first - start.first ;
int enDiff = end.second - start.second ;
- double score =
+ double score =
dynMatrix[start.first][start.second]
-
dynMatrix[end. first][end. second] ;
@@ -156,7 +156,7 @@
{
if (
(pos==0)
- ||
+ ||
( oneToOne(bestTrail,pos-1) && oneToOne(bestTrail,pos) )
)
{
@@ -177,10 +177,10 @@
// Egy zero-to-nonzero hole valamelyik oldalan levo rundle-t kiirtom, ha a
// rundle torlese kozeliti az uj hezagban a magyar karakterszam / angol karakterszam
// hanyadost egyhez. A bal es a jobb kozul azt valasztom, amelyik tobbet javit.
-//
+//
// Meg akkor is olvasztok, ha ezzel kicsit rontok, mivel a valodi zero-to-one eleg ritka.
// Legalabbis regenyekben. Az improvementSlack konstansnak domainfuggonek kellene lennie.
-void spaceOutBySentenceLength( Trail& bestTrail,
+void spaceOutBySentenceLength( Trail& bestTrail,
const SentenceList& huSentenceListPretty,
const SentenceList& enSentenceList,
bool utfCharCountingMode )
@@ -216,7 +216,7 @@
double huRightBlock = characterLength( bestTrail[i+1].first, bestTrail[i+2].first, huSentenceListPretty, utfCharCountingMode );
double huMiddleBlock = characterLength( bestTrail[i].first, bestTrail[i+1].first, huSentenceListPretty, utfCharCountingMode );
double huLeftBlock = characterLength( bestTrail[i-1].first, bestTrail[i].first, huSentenceListPretty, utfCharCountingMode );
-
+
double enRightBlock = characterLength( bestTrail[i+1].second, bestTrail[i+2].second, enSentenceList, utfCharCountingMode );
double enMiddleBlock = characterLength( bestTrail[i].second, bestTrail[i+1].second, enSentenceList, utfCharCountingMode );
double enLeftBlock = characterLength( bestTrail[i-1].second, bestTrail[i].second, enSentenceList, utfCharCountingMode );
diff -Nru apertium-3.5.1/apertium/tmx_trail_postprocessors.h apertium-3.5.2/apertium/tmx_trail_postprocessors.h
--- apertium-3.5.1/apertium/tmx_trail_postprocessors.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tmx_trail_postprocessors.h 2018-09-11 10:38:25.000000000 +0000
@@ -45,7 +45,7 @@
// Division is by the maximum of the Hungarian and English intervals.
// This is a somewhat arbritary decision, and goes very badly with the
// scoring of the knight's moves. But we really have no better choice.
- //
+ //
// Also, the method applies some very ugly hacks to avoid the effect of
// paragraph-delimiters. It strips both intervals of
s, and
// modifies the dynMatrix-based score assuming that all
s got paired.
@@ -90,7 +90,7 @@
// This of course dramatically improves precision while slightly degrading recall.
void cautiouslyFilterTrail( Trail& bestTrail );
-void spaceOutBySentenceLength( Trail& bestTrail,
+void spaceOutBySentenceLength( Trail& bestTrail,
const SentenceList& huSentenceListPretty,
const SentenceList& enSentenceList,
bool utfCharCountingMode );
@@ -112,8 +112,8 @@
// The function gets a nonconst reference to bestTrail.
// On the other hand, it gets a const reference to bestTrail, through trailScoresInterval.
// Therefore, the function may only modify bestTrail after it finished reading trailScoresInterval.
-void postprocessTrail( Trail& bestTrail,
- const TrailScoresInterval& trailScoresInterval,
+void postprocessTrail( Trail& bestTrail,
+ const TrailScoresInterval& trailScoresInterval,
double qualityThreshold );
diff -Nru apertium-3.5.1/apertium/tmx_translate.cc apertium-3.5.2/apertium/tmx_translate.cc
--- apertium-3.5.1/apertium/tmx_translate.cc 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/tmx_translate.cc 2018-09-11 10:38:25.000000000 +0000
@@ -39,9 +39,9 @@
}
}
-void buildDumbDictionaryUsingFrequencies(
- const DictionaryItems& dictionary,
- FrequencyMap& enFreq,
+void buildDumbDictionaryUsingFrequencies(
+ const DictionaryItems& dictionary,
+ FrequencyMap& enFreq,
DumbDictionary& dumbDictionary )
{
dumbDictionary.clear();
@@ -197,7 +197,7 @@
Phrase phrase;
trivialTranslateWord( dumbDictionary, originalWord, phrase );
-
+
for (size_t k=0; k old_finals = transducer.getFinals(); // copy for later removal
+ map old_finals = transducer.getFinals(); // copy for later removal
map finals_rules; // node id -> rule number
- map >& transitions = transducer.getTransitions();
+ map > >& transitions = transducer.getTransitions();
// Find all arcs with "final_symbols" in the transitions, let their source node instead be final,
// and extract the rule number from the arc. Record relation between source node and rule number
// in finals_rules. It is now no longer safe to minimize -- but we already did that.
const wstring rule_sym_pre = L" >::const_iterator it = transitions.begin(),
+ for(map > >::const_iterator it = transitions.begin(),
limit = transitions.end(); it != limit; ++it)
{
const int src = it->first;
- for(multimap::const_iterator arc = it->second.begin(),
+ for(multimap >::const_iterator arc = it->second.begin(),
arclimit = it->second.end(); arc != arclimit; ++arc)
{
const int symbol = arc->first;
- const int trg = arc->second;
+ const int trg = arc->second.first;
+ const double wgt = arc->second.second;
if(final_symbols.count(symbol) == 0) {
continue;
}
@@ -157,15 +158,15 @@
continue;
}
const int rule_num = stoi(s.substr(rule_sym_pre.size()));
- transducer.setFinal(src);
+ transducer.setFinal(src, wgt);
finals_rules[src] = rule_num;
}
}
// Remove the old finals:
- for(set::const_iterator it = old_finals.begin(), limit = old_finals.end();
+ for(map::const_iterator it = old_finals.begin(), limit = old_finals.end();
it != limit; ++it)
{
- transducer.setFinal(*it, false);
+ transducer.setFinal(it->first, it->second, false);
}
transducer.write(output, alphabet.size());
diff -Nru apertium-3.5.1/apertium/transfer_data.h apertium-3.5.2/apertium/transfer_data.h
--- apertium-3.5.1/apertium/transfer_data.h 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/transfer_data.h 2018-09-11 10:38:25.000000000 +0000
@@ -31,13 +31,13 @@
private:
void copy(TransferData const &o);
void destroy();
-
+
map attr_items;
map macros;
map, Ltstr> lists;
map variables;
set final_symbols;
-
+
Alphabet alphabet;
Transducer transducer;
@@ -47,17 +47,17 @@
~TransferData();
TransferData(TransferData const &o);
TransferData & operator =(TransferData const &o);
-
+
Alphabet & getAlphabet();
Transducer & getTransducer();
- map & getAttrItems();
+ map & getAttrItems();
map seen_rules;
map & getMacros();
map, Ltstr> & getLists();
map & getVariables();
-
+
/**
* Encode the rule count in an arc label/symbol (later extracted by
* write()), recording that it's been used in final_symbols, and
diff -Nru apertium-3.5.1/apertium/transfer.dtd apertium-3.5.2/apertium/transfer.dtd
--- apertium-3.5.1/apertium/transfer.dtd 2018-03-26 15:22:45.000000000 +0000
+++ apertium-3.5.2/apertium/transfer.dtd 2018-09-11 10:38:25.000000000 +0000
@@ -1,25 +1,25 @@
-
+ Draft of DTD for the structural transfer rule files
+
+ Sergio Ortiz, Gema RamÃrez-Sánchez, Mireia GinestÃ, Mikel L. Forcada,
+ 2005.07.29.
+-->
@@ -29,7 +29,7 @@
-
-
@@ -45,16 +45,16 @@
-
-
-
-
+
-
-
-
-
@@ -99,7 +99,7 @@
-
@@ -123,12 +123,12 @@
-
-
@@ -156,8 +156,8 @@
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
+
-
-
+
-
-
+
-
-
+
-
-
+
-
-
-
-
+-->
-
-
@@ -369,40 +369,40 @@
queue CDATA #IMPLIED
link-to CDATA #IMPLIED
c CDATA #IMPLIED>
-
-
-
@@ -410,15 +410,15 @@
-
-
+
@@ -428,16 +428,16 @@
side (sl|tl) #REQUIRED
part CDATA #REQUIRED>
@@ -469,8 +469,8 @@
namefrom CDATA #IMPLIED
case CDATA #IMPLIED
c CDATA #IMPLIED>
-
@@ -168,7 +168,7 @@
tag sequence defining each possible lexical form. For example,
tags="n.f" would match all lexical forms containing this tag
sequence, such as "^casa$".
-
+
In addition, an optional attribute, "lemma", may be used to
define lexical forms having a particular substring in their lemma
-->
@@ -185,7 +185,7 @@
@@ -205,7 +205,7 @@
@@ -292,7 +292,7 @@
@@ -308,8 +308,8 @@
@@ -323,10 +323,10 @@
@@ -349,9 +349,9 @@
@@ -383,7 +383,7 @@
@@ -448,7 +448,7 @@
@@ -550,7 +550,7 @@
@@ -749,8 +749,8 @@
@@ -774,8 +774,8 @@
@@ -788,7 +788,7 @@
@@ -824,7 +824,7 @@
it may be used with arguments 1 and 2 in a noun-adjective rule,
with arguments 2, 3 and 1 in a determiner-noun-adjective rule, with
arguments 1 and 3 in a noun-adverb-adjective rule, and with
- arguments 2 and 1 in an adjective-noun rule
+ arguments 2 and 1 in an adjective-noun rule
-->
@@ -854,24 +854,24 @@
@@ -916,7 +916,7 @@
of the variable. When it is in an 'out', a 'test', or the right
part of a 'let', it represents the value of the variable; when in
the left part of a 'let' it represents the reference of the
- variable.
+ variable.
-->
@@ -952,16 +952,16 @@
@@ -1055,7 +1055,7 @@
+ -->