diff -u dutch-2.20.19/aspell/nl.dat dutch-2.20.19/aspell/nl.dat --- dutch-2.20.19/aspell/nl.dat +++ dutch-2.20.19/aspell/nl.dat @@ -1,6 +1,7 @@ name nl charset iso8859-1 -special - -*- ' **- +data-encoding utf-8 +special - -*- ' **- & -*- soundslike generic run-together-limit 2 run-together-min 3 diff -u dutch-2.20.19/aspell/nl_affix.dat dutch-2.20.19/aspell/nl_affix.dat --- dutch-2.20.19/aspell/nl_affix.dat +++ dutch-2.20.19/aspell/nl_affix.dat @@ -1,5 +1,5 @@ -SET ISO8859-1 -TRY esianrtolcdugmphbyfvkwjkqxz-ëéèêïîäàâöüû'ESIANRTOLCDUGMPHBYFVKWJKQXZ +SET UTF-8 +TRY esianrtolcdugmphbyfvkwjkqxz-ëéèêïîäàâöüû'ESIANRTOLCDUGMPHBYFVKWJKQXZ # combined layout for BE and NL keyboards NL en BE KEY qwertyuiop|asdfghjkl|zxcvbnm|qawsedrftgyhujikolp|azsxdcfvgbhnjmk|aze|qsd|lm|wx|aqz|qws| @@ -7,7 +7,7 @@ # wordchars is (or is supposed to be) only for command line WORDCHARS ' # . is not useful as wordchar -# the way applicatiobs offer words to the Hunspell interface apperently differs, +# the way applications offer words to the Hunspell interface apperently differs, # causing problems with abbreviations and end-of-sentence words. # Issue has been communicated with Hunspell's author. 
@@ -185,11 +185,11 @@ # make calculation of alternatives better for words with accents MAP 5 -MAP eéèëê -MAP aáà -MAP iíìïî -MAP oóòöô -MAP uüúùû +MAP eéèëê +MAP aáà +MAP iíìïî +MAP oóòöô +MAP uüúùû REP 315 REP g ch @@ -203,20 +203,20 @@ REP e ij REP ie y REP ie y' -REP ï ii -REP ee é -REP é ee -REP ee ë -REP ae ë -REP je ë -REP eë e-e +REP ï ii +REP ee é +REP é ee +REP ee ë +REP ae ë +REP je ë +REP eë e-e REP o eau REP o's eaus REP oo eau REP ok eauc REP joe u REP ui eu -REP iu uï +REP iu uï REP ies isch REP s sch REP ss zz @@ -285,8 +285,8 @@ REP eid ijdt REP ters teress REP iep ype -REP sen sciën -REP sens sciënt +REP sen sciën +REP sens sciënt REP kons conc REP konsekw consequ REP akkom accomm @@ -308,20 +308,20 @@ REP ceedees cd's REP grafitti graffiti REP kangaroe kangoeroe -REP kassiere caissière -REP britanië Brittannië -REP britanie Brittannië +REP kassiere caissière +REP britanië Brittannië +REP britanie Brittannië REP capucino cappuccino REP celcius Celsius REP kado cadeau REP kado's cadeaus REP kadootje cadeautje -REP comittee comité -REP committee comité -REP komitee comité -REP komittee comité -REP kommitee comité -REP kommittee comité +REP comittee comité +REP committee comité +REP komitee comité +REP komittee comité +REP kommitee comité +REP kommittee comité REP kwis quiz REP kwissen quizzen REP loupe loep @@ -335,7 +335,7 @@ REP lazanja lasagne REP attack attaque REP uptodate up-to-date -REP caffeine cafeïne +REP caffeine cafeïne REP enigsinds enigszins REP klup club REP wiskid whizzkid @@ -344,7 +344,7 @@ REP korrekt correct REP kritikus criticus REP gestressed gestrest -REP zoëven zo-even +REP zoëven zo-even REP dislek dyslec REP dislektie dyslexie REP disleksie dyslexie @@ -480,7 +480,7 @@ REP mn mijn REP zn zijn REP enz enz. 
-REP ok oké +REP ok oké REP dr haar REP dr er REP ondermeer onder_meer @@ -503,7 +503,7 @@ -REP ok oké +REP ok oké REP o eau REP k c REP openhaard open_haard REP vrijetijd vrije_tijd REP spercibo sperziebo -REP voorzover voor_zover \ No newline at end of file +REP voorzover voor_zover diff -u dutch-2.20.19/convert dutch-2.20.19/convert --- dutch-2.20.19/convert +++ dutch-2.20.19/convert @@ -18,13 +18,14 @@ # ispell LATIN1ISPELL=ispell/dutch.mwl # aspell -LATIN1ASPELL=aspell/nl +ASPELL=aspell/nl # We need to work around shortcommings in various spelling software: -# - Only hunspell can deal with UTF-8, so for the others we need to -# convert it latin1. This currently means replacing the -# subscripts with normal numbers and removing words with €. +# - ispell can not handle UTF-8, so we need to convert the list to latin1. +# aspell now supports UTF-8, but can't handle subscript. +# This currently means replacing the subscripts with normal numbers, +# and for ispell remove words with €. # - No spelling software can deal with words with a space or dash (-) in it. # We also add the word splitted, which also means it will approve # words that can never be used alone. @@ -80,28 +81,28 @@ # Splitting the words generated some words ending in a - # like "kijk- en luistergeld" generates the word "kijk-". +# It contains words that start with - like "-werk". +# It contains words with ², ³, or starting with @ or & # It also generated empty lines. Remove those words and empty lines. # Also remove words with a '/' in it, since they can't be munched. 
-grep -v -E -- '-$|^$|/' $UTF8FIXEDLIST > tmp.txt +grep -v -E -- '-$|^$|/|^-|²|³|^@|^&' $UTF8FIXEDLIST > tmp.txt cp tmp.txt $UTF8FIXEDLIST -# Remove words with € -grep -v '€' $UTF8FIXEDLIST > tmp.txt +# Replace the subscript 2 and 8 with normal numbers, sort, remove duplicates +sed -e 's/\xE2\x82\x82/2/g' -e 's/\xE2\x82\x88/8/g' $UTF8FIXEDLIST |sort |uniq > tmp.txt cp tmp.txt $UTF8FIXEDLIST -# Replace the subscript 2 and 8 with normal numbers -# sort, remove duplicates and convert to latin1. -sed -e 's/\xE2\x82\x82/2/g' -e 's/\xE2\x82\x88/8/g' $UTF8FIXEDLIST |sort |uniq | iconv -f utf-8 -t latin1 > $LATIN1FIXEDLIST +# aspell +grep -av "[0-9 \.'\+]" $UTF8FIXEDLIST > $ASPELL.wl +prezip $ASPELL.wl +gzip -9nf $ASPELL.cwl # ispell +# Remove words with a digit, space, \, ., ', +, &, € or - (the - is kept last in the bracket so it is literal, not a range) and convert to latin1. +grep -v "[0-9 \.'\+\&€-]" $UTF8FIXEDLIST | iconv -f utf-8 -t latin1 > $LATIN1FIXEDLIST munchlist -l $ISPELL_AFF -v -w "&'\`-_" $LATIN1FIXEDLIST > $LATIN1ISPELL gzip -9nf $LATIN1ISPELL -# aspell -grep -av "[0-9 \.'\+]" $LATIN1FIXEDLIST > $LATIN1ASPELL.wl -prezip $LATIN1ASPELL.wl -gzip -9nf $LATIN1ASPELL.cwl - # cleanup temp files rm tmp.txt rm $UTF8WORDLIST diff -u dutch-2.20.19/debian/changelog dutch-2.20.19/debian/changelog --- dutch-2.20.19/debian/changelog +++ dutch-2.20.19/debian/changelog @@ -1,3 +1,13 @@ +dutch (1:2.20.19-2) unstable; urgency=medium + + * Convert aspell files from latin1 to UTF-8 + * Fix list of accepted aspell words (Closes: #979558) + - Allow & in the middle of words in aspell + - Remove words starting with a -, @ or &, and words containing ² or ³. + * Remove words with -, ' and & for ispell (Closes: #783501) + + -- Kurt Roeckx Tue, 12 Jan 2021 23:13:12 +0100 + dutch (1:2.20.19-1) unstable; urgency=medium [ Ondřej Nový ]