diff -Nru tegaki-tools-0.3/ChangeLog tegaki-tools-0.3.1/ChangeLog
--- tegaki-tools-0.3/ChangeLog 2009-11-01 12:07:16.000000000 +0000
+++ tegaki-tools-0.3.1/ChangeLog 2010-03-23 07:29:23.000000000 +0000
@@ -1,3 +1,69 @@
+commit 5b0b86343c91cbf3d7ee683e82f41f9df75edf4d
+Author: Mathieu Blondel
+Date:   Mon Mar 22 15:32:33 2010 +0900
+
+    [all] Set current version to 0.3.1.
+
+commit ae808ddbdffa3750a71d2636fe475d5d774516f1
+Author: Mathieu Blondel
+Date:   Wed Dec 9 17:16:34 2009 +0900
+
+    [tegaki-stats] Display total number of strokes.
+
+commit 07a78dd10e599ecbf6d255704551955e0e696a2d
+Author: Mathieu Blondel
+Date:   Wed Dec 9 16:13:38 2009 +0900
+
+    [tegaki-tool] Ask whether to overwrite existing db or not.
+
+commit ec14631fab52e09b073e26fd58112e96f5a56d64
+Author: Mathieu Blondel
+Date:   Wed Dec 9 14:12:37 2009 +0900
+
+    [tegaki-tools] Various optimizations.
+
+commit 690a5a89ac2dd301e3efc673912d8ad8c6be0cac
+Author: Mathieu Blondel
+Date:   Wed Dec 2 20:24:23 2009 +0900
+
+    [tegaki-tools] Use CharacterCollection new save method.
+
+commit f5fa2429291939d7358a61aeafcd178e5fccd4c4
+Author: Mathieu Blondel
+Date:   Tue Dec 1 17:52:55 2009 +0900
+
+    [tegaki-build] Don't import Recognizer.
+
+commit 90fd8fa22706942945682795acaab65f1280a86c
+Author: Mathieu Blondel
+Date:   Tue Dec 1 17:30:06 2009 +0900
+
+    [tegaki-tools] Support for new CharacterCollection features.
+
+commit 4cb5520040c22a617d87dc5b6c26ed1a8c16ad75
+Author: Christoph Burgmer
+Date:   Sun Nov 22 12:15:18 2009 +0100
+
+    [tegaki-bootstrap] always provide max_samples, mix exact and decomposition transformations; make max_samples default to 1; optimize
+
+commit 8a7cbb7c31c6672876d71d9f8915052023036fb8
+Author: Christoph Burgmer
+Date:   Fri Nov 20 13:31:37 2009 +0100
+
+    [tegaki-bootstrap] Optimize memory footprint.
+
+commit 5a73c1b7118429ba59ddeedf3a38dc3b5fb84511
+Author: Christoph Burgmer
+Date:   Fri Nov 20 00:55:15 2009 +0100
+
+    [tegaki-extractcomponents] Add a tool to extract handwriting of a component.
+
+commit 909dac8225b23cddeeb898723a02818f1cac4ea6
+Author: Christoph Burgmer
+Date:   Fri Nov 20 00:22:18 2009 +0100
+
+    [tegaki-bootstrap] Add option to exclude direct transformations. Also optimize memory useage for 'big' values of "-m".
+
 commit 23428970b5125e6effc233ff3261e6cb368ff93d
 Author: Mathieu Blondel
 Date:   Sun Nov 1 20:56:48 2009 +0900
diff -Nru tegaki-tools-0.3/debian/changelog tegaki-tools-0.3.1/debian/changelog
--- tegaki-tools-0.3/debian/changelog 2010-05-10 00:36:17.000000000 +0100
+++ tegaki-tools-0.3.1/debian/changelog 2010-03-26 15:41:31.000000000 +0000
@@ -1,3 +1,11 @@
+tegaki-tools (0.3.1-1) unstable; urgency=low
+
+  * New upstream release.
+  * debian/control: bump standards version to 3.8.4.
+  * debian/source/format: 3.0.
+
+ -- LI Daobing  Fri, 26 Mar 2010 23:39:15 +0800
+
 tegaki-tools (0.3-1) unstable; urgency=low
 
   * New upstream release.
diff -Nru tegaki-tools-0.3/debian/control tegaki-tools-0.3.1/debian/control
--- tegaki-tools-0.3/debian/control 2010-05-10 00:36:17.000000000 +0100
+++ tegaki-tools-0.3.1/debian/control 2010-03-26 15:39:11.000000000 +0000
@@ -4,7 +4,7 @@
 Maintainer: LI Daobing
 Build-Depends: debhelper (>= 5), python
 Build-Depends-Indep: python-support (>= 0.6)
-Standards-Version: 3.8.3
+Standards-Version: 3.8.4
 Homepage: http://www.tegaki.org/
 Vcs-Browser: https://code.launchpad.net/~lidaobing/tegaki/tegaki-tools
 Vcs-Bzr: lp:~lidaobing/tegaki/tegaki-tools
diff -Nru tegaki-tools-0.3/debian/source/format tegaki-tools-0.3.1/debian/source/format
--- tegaki-tools-0.3/debian/source/format 1970-01-01 01:00:00.000000000 +0100
+++ tegaki-tools-0.3.1/debian/source/format 2010-05-10 00:36:18.000000000 +0100
@@ -0,0 +1 @@
+3.0 (quilt)
diff -Nru tegaki-tools-0.3/PKG-INFO tegaki-tools-0.3.1/PKG-INFO
--- tegaki-tools-0.3/PKG-INFO 2009-11-01 12:07:16.000000000 +0000
+++ tegaki-tools-0.3.1/PKG-INFO 2010-03-23 07:29:23.000000000 +0000
@@ -1,6 +1,6 @@
 Metadata-Version: 1.0
 Name: tegaki-tools
-Version: 0.3
+Version: 0.3.1
 Summary: A set of command-line tools for Tegaki.
 Home-page: http://www.tegaki.org
 Author: Mathieu Blondel
diff -Nru tegaki-tools-0.3/src/tegaki-bootstrap tegaki-tools-0.3.1/src/tegaki-bootstrap
--- tegaki-tools-0.3/src/tegaki-bootstrap 2009-11-01 06:54:59.000000000 +0000
+++ tegaki-tools-0.3.1/src/tegaki-bootstrap 2010-02-27 04:52:04.000000000 +0000
@@ -55,12 +55,12 @@
 """
 import sys
 import locale
+import random
+random.seed(12345) # provide deterministic results
 
 from optparse import OptionParser
 
 from tegaki.character import CharacterCollection, Writing, Character
-from tegakitools.tomoe import tomoe_dict_to_character_collection
-from tegakitools.kuchibue import kuchibue_to_character_collection
 from tegakitools.charcol import *
 
 try:
@@ -113,8 +113,10 @@
         self._include = options.include
         self._exclude = options.exclude
         self._max_samples = options.max_samples
+        assert self._max_samples
         self._locale = options.locale
         self._character_domain = options.character_domain
+        self._no_exact_transformation = options.no_exact_transformation
         self._quiet = options.quiet
 
         try:
@@ -125,34 +127,25 @@
         self._cjk = CharacterLookup(self._locale, self._character_domain)
 
     def _get_charcol(self):
-        if not hasattr(self, '_charcol'):
-            self._charcol = get_aggregated_charcol(
-                ((TYPE_CHARCOL, self._charcols),
-                 (TYPE_DIRECTORY, self._directories),
-                 (TYPE_TOMOE, self._tomoe),
-                 (TYPE_KUCHIBUE, self._kuchibue)))
-
-            self._charcol.include_characters_from_files(self._include)
-            self._charcol.exclude_characters_from_files(self._exclude)
-
-            # max samples
-            if self._max_samples:
-                self._charcol.remove_samples(keep_at_most=self._max_samples)
+        _charcol = get_aggregated_charcol(((TYPE_CHARCOL, self._charcols),
+                                           (TYPE_DIRECTORY, self._directories),
+                                           (TYPE_TOMOE, self._tomoe),
+                                           (TYPE_KUCHIBUE, self._kuchibue)))
 
-        return self._charcol
+        _charcol.include_characters_from_files(self._include)
+        _charcol.exclude_characters_from_files(self._exclude)
 
-    def run(self):
-        charcol = self._get_charcol()
+        # max samples
+        _charcol.remove_samples(keep_at_most=self._max_samples)
 
-        if charcol.get_total_n_characters() == 0:
-            raise TegakiBootstrapError("Empty input collection provided")
+        return _charcol
 
+    def run(self):
         # do the bootstrapping
-        to_charcol = self.bootstrap(charcol)
+        to_charcol = self.bootstrap()
 
         # max samples
-        if self._max_samples:
-            to_charcol.remove_samples(keep_at_most=self._max_samples)
+        #to_charcol.remove_samples(keep_at_most=self._max_samples)
 
         # output
         if not self._output_path:
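The rewritten _get_charcol above routes every input format through a single get_aggregated_charcol call and now always caps samples, since --max-samples defaults to 1 (see the option change later in this file). A minimal sketch of the calling pattern, with hypothetical paths; get_aggregated_charcol, the TYPE_* constants and remove_samples are the tegakitools/tegaki APIs visible in this diff:

    from tegakitools.charcol import (get_aggregated_charcol,
                                     TYPE_CHARCOL, TYPE_TOMOE)

    # One XML collection plus one Tomoe dictionary, capped at one sample
    # per character, as tegaki-bootstrap now does by default.
    charcol = get_aggregated_charcol(
        ((TYPE_CHARCOL, ["collection.xml"]),      # hypothetical paths
         (TYPE_TOMOE, ["tomoe-dict.xml"])))
    charcol.remove_samples(keep_at_most=1)

The module-level random.seed(12345) added in the import hunk makes the shuffles performed later in the bootstrap reproducible from run to run.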
@@ -164,46 +157,69 @@
         if self._output_path.endswith(".bz2"): bz2 = True
         to_charcol.write(self._output_path, gzip=gzip, bz2=bz2)
 
-    def bootstrap(self, charcol):
-        exact_transformations = 0
-        decomposition_transformations = 0
-        decomposition_fertilities = []
-        missing_transformations = 0
+    def bootstrap(self):
+        n_chars = 0
+        n_exact_transformations = 0
+        char_n_exact_transformations = 0
+        n_decomposition_transformations = 0
+        char_n_decomposition_transformations = 0
+
+        char_n_missing_transformations = 0
+        char_n_underrepresented = 0
 
         to_charcol = CharacterCollection()
 
         missing_char_dict = {}
         missing_single_characters = []
 
         # iterate through all characters of the target character set
-        count = 0
         for target_char in self._cjk.getDomainCharacterIterator():
         #for target_char in iter([u'亄', u'乿', u'仜', u'伳']): # DEBUG
-            count += 1
-            if count % 100 == 0:
+            n_chars += 1
+            if n_chars % 100 == 0:
                 sys.stdout.write('.')
                 sys.stdout.flush()
 
-            charSet = target_char.encode('utf8')
+            char_set = target_char.encode('utf8')
 
             source_character_lookup = self._get_source_character_lookup()
-            if target_char in source_character_lookup:
-                to_charcol.add_set(charSet)
-                for character in source_character_lookup[target_char]:
-                    to_charcol.append_character(charSet, character)
-                exact_transformations += 1
-            else:
+            exact_transformations = 0
+            if (target_char in source_character_lookup
+                and not self._no_exact_transformation):
+                char_n_exact_transformations += 1
+
+                to_charcol.add_set(char_set)
+                source_chars = source_character_lookup[target_char]
+                for character in source_chars[:self._max_samples]:
+                    exact_transformations += 1
+                    to_charcol.append_character(char_set, character)
+
+            n_exact_transformations += exact_transformations
+            n_total_transformation = exact_transformations
+
+            # fill up with decomposition transformations?
+            need_n_chars = self._max_samples - exact_transformations
+
+            decomposition_transformations = 0
+            if need_n_chars > 0:
                 writing_objects, missing_chars \
-                    = self.get_writings_from_decomposition(target_char)
+                    = self.get_writings_from_decomposition(target_char,
+                                                    force_decomposition=True)
                 if writing_objects:
+                    char_n_decomposition_transformations += 1
+
+                    writing_objects = writing_objects[:need_n_chars]
                     for writing in writing_objects:
+                        decomposition_transformations += 1
+
                         character = Character()
                         character.set_writing(writing)
                         character.set_unicode(target_char)
-                        to_charcol.append_character(charSet, character)
+                        to_charcol.append_character(char_set, character)
 
-                    decomposition_transformations += 1
-                    decomposition_fertilities.append(len(writing_objects))
-                else:
+            n_total_transformation += decomposition_transformations
+
+            if n_total_transformation == 0:
                 if missing_chars:
+                    # list components that can help us build this transform.
                     for missing in missing_chars:
                         if missing not in missing_char_dict:
                             missing_char_dict[missing] = []
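The rewritten loop above implements a top-up policy: exact samples of the target character are used first (unless disabled), and decomposition-based samples only fill the remaining quota up to max_samples. The policy in isolation, as a sketch with hypothetical sample lists:

    def top_up(exact_samples, decomposition_samples, max_samples):
        # Prefer exact handwriting samples; fill any shortfall with
        # samples synthesized from component decompositions.
        picked = exact_samples[:max_samples]
        need = max_samples - len(picked)
        if need > 0:
            picked += decomposition_samples[:need]
        return picked

    print top_up(['e1'], ['d1', 'd2', 'd3'], 3)   # ['e1', 'd1', 'd2']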
@@ -211,28 +227,49 @@
                 else:
                     missing_single_characters.append(target_char)
 
-            missing_transformations += 1
+            n_decomposition_transformations += decomposition_transformations
+
+            # if no direct transformation exists we have no data at all
+            if n_total_transformation == 0:
+                char_n_missing_transformations += 1
+            elif n_total_transformation < self._max_samples:
+                # we have data, just not enough
+                char_n_underrepresented += 1
 
         sys.stdout.write('\n')
 
         if not self._quiet:
             _, default_encoding = locale.getdefaultlocale()
 
-            total = exact_transformations + decomposition_transformations \
-                + missing_transformations
-            print 'Exact transformation count: %d (%d%%)' \
-                % (exact_transformations, 100 * exact_transformations / total)
-            print 'Decomposition transformation count: %d (%d%%)' \
-                % (decomposition_transformations,
-                   100 * decomposition_transformations / total)
-            if decomposition_fertilities:
-                decomposition_fertility = (sum(decomposition_fertilities) \
-                    / len(decomposition_fertilities))
-            else:
-                decomposition_fertility = 1
-            print 'Decomposition fertility: %d' % decomposition_fertility
-            print 'Missing transformations: %d (%d%%)' \
-                % (missing_transformations,
-                   100 * missing_transformations / total)
+            total = n_exact_transformations + n_decomposition_transformations
+
+            print 'Total characters: %d' % n_chars
+            print 'Total transformation (instances): %d' % total
+
+            print 'Characters with exact transformations: %d (%d%%)' \
+                % (char_n_exact_transformations,
+                   100 * char_n_exact_transformations / n_chars)
+            print 'Total exact transformations: %d (%d%%)' \
+                % (n_exact_transformations,
+                   100 * n_exact_transformations / total)
+            print 'Average exact transformations: %f' \
+                % (1. * n_exact_transformations / n_chars)
+
+            print 'Characters with decomposition transformations: %d (%d%%)' \
+                % (char_n_decomposition_transformations,
+                   100 * char_n_decomposition_transformations / n_chars)
+            print 'Total decomposition transformations: %d (%d%%)' \
+                % (n_decomposition_transformations,
+                   100 * n_decomposition_transformations / total)
+            print 'Average decomposition transformations: %f' \
+                % (1. * n_decomposition_transformations / n_chars)
+
+            print 'Characters missing transformations: %d (%d%%)' \
+                % (char_n_missing_transformations,
+                   100 * char_n_missing_transformations / n_chars)
+            if self._max_samples > 1:
+                print 'Characters with less than %d instances: %d (%d%%)' \
+                    % (self._max_samples, char_n_underrepresented,
+                       100 * char_n_underrepresented / n_chars)
 
         # missing single characters
         # Extend by those with components, that have a component with low
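The statistics above separate per-character counters (char_n_*: how many characters obtained at least one sample of a given kind) from per-instance counters (n_*: how many samples were produced in total). A worked example with hypothetical numbers, assuming max_samples=2 and three target characters, A with two exact samples, B with one exact plus one decomposition, C with nothing:

    n_chars = 3
    char_n_exact_transformations = 2         # A and B
    n_exact_transformations = 3              # 2 from A + 1 from B
    char_n_decomposition_transformations = 1 # only B needed decompositions
    n_decomposition_transformations = 1
    char_n_missing_transformations = 1       # C
    total = n_exact_transformations + n_decomposition_transformations  # 4

The percentage expressions rely on Python 2 integer division (100 * 2 / 3 == 66), which %d then prints truncated; the 1. * coercion in the averages is there precisely to avoid that truncation.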
@@ -289,7 +326,12 @@
     def _get_source_character_lookup(self):
         if not hasattr(self, '_source_character_lookup'):
             self._source_character_lookup = {}
-            for character in self._get_charcol().get_all_characters():
+
+            charcol = self._get_charcol()
+            if charcol.get_total_n_characters() == 0:
+                raise TegakiBootstrapError("Empty input collection provided")
+
+            for character in charcol.get_all_characters():
                 char = character.get_utf8().decode('utf8')
                 if char not in self._source_character_lookup:
                     self._source_character_lookup[char] = []
@@ -297,23 +339,29 @@
 
         return self._source_character_lookup
 
-    def get_writings_from_decomposition(self, char):
+    def get_writings_from_decomposition(self, char, force_decomposition=False):
         writing_objects = []
-        if char in self._get_source_character_lookup():
-            writing_objects = [character.get_writing() \
-                for character in self._get_source_character_lookup()[char]]
-        elif (CharacterLookup.isRadicalChar(char)
+
+        source_char_lookup = self._get_source_character_lookup()
+
+        exact_transformations = 0
+        if (not force_decomposition and char in source_char_lookup):
+            writing_objects.extend([character.get_writing() \
+                for character in source_char_lookup[char]])
+
+        if (CharacterLookup.isRadicalChar(char)
             and char not in self.RADICALS_NON_VISUAL_EQUIVALENCE):
             try:
                 equivChar = self._cjk.getRadicalFormEquivalentCharacter(char)
-                if equivChar in self._get_source_character_lookup():
-                    writing_objects = [character.get_writing() for character \
-                        in self._get_source_character_lookup()[equivChar]]
+                if equivChar in source_char_lookup:
+                    writing_objects.extend([character.get_writing()
+                        for character in source_char_lookup[equivChar]])
             except UnsupportedError:
                 pass
 
+        # add decompositions, limit to upper bound max_samples
         missing_chars = []
-        if not writing_objects:
+        if len(writing_objects) < self._max_samples:
             decompositions = self._cjk.getDecompositionEntries(char)
             for decomposition in decompositions:
                 writing_objs, _, missing = self._get_writing_from_entry(
@@ -322,8 +370,10 @@
                     missing_chars.extend(missing)
                 writing_objects.extend(writing_objs)
 
-        if writing_objects:
-            missing_chars = []
+                if len(writing_objects) >= self._max_samples:
+                    break
+
+        writing_objects = writing_objects[:self._max_samples]
 
         return writing_objects, missing_chars
 
@@ -365,8 +415,12 @@
         # merge
         writing_objects = []
         if not missing_chars:
-            for writing_objs in TegakiBootstrap.cross(
-                *writing_objects_list):
+            compound_writings = TegakiBootstrap.cross(*writing_objects_list)
+            # shuffle to provide more variation
+            random.shuffle(compound_writings)
+            compound_writings = compound_writings[:self._max_samples]
+
+            for writing_objs in compound_writings:
                 writing = self.merge_writing_objects(character, writing_objs)
                 writing_objects.append(writing)
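TegakiBootstrap.cross enumerates every combination of per-component writings, and the new code shuffles those combinations before truncating so that the kept compounds vary instead of always favoring the first samples. The effect can be sketched with the standard library (component names hypothetical):

    import itertools
    import random

    # Two samples of a left-hand component, three of a right-hand one.
    per_component = [['left1', 'left2'], ['right1', 'right2', 'right3']]

    compounds = [list(c) for c in itertools.product(*per_component)]
    random.shuffle(compounds)   # vary which combinations survive the cap
    compounds = compounds[:2]   # keep at most max_samples compounds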
@@ -395,20 +449,20 @@
             assert False
 
     @classmethod
-    def merge_writing_objects(cls, idsChar, writing_objects):
-        if idsChar not in cls.COMPONENT_TRANSFORMATION:
+    def merge_writing_objects(cls, ids_char, writing_objects):
+        if ids_char not in cls.COMPONENT_TRANSFORMATION:
             raise ValueError("Not supported")
 
         # [u'⿴', u'⿻', u'⿷']
-        assert (CharacterLookup.isBinaryIDSOperator(idsChar) \
+        assert (CharacterLookup.isBinaryIDSOperator(ids_char) \
             and len(writing_objects) == 2) \
-            or (CharacterLookup.isTrinaryIDSOperator(idsChar) \
+            or (CharacterLookup.isTrinaryIDSOperator(ids_char) \
             and len(writing_objects) == 3)
-        assert len(cls.COMPONENT_TRANSFORMATION[idsChar]) \
+        assert len(cls.COMPONENT_TRANSFORMATION[ids_char]) \
             == len(writing_objects)
 
-        transformations = cls.COMPONENT_TRANSFORMATION[idsChar]
+        transformations = cls.COMPONENT_TRANSFORMATION[ids_char]
         # reverse transformations where inner part is written first
-        if idsChar in [u'⿺', u'⿶']:
+        if ids_char in [u'⿺', u'⿶']:
             writing_objects.reverse()
             transformations = transformations[:]
             transformations.reverse()
@@ -419,7 +473,7 @@
             obj = writing_objects[idx].copy()
 
             obj.resize(xrate, yrate)
-            obj.move_rel(dx * obj.get_width(), dy* obj.get_height())
+            obj.move_rel(dx * obj.get_width(), dy * obj.get_height())
             obj.resize(resultingWriting.get_width() / obj.get_width(),
                        resultingWriting.get_height() / obj.get_height())
             for s in obj.get_strokes(True):
@@ -465,7 +519,7 @@
                   default=[],
                   help="File containing characters to exclude")
 parser.add_option("-m", "--max-samples",
-                  type="int", dest="max_samples",
+                  type="int", dest="max_samples", default=1,
                   help="Maximum number of samples per character")
 
@@ -479,6 +533,10 @@
                   help="Character domain of target characters")
 
+parser.add_option("-x", "--no-exact", dest="no_exact_transformation",
+                  action="store_true",
+                  help="Don't use exact transformations" \
+                       + ", use only decompositions")
 parser.add_option("-q", "--quiet", dest="quiet",
                   action="store_true",
                   help="Don't print any statistics")
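For orientation: merge_writing_objects places component writings on the canvas according to an IDS (Ideographic Description Sequence) operator, using per-component (dx, dy, xrate, yrate) entries from COMPONENT_TRANSFORMATION. A minimal sketch of the geometry for the left-right operator ⿰, with hypothetical table values; Writing and the copy/resize/move_rel calls are the tegaki APIs used in the hunks above:

    from tegaki.character import Writing

    # Hypothetical entries: both components keep full height and take half
    # the width; the second is shifted right by one (scaled) width.
    LEFT_RIGHT = [(0, 0, 0.5, 1.0),    # dx, dy, xrate, yrate for left part
                  (1, 0, 0.5, 1.0)]    # right part

    components = [Writing(), Writing()]   # would normally contain strokes
    placed = []
    for writing, (dx, dy, xrate, yrate) in zip(components, LEFT_RIGHT):
        obj = writing.copy()
        obj.resize(xrate, yrate)                                   # scale
        obj.move_rel(dx * obj.get_width(), dy * obj.get_height())  # position
        placed.append(obj)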
diff -Nru tegaki-tools-0.3/src/tegaki-build tegaki-tools-0.3.1/src/tegaki-build
--- tegaki-tools-0.3/src/tegaki-build 2009-09-08 02:25:28.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegaki-build 2010-03-23 07:07:56.000000000 +0000
@@ -24,13 +24,12 @@
 import os
 from optparse import OptionParser
 
-from tegaki.character import CharacterCollection
+from tegaki.charcol import CharacterCollection
 from tegaki.trainer import Trainer, TrainerError
-from tegaki.recognizer import Recognizer
 
-from tegakitools.tomoe import tomoe_dict_to_character_collection
+from tegakitools.charcol import *
 
-VERSION = '0.3'
+VERSION = '0.3.1'
 
 class TegakiBuildError(Exception):
     pass
@@ -40,7 +39,12 @@
     def __init__(self, options, args):
         self._directories = options.directories
         self._charcols = options.charcols
+        self._databases = options.databases
         self._tomoe = options.tomoe
+        self._kuchibue = options.kuchibue
+        self._include = options.include
+        self._exclude = options.exclude
+        self._max_samples = options.max_samples
         self._list = options.list
         if not self._list:
             self._trainer = args[0]
@@ -61,28 +65,26 @@
 
         # read meta file
         try:
-            meta = Recognizer.read_meta_file(self._meta)
+            meta = Trainer.read_meta_file(self._meta)
         except IOError, e:
             raise TegakiBuildError, str(e)
 
-        # add the directories provided
-        for directory in self._directories:
-            charcol += CharacterCollection.from_character_directory(directory)
-
-        # add the character collections provided
-        for charcol_path in self._charcols:
-            _charcol = CharacterCollection()
-            gzip = False; bz2 = False
-            if charcol_path.endswith(".gz"): gzip = True
-            if charcol_path.endswith(".bz2"): bz2 = True
-            _charcol.read(charcol_path, gzip=gzip, bz2=bz2)
-            charcol += _charcol
-
-        # add tomoe dictionaries provided
-        for tomoe in self._tomoe:
-            charcol += tomoe_dict_to_character_collection(tomoe)
+        charcol = get_aggregated_charcol(
+                        ((TYPE_CHARCOL, self._charcols),
+                         (TYPE_CHARCOL_DB, self._databases),
+                         (TYPE_DIRECTORY, self._directories),
+                         (TYPE_TOMOE, self._tomoe),
+                         (TYPE_KUCHIBUE, self._kuchibue)))
+
+
+        charcol.include_characters_from_files(self._include)
+        charcol.exclude_characters_from_files(self._exclude)
+
+        # max samples
+        if self._max_samples:
+            charcol.remove_samples(keep_at_most=self._max_samples)
 
-        if len(charcol.get_all_characters()) == 0:
+        if charcol.get_total_n_characters() == 0:
             raise TegakiBuildError, "No character samples to train!"
 
         trainer = self._get_trainer()
@@ -112,6 +114,7 @@
 parser = OptionParser(usage=usage, version="%prog " + VERSION,
                       description="Train a model")
 
+
 parser.add_option("-d", "--directory",
                   action="append", type="string", dest="directories",
                   default=[],
@@ -120,14 +123,37 @@
                   action="append", type="string", dest="charcols",
                   default=[],
                   help="character collection XML files")
+parser.add_option("-b", "--db",
+                  action="append", type="string", dest="databases",
+                  default=[],
+                  help="character collection XML files")
 parser.add_option("-t", "--tomoe-dict",
                   action="append", type="string", dest="tomoe",
                   default=[],
                   help="Tomoe XML dictionary files")
+parser.add_option("-k", "--kuchibue",
+                  action="append", type="string", dest="kuchibue",
+                  default=[],
+                  help="Kuchibue unipen database")
+
+
 parser.add_option("-l", "--list",
                   action="store_true",dest="list", default=False,
                   help="List available trainers")
 
+
+parser.add_option("-i", "--include",
+                  action="append", type="string", dest="include",
+                  default=[],
+                  help="File containing characters to include")
+parser.add_option("-e", "--exclude",
+                  action="append", type="string", dest="exclude",
+                  default=[],
+                  help="File containing characters to exclude")
+parser.add_option("-m", "--max-samples",
+                  type="int", dest="max_samples",
+                  help="Maximum number of samples per character")
+
 
 (options, args) = parser.parse_args()
 
 try:
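tegaki-build now takes the same repeatable input flags as the other tools. They rely on optparse's "append" action, standard-library behavior that collects each occurrence of a flag into a list:

    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("-c", "--charcol",
                      action="append", type="string", dest="charcols",
                      default=[],
                      help="character collection XML files")

    # '-c a.xml -c b.xml' accumulates into a single list
    options, args = parser.parse_args(["-c", "a.xml", "-c", "b.xml"])
    print options.charcols   # ['a.xml', 'b.xml']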
diff -Nru tegaki-tools-0.3/src/tegaki-convert tegaki-tools-0.3.1/src/tegaki-convert
--- tegaki-tools-0.3/src/tegaki-convert 2009-09-17 17:20:49.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegaki-convert 2010-03-23 07:07:56.000000000 +0000
@@ -24,12 +24,11 @@
 import os
 from optparse import OptionParser
 
-from tegaki.character import CharacterCollection
+from tegaki.charcol import CharacterCollection
 
-from tegakitools.tomoe import tomoe_dict_to_character_collection
-from tegakitools.kuchibue import kuchibue_to_character_collection
+from tegakitools.charcol import *
 
-VERSION = '0.3'
+VERSION = '0.3.1'
 
 class TegakiConvertError(Exception):
     pass
@@ -39,60 +38,31 @@
     def __init__(self, options, args):
         self._directories = options.directories
         self._charcols = options.charcols
+        self._databases = options.databases
        self._tomoe = options.tomoe
         self._kuchibue = options.kuchibue
         self._include = options.include
         self._exclude = options.exclude
         self._max_samples = options.max_samples
 
-        try:
+        if len(args) > 1:
+            raise TegakiConvertError, "tegaki-convert needs only 1 argument"
+        elif len(args) == 1:
             self._output_path = args[0]
-        except:
+        else:
             self._output_path = None
 
     def run(self):
-        charcol = CharacterCollection()
+        charcol = get_aggregated_charcol(
+                        ((TYPE_CHARCOL, self._charcols),
+                         (TYPE_CHARCOL_DB, self._databases),
+                         (TYPE_DIRECTORY, self._directories),
+                         (TYPE_TOMOE, self._tomoe),
+                         (TYPE_KUCHIBUE, self._kuchibue)), self._output_path)
 
-        # add the directories provided
-        for directory in self._directories:
-            charcol += CharacterCollection.from_character_directory(directory)
-
-        # add the character collections provided
-        for charcol_path in self._charcols:
-            _charcol = CharacterCollection()
-            gzip = False; bz2 = False
-            if charcol_path.endswith(".gz"): gzip = True
-            if charcol_path.endswith(".bz2"): bz2 = True
-            _charcol.read(charcol_path, gzip=gzip, bz2=bz2)
-            charcol += _charcol
-
-        # add tomoe dictionaries provided
-        for tomoe in self._tomoe:
-            charcol += tomoe_dict_to_character_collection(tomoe)
-
-        # add the kuchibue databases provided
-        for kuchibue in self._kuchibue:
-            charcol += kuchibue_to_character_collection(kuchibue)
-
-        # characters to include
-        buf = ""
-        for inc_path in self._include:
-            f = open(inc_path)
-            buf += f.read()
-            f.close()
-
-        if len(buf) > 0:
-            charcol.include_characters_from_text(buf)
-
-        # characters to exclude
-        buf = ""
-        for exc_path in self._exclude:
-            f = open(exc_path)
-            buf += f.read()
-            f.close()
-
-        if len(buf) > 0:
-            charcol.exclude_characters_from_text(buf)
+        charcol.include_characters_from_files(self._include)
+        charcol.exclude_characters_from_files(self._exclude)
 
         # max samples
         if self._max_samples:
@@ -103,10 +73,7 @@
             # outputs to stdout if not output path specified
             print charcol.to_xml()
         else:
-            gzip = False; bz2 = False
-            if self._output_path.endswith(".gz"): gzip = True
-            if self._output_path.endswith(".bz2"): bz2 = True
-            charcol.write(self._output_path, gzip=gzip, bz2=bz2)
+            charcol.save(self._output_path)
 
 parser = OptionParser(usage="usage: %prog [options] [output-path]",
                       version="%prog " + VERSION)
@@ -119,6 +86,10 @@
                   action="append", type="string", dest="charcols",
                   default=[],
                   help="character collection XML files")
+parser.add_option("-b", "--db",
+                  action="append", type="string", dest="databases",
+                  default=[],
+                  help="character collection XML files")
 parser.add_option("-t", "--tomoe-dict",
                   action="append", type="string", dest="tomoe",
                   default=[],
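The output side of tegaki-convert shrinks to one charcol.save(path) call. Judging from the extension-sniffing code it replaces and the .chardb handling added to tegakitools/charcol.py further down, save() presumably picks the on-disk format from the file name; a hedged sketch of that assumed usage:

    from tegaki.charcol import CharacterCollection

    charcol = CharacterCollection()
    charcol.save("out.xml")       # plain XML (assumed)
    charcol.save("out.xml.gz")    # gzip-compressed XML (assumed)
    charcol.save("out.chardb")    # SQLite-backed database (assumed)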
diff -Nru tegaki-tools-0.3/src/tegaki-eval tegaki-tools-0.3.1/src/tegaki-eval
--- tegaki-tools-0.3/src/tegaki-eval 2009-10-19 08:58:33.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegaki-eval 2010-03-23 07:07:56.000000000 +0000
@@ -25,12 +25,12 @@
 import time
 from optparse import OptionParser
 
-from tegaki.character import CharacterCollection
+from tegaki.charcol import CharacterCollection
 from tegaki.recognizer import Recognizer, RecognizerError
 
-from tegakitools.tomoe import tomoe_dict_to_character_collection
+from tegakitools.charcol import *
 
-VERSION = '0.3'
+VERSION = '0.3.1'
 
 def harmonic_mean(x1, x2):
     if x1 == 0.0 and x2 == 0.0:
@@ -48,9 +48,15 @@
     def __init__(self, options, args):
         self._verbosity_level = options.verbosity_level
         self._directories = options.directories
+        self._databases = options.databases
         self._charcols = options.charcols
         self._tomoe = options.tomoe
+        self._kuchibue = options.kuchibue
         self._list = options.list
+        self._include = options.include
+        self._exclude = options.exclude
+        self._max_samples = options.max_samples
+
         if not self._list:
             self._recognizer = args[0]
             self._model = args[1]
@@ -67,25 +73,22 @@
                                  in avail_recognizers])
 
     def _recognize(self):
-        charcol = CharacterCollection()
-
-        # add the directories provided
-        for directory in self._directories:
-            charcol += CharacterCollection.from_character_directory(directory)
-
-        # add the character collections provided
-        for charcol_path in self._charcols:
-            _charcol = CharacterCollection()
-            gzip = False; bz2 = False
-            if charcol_path.endswith(".gz"): gzip = True
-            if charcol_path.endswith(".bz2"): bz2 = True
-            _charcol.read(charcol_path, gzip=gzip, bz2=bz2)
-            charcol += _charcol
-
-        # add tomoe dictionaries provided
-        for tomoe in self._tomoe:
-            charcol += tomoe_dict_to_character_collection(tomoe)
+        charcol = get_aggregated_charcol(
+                        ((TYPE_CHARCOL, self._charcols),
+                         (TYPE_CHARCOL_DB, self._databases),
+                         (TYPE_DIRECTORY, self._directories),
+                         (TYPE_TOMOE, self._tomoe),
+                         (TYPE_KUCHIBUE, self._kuchibue)))
+
+
+        charcol.include_characters_from_files(self._include)
+        charcol.exclude_characters_from_files(self._exclude)
+
+        # max samples
+        if self._max_samples:
+            charcol.remove_samples(keep_at_most=self._max_samples)
 
+        # FIXME: don't load all characters in memory
         all_chars = charcol.get_all_characters()
 
         if len(all_chars) == 0:
@@ -279,6 +282,8 @@
 parser.add_option("-v", "--verbosity-level",
                   type="int", dest="verbosity_level", default=0,
                   help="verbosity level between 0 and 2")
+
+
 parser.add_option("-d", "--directory",
                   action="append", type="string", dest="directories",
                   default=[],
@@ -287,14 +292,40 @@
                   action="append", type="string", dest="charcols",
                   default=[],
                   help="character collection XML files")
+parser.add_option("-b", "--db",
+                  action="append", type="string", dest="databases",
+                  default=[],
+                  help="character collection XML files")
 parser.add_option("-t", "--tomoe-dict",
                   action="append", type="string", dest="tomoe",
                   default=[],
                   help="Tomoe XML dictionary files")
+parser.add_option("-k", "--kuchibue",
+                  action="append", type="string", dest="kuchibue",
+                  default=[],
+                  help="Kuchibue unipen database")
+
+
 parser.add_option("-l", "--list",
                   action="store_true",dest="list", default=False,
                   help="List available recognizers and models")
 
+
+parser.add_option("-i", "--include",
+                  action="append", type="string", dest="include",
+                  default=[],
+                  help="File containing characters to include")
+parser.add_option("-e", "--exclude",
+                  action="append", type="string", dest="exclude",
+                  default=[],
+                  help="File containing characters to exclude")
+parser.add_option("-m", "--max-samples",
+                  type="int", dest="max_samples",
+                  help="Maximum number of samples per character")
+
+
+
 (options, args) = parser.parse_args()
 
 try:
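harmonic_mean itself is untouched here; only its first two lines appear as context. As a reminder of what such a helper computes, a sketch consistent with the zero guard above (the exact upstream body is not shown in this diff):

    def harmonic_mean(x1, x2):
        if x1 == 0.0 and x2 == 0.0:
            return 0.0
        return 2 * x1 * x2 / (x1 + x2)

    print harmonic_mean(0.9, 0.5)   # 0.642857..., an F1-style combination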
diff -Nru tegaki-tools-0.3/src/tegaki-stats tegaki-tools-0.3.1/src/tegaki-stats
--- tegaki-tools-0.3/src/tegaki-stats 2009-10-20 05:48:16.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegaki-stats 2010-03-23 07:07:56.000000000 +0000
@@ -24,11 +24,10 @@
 import os
 from optparse import OptionParser
 
-from tegaki.character import CharacterCollection
-
+from tegaki.charcol import CharacterCollection
 from tegakitools.charcol import *
 
-VERSION = '0.3'
+VERSION = '0.3.1'
 
 class TegakiStatsError(Exception):
     pass
@@ -41,6 +40,7 @@
     def __init__(self, options, args):
         self._directories = options.directories
         self._charcols = options.charcols
+        self._databases = options.databases
         self._tomoe = options.tomoe
         self._kuchibue = options.kuchibue
         self._include = options.include
@@ -51,6 +51,7 @@
     def run(self):
         charcol = get_aggregated_charcol(
                 ((TYPE_CHARCOL, self._charcols),
+                 (TYPE_CHARCOL_DB, self._databases),
                  (TYPE_DIRECTORY, self._directories),
                  (TYPE_TOMOE, self._tomoe),
                  (TYPE_KUCHIBUE, self._kuchibue)))
@@ -76,14 +77,16 @@
 
         print "Total number of samples: ", charcol.get_total_n_characters()
 
-        n_samples = [len(samp_by_class[k]) for k in samp_by_class.keys()]
+        print "Total number of strokes: ", charcol.get_total_n_strokes()
+
+        n_samples = samp_by_class.values()
         avg = float(sum(n_samples)) / len(n_samples)
         print "Average number of samples per character/class: %0.2f" % avg
 
         if self._verbosity_level >= 2:
             print "\nNumber of samples for each character:"
-            for utf8, chars in samp_by_class.items():
-                print "%s: %d" % (utf8, len(chars))
+            for utf8, n_chars in samp_by_class.items():
+                print "%s: %d" % (utf8, n_chars)
             print "\n"
 
         classes_by_sc = self._get_classes_by_stroke_count(charcol)
@@ -112,17 +115,17 @@
     def _get_samples_by_class(self, charcol):
         d = {}
         for set_name in charcol.get_set_list():
-            for char in charcol.get_characters(set_name):
-                utf8 = char.get_utf8()
-                d[utf8] = d.get(utf8, []) + [char]
+            for row in charcol.get_character_rows(set_name):
+                utf8 = row['utf8'].encode("utf8")
+                d[utf8] = d.get(utf8, 0) + 1
         return d
 
     def _get_classes_by_stroke_count(self, charcol):
         d = {}
         for set_name in charcol.get_set_list():
-            for char in charcol.get_characters(set_name):
-                n_strokes = char.get_writing().get_n_strokes()
-                utf8 = char.get_utf8()
+            for row in charcol.get_character_rows(set_name):
+                n_strokes = row['n_strokes']
+                utf8 = row['utf8'].encode("utf8")
                 d[n_strokes] = d.get(n_strokes, [])
                 if not utf8 in d[n_strokes]:
                     d[n_strokes].append(utf8)
@@ -131,9 +134,9 @@
     def _get_stroke_counts_by_class(self, charcol):
         d = {}
         for set_name in charcol.get_set_list():
-            for char in charcol.get_characters(set_name):
-                n_strokes = char.get_writing().get_n_strokes()
-                utf8 = char.get_utf8()
+            for row in charcol.get_character_rows(set_name):
+                n_strokes = row['n_strokes']
+                utf8 = row['utf8'].encode("utf8")
                 d[utf8] = d.get(utf8, [])
                 if not n_strokes in d[utf8]:
                     d[utf8].append(n_strokes)
@@ -152,6 +155,10 @@
                   action="append", type="string", dest="charcols",
                   default=[],
                   help="character collection XML files")
+parser.add_option("-b", "--db",
+                  action="append", type="string", dest="databases",
+                  default=[],
+                  help="character collection XML files")
 parser.add_option("-t", "--tomoe-dict",
                   action="append", type="string", dest="tomoe",
                   default=[],
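_get_samples_by_class now returns {utf8: count} built from lightweight get_character_rows() rows instead of {utf8: [Character, ...]} built from fully parsed characters, so the summary statistics come straight from values(). The reporting math with hypothetical counts:

    samp_by_class = {"A": 3, "B": 1}   # hypothetical per-class counts
    n_samples = samp_by_class.values()
    avg = float(sum(n_samples)) / len(n_samples)
    print "Average number of samples per character/class: %0.2f" % avg  # 2.00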
diff -Nru tegaki-tools-0.3/src/tegakitools/charcol.py tegaki-tools-0.3.1/src/tegakitools/charcol.py
--- tegaki-tools-0.3/src/tegakitools/charcol.py 2009-10-19 08:26:23.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegakitools/charcol.py 2010-02-27 04:52:04.000000000 +0000
@@ -20,43 +20,73 @@
 # Contributors to this file:
 # - Mathieu Blondel
 
-from tegaki.character import CharacterCollection
+import os
+
+from tegaki.charcol import CharacterCollection
 
 from tegakitools.tomoe import tomoe_dict_to_character_collection
 from tegakitools.kuchibue import kuchibue_to_character_collection
 
-TYPE_CHARCOL, TYPE_DIRECTORY, TYPE_TOMOE, TYPE_KUCHIBUE = range(4)
+TYPE_CHARCOL, TYPE_CHARCOL_DB, TYPE_DIRECTORY, TYPE_TOMOE, TYPE_KUCHIBUE = \
+range(5)
+
+def _get_charcol(charcol_type, charcol_path):
+    if charcol_type == TYPE_DIRECTORY:
+        # charcol_path is actually a directory here
+        return CharacterCollection.from_character_directory(charcol_path)
+
+    elif charcol_type in (TYPE_CHARCOL, TYPE_CHARCOL_DB):
+        return CharacterCollection(charcol_path)
+
+    elif charcol_type == TYPE_TOMOE:
+        return tomoe_dict_to_character_collection(charcol_path)
 
-def get_aggregated_charcol(tuples):
+    elif charcol_type == TYPE_KUCHIBUE:
+        return kuchibue_to_character_collection(charcol_path)
+
+
+def get_aggregated_charcol(tuples, dbpath=None):
     """
     Create a character collection out of other character collections,
     character directories, tomoe dictionaries or kuchibue databases.
 
-    tuples: a list of tuples (TYPE, list)
+    tuples: a list of tuples (TYPE, path list)
     """
 
-    charcol = CharacterCollection()
-    for typ, files in tuples:
-        if typ == TYPE_DIRECTORY:
-            # files should actually contain a list of directories
-            for d in files:
-                charcol += CharacterCollection.from_character_directory(d)
-
-        elif typ == TYPE_CHARCOL:
-            for charcol_path in files:
-                _charcol = CharacterCollection()
-                gzip = False; bz2 = False
-                if charcol_path.endswith(".gz"): gzip = True
-                if charcol_path.endswith(".bz2"): bz2 = True
-                _charcol.read(charcol_path, gzip=gzip, bz2=bz2)
-                charcol += _charcol
-
-        elif typ == TYPE_TOMOE:
-            for tomoe in files:
-                charcol += tomoe_dict_to_character_collection(tomoe)
-
-        elif typ == TYPE_KUCHIBUE:
-            for kuchibue in files:
-                charcol += kuchibue_to_character_collection(kuchibue)
+    # number of files for each character collection type
+    n_files = [len(t[1]) for t in tuples]
+
+    # we don't need to merge character collections if only one is provided
+    # this can save a lot of time for large collections
+    if sum(n_files) == 1 and dbpath is None:
+        idx = n_files.index(1)
+        return _get_charcol(tuples[idx][0], tuples[idx][1][0])
+
+    if dbpath is not None and dbpath.endswith(".chardb"):
+        if os.path.exists(dbpath):
+            print "%s exists already." % dbpath
+            print "Continuing will modify it..."
+            answer = raw_input("Continue anyway? (y/N)")
+            if answer == "y":
+                print "Overwrite to concatenate collections together " + \
+                      "in a new database"
+                print "Don't overwrite to append new characters or " + \
+                      "filter (-i,-e,-m) existing database"
+                answer = raw_input("Overwrite it? (y/N)")
+                if answer == "y":
+                    os.unlink(dbpath)
+            else:
+                exit()
+
+        charcol = CharacterCollection(dbpath)
+        #charcol.WRITE_BACK = False
+        #charcol.AUTO_COMMIT = True
+    else:
+        charcol = CharacterCollection() # in memory db
+
+    charcols = [_get_charcol(typ, path) \
+                    for typ, paths in tuples for path in paths]
+
+    charcol.merge(charcols)
 
     return charcol
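Two behaviors of the new get_aggregated_charcol are worth spelling out: with exactly one input and no target database the collection is returned directly, skipping the merge, and with a dbpath ending in .chardb the inputs are aggregated straight into an on-disk database, after prompting if the file already exists. A usage sketch with hypothetical paths:

    from tegakitools.charcol import get_aggregated_charcol, TYPE_TOMOE

    # Single input, no dbpath: returned as-is, no merge cost.
    charcol = get_aggregated_charcol(((TYPE_TOMOE, ["handwriting-ja.xml"]),))

    # Same input aggregated into an on-disk SQLite-backed database.
    charcol = get_aggregated_charcol(((TYPE_TOMOE, ["handwriting-ja.xml"]),),
                                     dbpath="all.chardb")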
diff -Nru tegaki-tools-0.3/src/tegakitools/kuchibue.py tegaki-tools-0.3.1/src/tegakitools/kuchibue.py
--- tegaki-tools-0.3/src/tegakitools/kuchibue.py 2009-09-18 09:32:19.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegakitools/kuchibue.py 2010-02-27 04:52:04.000000000 +0000
@@ -26,8 +26,8 @@
 import re
 import os
 
-from tegaki.character import Point, Stroke, Writing, Character, \
-                             CharacterCollection
+from tegaki.character import Point, Stroke, Writing, Character
+from tegaki.charcol import CharacterCollection
 
 from unipen import UnipenParser
 from shiftjis import SHIFT_JIS_TABLE
diff -Nru tegaki-tools-0.3/src/tegakitools/tomoe.py tegaki-tools-0.3.1/src/tegakitools/tomoe.py
--- tegaki-tools-0.3/src/tegakitools/tomoe.py 2009-07-10 11:45:20.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegakitools/tomoe.py 2010-02-27 04:52:04.000000000 +0000
@@ -20,8 +20,8 @@
 # Contributors to this file:
 # - Mathieu Blondel
 
-from tegaki.character import Point, Stroke, Writing, Character, \
-                             CharacterCollection, _XmlBase
+from tegaki.character import Point, Stroke, Writing, Character, _XmlBase
+from tegaki.charcol import CharacterCollection
 
 class TomoeXmlDictionaryReader(_XmlBase):
diff -Nru tegaki-tools-0.3/src/tegakitools/unipen.py tegaki-tools-0.3.1/src/tegakitools/unipen.py
--- tegaki-tools-0.3/src/tegakitools/unipen.py 2009-09-08 13:22:25.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegakitools/unipen.py 2010-02-27 04:52:04.000000000 +0000
@@ -26,8 +26,8 @@
 import re
 import os
 
-from tegaki.character import Point, Stroke, Writing, Character, \
-                             CharacterCollection
+from tegaki.character import Point, Stroke, Writing, Character
+from tegaki.charcol import CharacterCollection
 
 class UnipenEventParser(object):
     """SAX-like event-based parser"""
@@ -147,8 +147,17 @@
     def get_character_collection(self):
         charcol = CharacterCollection()
         assert(len(self._labels) == len(self._characters))
+
+        # group characters with the same label into sets
+        sets = {}
         for i in range(len(self._characters)):
-            self._characters[i].set_utf8(self._labels[i])
-            charcol.add_set(self._labels[i])
-            charcol.append_character(self._labels[i], self._characters[i])
+            utf8 = self._labels[i]
+            self._characters[i].set_utf8(utf8)
+            sets[utf8] = sets.get(utf8, []) + [self._characters[i]]
+
+        charcol.add_sets(sets.keys())
+
+        for set_name, characters in sets.items():
+            charcol.append_characters(set_name, characters)
+
         return charcol
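The unipen change batches the collection API: samples are grouped by label first, then committed with one add_sets() call plus one append_characters() call per distinct label, rather than one add_set/append_character round trip per sample. The grouping step in isolation, with hypothetical stand-in data:

    # labels[i] is the class of characters[i], as in UnipenParser above
    labels = ["a", "b", "a"]
    characters = ["char1", "char2", "char3"]   # stand-ins for Character objects

    sets = {}
    for label, char in zip(labels, characters):
        sets[label] = sets.get(label, []) + [char]

    print sets   # {'a': ['char1', 'char3'], 'b': ['char2']}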