diff -Nru tegaki-tools-0.3/ChangeLog tegaki-tools-0.3.1/ChangeLog
--- tegaki-tools-0.3/ChangeLog 2009-11-01 12:07:16.000000000 +0000
+++ tegaki-tools-0.3.1/ChangeLog 2010-03-23 07:29:23.000000000 +0000
@@ -1,3 +1,69 @@
+commit 5b0b86343c91cbf3d7ee683e82f41f9df75edf4d
+Author: Mathieu Blondel
+Date:   Mon Mar 22 15:32:33 2010 +0900
+
+    [all] Set current version to 0.3.1.
+
+commit ae808ddbdffa3750a71d2636fe475d5d774516f1
+Author: Mathieu Blondel
+Date:   Wed Dec 9 17:16:34 2009 +0900
+
+    [tegaki-stats] Display total number of strokes.
+
+commit 07a78dd10e599ecbf6d255704551955e0e696a2d
+Author: Mathieu Blondel
+Date:   Wed Dec 9 16:13:38 2009 +0900
+
+    [tegaki-tool] Ask whether to overwrite existing db or not.
+
+commit ec14631fab52e09b073e26fd58112e96f5a56d64
+Author: Mathieu Blondel
+Date:   Wed Dec 9 14:12:37 2009 +0900
+
+    [tegaki-tools] Various optimizations.
+
+commit 690a5a89ac2dd301e3efc673912d8ad8c6be0cac
+Author: Mathieu Blondel
+Date:   Wed Dec 2 20:24:23 2009 +0900
+
+    [tegaki-tools] Use CharacterCollection new save method.
+
+commit f5fa2429291939d7358a61aeafcd178e5fccd4c4
+Author: Mathieu Blondel
+Date:   Tue Dec 1 17:52:55 2009 +0900
+
+    [tegaki-build] Don't import Recognizer.
+
+commit 90fd8fa22706942945682795acaab65f1280a86c
+Author: Mathieu Blondel
+Date:   Tue Dec 1 17:30:06 2009 +0900
+
+    [tegaki-tools] Support for new CharacterCollection features.
+
+commit 4cb5520040c22a617d87dc5b6c26ed1a8c16ad75
+Author: Christoph Burgmer
+Date:   Sun Nov 22 12:15:18 2009 +0100
+
+    [tegaki-bootstrap] always provide max_samples, mix exact and decomposition transformations; make max_samples default to 1; optimize
+
+commit 8a7cbb7c31c6672876d71d9f8915052023036fb8
+Author: Christoph Burgmer
+Date:   Fri Nov 20 13:31:37 2009 +0100
+
+    [tegaki-bootstrap] Optimize memory footprint.
+
+commit 5a73c1b7118429ba59ddeedf3a38dc3b5fb84511
+Author: Christoph Burgmer
+Date:   Fri Nov 20 00:55:15 2009 +0100
+
+    [tegaki-extractcomponents] Add a tool to extract handwriting of a component.
+
+commit 909dac8225b23cddeeb898723a02818f1cac4ea6
+Author: Christoph Burgmer
+Date:   Fri Nov 20 00:22:18 2009 +0100
+
+    [tegaki-bootstrap] Add option to exclude direct transformations. Also optimize memory useage for 'big' values of "-m".
+
 commit 23428970b5125e6effc233ff3261e6cb368ff93d
 Author: Mathieu Blondel
 Date:   Sun Nov 1 20:56:48 2009 +0900
diff -Nru tegaki-tools-0.3/debian/changelog tegaki-tools-0.3.1/debian/changelog
--- tegaki-tools-0.3/debian/changelog 2010-05-10 00:36:17.000000000 +0100
+++ tegaki-tools-0.3.1/debian/changelog 2010-03-26 15:41:31.000000000 +0000
@@ -1,3 +1,11 @@
+tegaki-tools (0.3.1-1) unstable; urgency=low
+
+  * New upstream release.
+  * debian/control: bump standards version to 3.8.4.
+  * debian/source/format: 3.0.
+
+ -- LI Daobing  Fri, 26 Mar 2010 23:39:15 +0800
+
 tegaki-tools (0.3-1) unstable; urgency=low
 
   * New upstream release.
diff -Nru tegaki-tools-0.3/debian/control tegaki-tools-0.3.1/debian/control
--- tegaki-tools-0.3/debian/control 2010-05-10 00:36:17.000000000 +0100
+++ tegaki-tools-0.3.1/debian/control 2010-03-26 15:39:11.000000000 +0000
@@ -4,7 +4,7 @@
 Maintainer: LI Daobing
 Build-Depends: debhelper (>= 5), python
 Build-Depends-Indep: python-support (>= 0.6)
-Standards-Version: 3.8.3
+Standards-Version: 3.8.4
 Homepage: http://www.tegaki.org/
 Vcs-Browser: https://code.launchpad.net/~lidaobing/tegaki/tegaki-tools
 Vcs-Bzr: lp:~lidaobing/tegaki/tegaki-tools
diff -Nru tegaki-tools-0.3/debian/source/format tegaki-tools-0.3.1/debian/source/format
--- tegaki-tools-0.3/debian/source/format 1970-01-01 01:00:00.000000000 +0100
+++ tegaki-tools-0.3.1/debian/source/format 2010-05-10 00:36:18.000000000 +0100
@@ -0,0 +1 @@
+3.0 (quilt)
diff -Nru tegaki-tools-0.3/PKG-INFO tegaki-tools-0.3.1/PKG-INFO
--- tegaki-tools-0.3/PKG-INFO 2009-11-01 12:07:16.000000000 +0000
+++ tegaki-tools-0.3.1/PKG-INFO 2010-03-23 07:29:23.000000000 +0000
@@ -1,6 +1,6 @@
 Metadata-Version: 1.0
 Name: tegaki-tools
-Version: 0.3
+Version: 0.3.1
 Summary: A set of command-line tools for Tegaki.
 Home-page: http://www.tegaki.org
 Author: Mathieu Blondel
diff -Nru tegaki-tools-0.3/src/tegaki-bootstrap tegaki-tools-0.3.1/src/tegaki-bootstrap
--- tegaki-tools-0.3/src/tegaki-bootstrap 2009-11-01 06:54:59.000000000 +0000
+++ tegaki-tools-0.3.1/src/tegaki-bootstrap 2010-02-27 04:52:04.000000000 +0000
@@ -55,12 +55,12 @@
 """
 import sys
 import locale
+import random
+random.seed(12345) # provide deterministic results
 
 from optparse import OptionParser
 
 from tegaki.character import CharacterCollection, Writing, Character
-from tegakitools.tomoe import tomoe_dict_to_character_collection
-from tegakitools.kuchibue import kuchibue_to_character_collection
 from tegakitools.charcol import *
 
 try:
@@ -113,8 +113,10 @@
         self._include = options.include
         self._exclude = options.exclude
         self._max_samples = options.max_samples
+        assert self._max_samples
         self._locale = options.locale
         self._character_domain = options.character_domain
+        self._no_exact_transformation = options.no_exact_transformation
         self._quiet = options.quiet
 
         try:
@@ -125,34 +127,25 @@
         self._cjk = CharacterLookup(self._locale, self._character_domain)
 
     def _get_charcol(self):
-        if not hasattr(self, '_charcol'):
-            self._charcol = get_aggregated_charcol(
-                ((TYPE_CHARCOL, self._charcols),
-                 (TYPE_DIRECTORY, self._directories),
-                 (TYPE_TOMOE, self._tomoe),
-                 (TYPE_KUCHIBUE, self._kuchibue)))
-
-            self._charcol.include_characters_from_files(self._include)
-            self._charcol.exclude_characters_from_files(self._exclude)
-
-            # max samples
-            if self._max_samples:
-                self._charcol.remove_samples(keep_at_most=self._max_samples)
+        _charcol = get_aggregated_charcol(((TYPE_CHARCOL, self._charcols),
+                                           (TYPE_DIRECTORY, self._directories),
+                                           (TYPE_TOMOE, self._tomoe),
+                                           (TYPE_KUCHIBUE, self._kuchibue)))
 
-        return self._charcol
+        _charcol.include_characters_from_files(self._include)
+        _charcol.exclude_characters_from_files(self._exclude)
 
-    def run(self):
-        charcol = self._get_charcol()
+        # max samples
+        _charcol.remove_samples(keep_at_most=self._max_samples)
 
-        if charcol.get_total_n_characters() == 0:
-            raise TegakiBootstrapError("Empty input collection provided")
+        return _charcol
 
+    def run(self):
         # do the bootstrapping
-        to_charcol = self.bootstrap(charcol)
+        to_charcol = self.bootstrap()
 
         # max samples
-        if self._max_samples:
-            to_charcol.remove_samples(keep_at_most=self._max_samples)
+        #to_charcol.remove_samples(keep_at_most=self._max_samples)
 
         # output
         if not self._output_path:
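The rewritten _get_charcol above routes every input format through a single get_aggregated_charcol call and now always caps samples, since --max-samples defaults to 1 (see the option change later in this file). A minimal sketch of the calling pattern, with hypothetical paths; get_aggregated_charcol, the TYPE_* constants and remove_samples are the tegakitools/tegaki APIs visible in this diff:

    from tegakitools.charcol import (get_aggregated_charcol,
                                     TYPE_CHARCOL, TYPE_TOMOE)

    # One XML collection plus one Tomoe dictionary, capped at one sample
    # per character, as tegaki-bootstrap now does by default.
    charcol = get_aggregated_charcol(
        ((TYPE_CHARCOL, ["collection.xml"]),      # hypothetical paths
         (TYPE_TOMOE, ["tomoe-dict.xml"])))
    charcol.remove_samples(keep_at_most=1)

The module-level random.seed(12345) added in the import hunk makes the shuffles performed later in the bootstrap reproducible from run to run.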
@@ -164,46 +157,69 @@
         if self._output_path.endswith(".bz2"): bz2 = True
         to_charcol.write(self._output_path, gzip=gzip, bz2=bz2)
 
-    def bootstrap(self, charcol):
-        exact_transformations = 0
-        decomposition_transformations = 0
-        decomposition_fertilities = []
-        missing_transformations = 0
+    def bootstrap(self):
+        n_chars = 0
+        n_exact_transformations = 0
+        char_n_exact_transformations = 0
+        n_decomposition_transformations = 0
+        char_n_decomposition_transformations = 0
+
+        char_n_missing_transformations = 0
+        char_n_underrepresented = 0
 
         to_charcol = CharacterCollection()
 
         missing_char_dict = {}
         missing_single_characters = []
 
         # iterate through all characters of the target character set
-        count = 0
         for target_char in self._cjk.getDomainCharacterIterator():
         #for target_char in iter([u'亄', u'乿', u'仜', u'伳']): # DEBUG
-            count += 1
-            if count % 100 == 0:
+            n_chars += 1
+            if n_chars % 100 == 0:
                 sys.stdout.write('.')
                 sys.stdout.flush()
 
-            charSet = target_char.encode('utf8')
+            char_set = target_char.encode('utf8')
 
             source_character_lookup = self._get_source_character_lookup()
-            if target_char in source_character_lookup:
-                to_charcol.add_set(charSet)
-                for character in source_character_lookup[target_char]:
-                    to_charcol.append_character(charSet, character)
-                exact_transformations += 1
-            else:
+            exact_transformations = 0
+            if (target_char in source_character_lookup
+                and not self._no_exact_transformation):
+                char_n_exact_transformations += 1
+
+                to_charcol.add_set(char_set)
+                source_chars = source_character_lookup[target_char]
+                for character in source_chars[:self._max_samples]:
+                    exact_transformations += 1
+                    to_charcol.append_character(char_set, character)
+
+            n_exact_transformations += exact_transformations
+            n_total_transformation = exact_transformations
+
+            # fill up with decomposition transformations?
+            need_n_chars = self._max_samples - exact_transformations
+
+            decomposition_transformations = 0
+            if need_n_chars > 0:
                 writing_objects, missing_chars \
-                    = self.get_writings_from_decomposition(target_char)
+                    = self.get_writings_from_decomposition(target_char,
+                                                    force_decomposition=True)
                 if writing_objects:
+                    char_n_decomposition_transformations += 1
+
+                    writing_objects = writing_objects[:need_n_chars]
                     for writing in writing_objects:
+                        decomposition_transformations += 1
+
                         character = Character()
                         character.set_writing(writing)
                         character.set_unicode(target_char)
-                        to_charcol.append_character(charSet, character)
+                        to_charcol.append_character(char_set, character)
 
-                    decomposition_transformations += 1
-                    decomposition_fertilities.append(len(writing_objects))
-                else:
+            n_total_transformation += decomposition_transformations
+
+            if n_total_transformation == 0:
                 if missing_chars:
+                    # list components that can help us build this transform.
                     for missing in missing_chars:
                         if missing not in missing_char_dict:
                             missing_char_dict[missing] = []
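The rewritten loop above implements a top-up policy: exact samples of the target character are used first (unless disabled), and decomposition-based samples only fill the remaining quota up to max_samples. The policy in isolation, as a sketch with hypothetical sample lists:

    def top_up(exact_samples, decomposition_samples, max_samples):
        # Prefer exact handwriting samples; fill any shortfall with
        # samples synthesized from component decompositions.
        picked = exact_samples[:max_samples]
        need = max_samples - len(picked)
        if need > 0:
            picked += decomposition_samples[:need]
        return picked

    print top_up(['e1'], ['d1', 'd2', 'd3'], 3)   # ['e1', 'd1', 'd2']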
@@ -211,28 +227,49 @@
                 else:
                     missing_single_characters.append(target_char)
 
-            missing_transformations += 1
+            n_decomposition_transformations += decomposition_transformations
+
+            # if no direct transformation exists we have no data at all
+            if n_total_transformation == 0:
+                char_n_missing_transformations += 1
+            elif n_total_transformation < self._max_samples:
+                # we have data, just not enough
+                char_n_underrepresented += 1
 
         sys.stdout.write('\n')
 
         if not self._quiet:
             _, default_encoding = locale.getdefaultlocale()
 
-            total = exact_transformations + decomposition_transformations \
-                + missing_transformations
-            print 'Exact transformation count: %d (%d%%)' \
-                % (exact_transformations, 100 * exact_transformations / total)
-            print 'Decomposition transformation count: %d (%d%%)' \
-                % (decomposition_transformations,
-                   100 * decomposition_transformations / total)
-            if decomposition_fertilities:
-                decomposition_fertility = (sum(decomposition_fertilities) \
-                    / len(decomposition_fertilities))
-            else:
-                decomposition_fertility = 1
-            print 'Decomposition fertility: %d' % decomposition_fertility
-            print 'Missing transformations: %d (%d%%)' \
-                % (missing_transformations,
-                   100 * missing_transformations / total)
+            total = n_exact_transformations + n_decomposition_transformations
+
+            print 'Total characters: %d' % n_chars
+            print 'Total transformation (instances): %d' % total
+
+            print 'Characters with exact transformations: %d (%d%%)' \
+                % (char_n_exact_transformations,
+                   100 * char_n_exact_transformations / n_chars)
+            print 'Total exact transformations: %d (%d%%)' \
+                % (n_exact_transformations,
+                   100 * n_exact_transformations / total)
+            print 'Average exact transformations: %f' \
+                % (1. * n_exact_transformations / n_chars)
+
+            print 'Characters with decomposition transformations: %d (%d%%)' \
+                % (char_n_decomposition_transformations,
+                   100 * char_n_decomposition_transformations / n_chars)
+            print 'Total decomposition transformations: %d (%d%%)' \
+                % (n_decomposition_transformations,
+                   100 * n_decomposition_transformations / total)
+            print 'Average decomposition transformations: %f' \
+                % (1. * n_decomposition_transformations / n_chars)
+
+            print 'Characters missing transformations: %d (%d%%)' \
+                % (char_n_missing_transformations,
+                   100 * char_n_missing_transformations / n_chars)
+            if self._max_samples > 1:
+                print 'Characters with less than %d instances: %d (%d%%)' \
+                    % (self._max_samples, char_n_underrepresented,
+                       100 * char_n_underrepresented / n_chars)
 
         # missing single characters
         # Extend by those with components, that have a component with low
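The statistics above separate per-character counters (char_n_*: how many characters obtained at least one sample of a given kind) from per-instance counters (n_*: how many samples were produced in total). A worked example with hypothetical numbers, assuming max_samples=2 and three target characters, A with two exact samples, B with one exact plus one decomposition, C with nothing:

    n_chars = 3
    char_n_exact_transformations = 2         # A and B
    n_exact_transformations = 3              # 2 from A + 1 from B
    char_n_decomposition_transformations = 1 # only B needed decompositions
    n_decomposition_transformations = 1
    char_n_missing_transformations = 1       # C
    total = n_exact_transformations + n_decomposition_transformations  # 4

The percentage expressions rely on Python 2 integer division (100 * 2 / 3 == 66), which %d then prints truncated; the 1. * coercion in the averages is there precisely to avoid that truncation.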
@@ -289,7 +326,12 @@
     def _get_source_character_lookup(self):
         if not hasattr(self, '_source_character_lookup'):
             self._source_character_lookup = {}
-            for character in self._get_charcol().get_all_characters():
+
+            charcol = self._get_charcol()
+            if charcol.get_total_n_characters() == 0:
+                raise TegakiBootstrapError("Empty input collection provided")
+
+            for character in charcol.get_all_characters():
                 char = character.get_utf8().decode('utf8')
                 if char not in self._source_character_lookup:
                     self._source_character_lookup[char] = []
@@ -297,23 +339,29 @@
 
         return self._source_character_lookup
 
-    def get_writings_from_decomposition(self, char):
+    def get_writings_from_decomposition(self, char, force_decomposition=False):
         writing_objects = []
-        if char in self._get_source_character_lookup():
-            writing_objects = [character.get_writing() \
-                for character in self._get_source_character_lookup()[char]]
-        elif (CharacterLookup.isRadicalChar(char)
+
+        source_char_lookup = self._get_source_character_lookup()
+
+        exact_transformations = 0
+        if (not force_decomposition and char in source_char_lookup):
+            writing_objects.extend([character.get_writing() \
+                for character in source_char_lookup[char]])
+
+        if (CharacterLookup.isRadicalChar(char)
             and char not in self.RADICALS_NON_VISUAL_EQUIVALENCE):
             try:
                 equivChar = self._cjk.getRadicalFormEquivalentCharacter(char)
-                if equivChar in self._get_source_character_lookup():
-                    writing_objects = [character.get_writing() for character \
-                        in self._get_source_character_lookup()[equivChar]]
+                if equivChar in source_char_lookup:
+                    writing_objects.extend([character.get_writing()
+                        for character in source_char_lookup[equivChar]])
             except UnsupportedError:
                 pass
 
+        # add decompositions, limit to upper bound max_samples
         missing_chars = []
-        if not writing_objects:
+        if len(writing_objects) < self._max_samples:
             decompositions = self._cjk.getDecompositionEntries(char)
             for decomposition in decompositions:
                 writing_objs, _, missing = self._get_writing_from_entry(
@@ -322,8 +370,10 @@
                     missing_chars.extend(missing)
                 writing_objects.extend(writing_objs)
 
-        if writing_objects:
-            missing_chars = []
+                if len(writing_objects) >= self._max_samples:
+                    break
+
+        writing_objects = writing_objects[:self._max_samples]
 
         return writing_objects, missing_chars
 
@@ -365,8 +415,12 @@
         # merge
         writing_objects = []
         if not missing_chars:
-            for writing_objs in TegakiBootstrap.cross(
-                *writing_objects_list):
+            compound_writings = TegakiBootstrap.cross(*writing_objects_list)
+            # shuffle to provide more variation
+            random.shuffle(compound_writings)
+            compound_writings = compound_writings[:self._max_samples]
+
+            for writing_objs in compound_writings:
                 writing = self.merge_writing_objects(character, writing_objs)
                 writing_objects.append(writing)
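TegakiBootstrap.cross enumerates every combination of per-component writings, and the new code shuffles those combinations before truncating so that the kept compounds vary instead of always favoring the first samples. The effect can be sketched with the standard library (component names hypothetical):

    import itertools
    import random

    # Two samples of a left-hand component, three of a right-hand one.
    per_component = [['left1', 'left2'], ['right1', 'right2', 'right3']]

    compounds = [list(c) for c in itertools.product(*per_component)]
    random.shuffle(compounds)   # vary which combinations survive the cap
    compounds = compounds[:2]   # keep at most max_samples compounds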
@@ -395,20 +449,20 @@
             assert False
 
     @classmethod
-    def merge_writing_objects(cls, idsChar, writing_objects):
-        if idsChar not in cls.COMPONENT_TRANSFORMATION:
+    def merge_writing_objects(cls, ids_char, writing_objects):
+        if ids_char not in cls.COMPONENT_TRANSFORMATION:
             raise ValueError("Not supported")
 
         # [u'⿴', u'⿻', u'⿷']
-        assert (CharacterLookup.isBinaryIDSOperator(idsChar) \
+        assert (CharacterLookup.isBinaryIDSOperator(ids_char) \
             and len(writing_objects) == 2) \
-            or (CharacterLookup.isTrinaryIDSOperator(idsChar) \
+            or (CharacterLookup.isTrinaryIDSOperator(ids_char) \
             and len(writing_objects) == 3)
-        assert len(cls.COMPONENT_TRANSFORMATION[idsChar]) \
+        assert len(cls.COMPONENT_TRANSFORMATION[ids_char]) \
             == len(writing_objects)
 
-        transformations = cls.COMPONENT_TRANSFORMATION[idsChar]
+        transformations = cls.COMPONENT_TRANSFORMATION[ids_char]
         # reverse transformations where inner part is written first
-        if idsChar in [u'⿺', u'⿶']:
+        if ids_char in [u'⿺', u'⿶']:
             writing_objects.reverse()
             transformations = transformations[:]
             transformations.reverse()
@@ -419,7 +473,7 @@
             obj = writing_objects[idx].copy()
 
             obj.resize(xrate, yrate)
-            obj.move_rel(dx * obj.get_width(), dy* obj.get_height())
+            obj.move_rel(dx * obj.get_width(), dy * obj.get_height())
             obj.resize(resultingWriting.get_width() / obj.get_width(),
                        resultingWriting.get_height() / obj.get_height())
             for s in obj.get_strokes(True):
@@ -465,7 +519,7 @@
                   default=[],
                   help="File containing characters to exclude")
 parser.add_option("-m", "--max-samples",
-                  type="int", dest="max_samples",
+                  type="int", dest="max_samples", default=1,
                   help="Maximum number of samples per character")
 
@@ -479,6 +533,10 @@
                   help="Character domain of target characters")
 
+parser.add_option("-x", "--no-exact", dest="no_exact_transformation",
+                  action="store_true",
+                  help="Don't use exact transformations" \
+                       + ", use only decompositions")
 parser.add_option("-q", "--quiet", dest="quiet",
                   action="store_true",
                   help="Don't print any statistics")
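For orientation: merge_writing_objects places component writings on the canvas according to an IDS (Ideographic Description Sequence) operator, using per-component (dx, dy, xrate, yrate) entries from COMPONENT_TRANSFORMATION. A minimal sketch of the geometry for the left-right operator ⿰, with hypothetical table values; Writing and the copy/resize/move_rel calls are the tegaki APIs used in the hunks above:

    from tegaki.character import Writing

    # Hypothetical entries: both components keep full height and take half
    # the width; the second is shifted right by one (scaled) width.
    LEFT_RIGHT = [(0, 0, 0.5, 1.0),    # dx, dy, xrate, yrate for left part
                  (1, 0, 0.5, 1.0)]    # right part

    components = [Writing(), Writing()]   # would normally contain strokes
    placed = []
    for writing, (dx, dy, xrate, yrate) in zip(components, LEFT_RIGHT):
        obj = writing.copy()
        obj.resize(xrate, yrate)                                   # scale
        obj.move_rel(dx * obj.get_width(), dy * obj.get_height())  # position
        placed.append(obj)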
diff -Nru tegaki-tools-0.3/src/tegaki-build tegaki-tools-0.3.1/src/tegaki-build
--- tegaki-tools-0.3/src/tegaki-build 2009-09-08 02:25:28.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegaki-build 2010-03-23 07:07:56.000000000 +0000
@@ -24,13 +24,12 @@
 import os
 from optparse import OptionParser
 
-from tegaki.character import CharacterCollection
+from tegaki.charcol import CharacterCollection
 from tegaki.trainer import Trainer, TrainerError
-from tegaki.recognizer import Recognizer
 
-from tegakitools.tomoe import tomoe_dict_to_character_collection
+from tegakitools.charcol import *
 
-VERSION = '0.3'
+VERSION = '0.3.1'
 
 class TegakiBuildError(Exception):
     pass
@@ -40,7 +39,12 @@
     def __init__(self, options, args):
         self._directories = options.directories
         self._charcols = options.charcols
+        self._databases = options.databases
         self._tomoe = options.tomoe
+        self._kuchibue = options.kuchibue
+        self._include = options.include
+        self._exclude = options.exclude
+        self._max_samples = options.max_samples
         self._list = options.list
         if not self._list:
             self._trainer = args[0]
@@ -61,28 +65,26 @@
 
         # read meta file
         try:
-            meta = Recognizer.read_meta_file(self._meta)
+            meta = Trainer.read_meta_file(self._meta)
         except IOError, e:
             raise TegakiBuildError, str(e)
 
-        # add the directories provided
-        for directory in self._directories:
-            charcol += CharacterCollection.from_character_directory(directory)
-
-        # add the character collections provided
-        for charcol_path in self._charcols:
-            _charcol = CharacterCollection()
-            gzip = False; bz2 = False
-            if charcol_path.endswith(".gz"): gzip = True
-            if charcol_path.endswith(".bz2"): bz2 = True
-            _charcol.read(charcol_path, gzip=gzip, bz2=bz2)
-            charcol += _charcol
-
-        # add tomoe dictionaries provided
-        for tomoe in self._tomoe:
-            charcol += tomoe_dict_to_character_collection(tomoe)
+        charcol = get_aggregated_charcol(
+                        ((TYPE_CHARCOL, self._charcols),
+                         (TYPE_CHARCOL_DB, self._databases),
+                         (TYPE_DIRECTORY, self._directories),
+                         (TYPE_TOMOE, self._tomoe),
+                         (TYPE_KUCHIBUE, self._kuchibue)))
+
+
+        charcol.include_characters_from_files(self._include)
+        charcol.exclude_characters_from_files(self._exclude)
+
+        # max samples
+        if self._max_samples:
+            charcol.remove_samples(keep_at_most=self._max_samples)
 
-        if len(charcol.get_all_characters()) == 0:
+        if charcol.get_total_n_characters() == 0:
             raise TegakiBuildError, "No character samples to train!"
 
         trainer = self._get_trainer()
@@ -112,6 +114,7 @@
 parser = OptionParser(usage=usage, version="%prog " + VERSION,
                       description="Train a model")
 
+
 parser.add_option("-d", "--directory",
                   action="append", type="string", dest="directories",
                   default=[],
@@ -120,14 +123,37 @@
                   action="append", type="string", dest="charcols",
                   default=[],
                   help="character collection XML files")
+parser.add_option("-b", "--db",
+                  action="append", type="string", dest="databases",
+                  default=[],
+                  help="character collection XML files")
 parser.add_option("-t", "--tomoe-dict",
                   action="append", type="string", dest="tomoe",
                   default=[],
                   help="Tomoe XML dictionary files")
+parser.add_option("-k", "--kuchibue",
+                  action="append", type="string", dest="kuchibue",
+                  default=[],
+                  help="Kuchibue unipen database")
+
+
 parser.add_option("-l", "--list",
                   action="store_true",dest="list", default=False,
                   help="List available trainers")
 
+
+parser.add_option("-i", "--include",
+                  action="append", type="string", dest="include",
+                  default=[],
+                  help="File containing characters to include")
+parser.add_option("-e", "--exclude",
+                  action="append", type="string", dest="exclude",
+                  default=[],
+                  help="File containing characters to exclude")
+parser.add_option("-m", "--max-samples",
+                  type="int", dest="max_samples",
+                  help="Maximum number of samples per character")
+
 
 (options, args) = parser.parse_args()
 
 try:
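tegaki-build now takes the same repeatable input flags as the other tools. They rely on optparse's "append" action, standard-library behavior that collects each occurrence of a flag into a list:

    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("-c", "--charcol",
                      action="append", type="string", dest="charcols",
                      default=[],
                      help="character collection XML files")

    # '-c a.xml -c b.xml' accumulates into a single list
    options, args = parser.parse_args(["-c", "a.xml", "-c", "b.xml"])
    print options.charcols   # ['a.xml', 'b.xml']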
diff -Nru tegaki-tools-0.3/src/tegaki-convert tegaki-tools-0.3.1/src/tegaki-convert
--- tegaki-tools-0.3/src/tegaki-convert 2009-09-17 17:20:49.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegaki-convert 2010-03-23 07:07:56.000000000 +0000
@@ -24,12 +24,11 @@
 import os
 from optparse import OptionParser
 
-from tegaki.character import CharacterCollection
+from tegaki.charcol import CharacterCollection
 
-from tegakitools.tomoe import tomoe_dict_to_character_collection
-from tegakitools.kuchibue import kuchibue_to_character_collection
+from tegakitools.charcol import *
 
-VERSION = '0.3'
+VERSION = '0.3.1'
 
 class TegakiConvertError(Exception):
     pass
@@ -39,60 +38,31 @@
     def __init__(self, options, args):
         self._directories = options.directories
         self._charcols = options.charcols
+        self._databases = options.databases
        self._tomoe = options.tomoe
         self._kuchibue = options.kuchibue
         self._include = options.include
         self._exclude = options.exclude
         self._max_samples = options.max_samples
 
-        try:
+        if len(args) > 1:
+            raise TegakiConvertError, "tegaki-convert needs only 1 argument"
+        elif len(args) == 1:
             self._output_path = args[0]
-        except:
+        else:
             self._output_path = None
 
     def run(self):
-        charcol = CharacterCollection()
+        charcol = get_aggregated_charcol(
+                        ((TYPE_CHARCOL, self._charcols),
+                         (TYPE_CHARCOL_DB, self._databases),
+                         (TYPE_DIRECTORY, self._directories),
+                         (TYPE_TOMOE, self._tomoe),
+                         (TYPE_KUCHIBUE, self._kuchibue)), self._output_path)
 
-        # add the directories provided
-        for directory in self._directories:
-            charcol += CharacterCollection.from_character_directory(directory)
-
-        # add the character collections provided
-        for charcol_path in self._charcols:
-            _charcol = CharacterCollection()
-            gzip = False; bz2 = False
-            if charcol_path.endswith(".gz"): gzip = True
-            if charcol_path.endswith(".bz2"): bz2 = True
-            _charcol.read(charcol_path, gzip=gzip, bz2=bz2)
-            charcol += _charcol
-
-        # add tomoe dictionaries provided
-        for tomoe in self._tomoe:
-            charcol += tomoe_dict_to_character_collection(tomoe)
-
-        # add the kuchibue databases provided
-        for kuchibue in self._kuchibue:
-            charcol += kuchibue_to_character_collection(kuchibue)
-
-        # characters to include
-        buf = ""
-        for inc_path in self._include:
-            f = open(inc_path)
-            buf += f.read()
-            f.close()
-
-        if len(buf) > 0:
-            charcol.include_characters_from_text(buf)
-
-        # characters to exclude
-        buf = ""
-        for exc_path in self._exclude:
-            f = open(exc_path)
-            buf += f.read()
-            f.close()
-
-        if len(buf) > 0:
-            charcol.exclude_characters_from_text(buf)
+        charcol.include_characters_from_files(self._include)
+        charcol.exclude_characters_from_files(self._exclude)
 
         # max samples
         if self._max_samples:
@@ -103,10 +73,7 @@
             # outputs to stdout if not output path specified
             print charcol.to_xml()
         else:
-            gzip = False; bz2 = False
-            if self._output_path.endswith(".gz"): gzip = True
-            if self._output_path.endswith(".bz2"): bz2 = True
-            charcol.write(self._output_path, gzip=gzip, bz2=bz2)
+            charcol.save(self._output_path)
 
 parser = OptionParser(usage="usage: %prog [options] [output-path]",
                       version="%prog " + VERSION)
@@ -119,6 +86,10 @@
                   action="append", type="string", dest="charcols",
                   default=[],
                   help="character collection XML files")
+parser.add_option("-b", "--db",
+                  action="append", type="string", dest="databases",
+                  default=[],
+                  help="character collection XML files")
 parser.add_option("-t", "--tomoe-dict",
                   action="append", type="string", dest="tomoe",
                   default=[],
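The output side of tegaki-convert shrinks to one charcol.save(path) call. Judging from the extension-sniffing code it replaces and the .chardb handling added to tegakitools/charcol.py further down, save() presumably picks the on-disk format from the file name; a hedged sketch of that assumed usage:

    from tegaki.charcol import CharacterCollection

    charcol = CharacterCollection()
    charcol.save("out.xml")       # plain XML (assumed)
    charcol.save("out.xml.gz")    # gzip-compressed XML (assumed)
    charcol.save("out.chardb")    # SQLite-backed database (assumed)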
diff -Nru tegaki-tools-0.3/src/tegaki-eval tegaki-tools-0.3.1/src/tegaki-eval
--- tegaki-tools-0.3/src/tegaki-eval 2009-10-19 08:58:33.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegaki-eval 2010-03-23 07:07:56.000000000 +0000
@@ -25,12 +25,12 @@
 import time
 from optparse import OptionParser
 
-from tegaki.character import CharacterCollection
+from tegaki.charcol import CharacterCollection
 from tegaki.recognizer import Recognizer, RecognizerError
 
-from tegakitools.tomoe import tomoe_dict_to_character_collection
+from tegakitools.charcol import *
 
-VERSION = '0.3'
+VERSION = '0.3.1'
 
 def harmonic_mean(x1, x2):
     if x1 == 0.0 and x2 == 0.0:
@@ -48,9 +48,15 @@
     def __init__(self, options, args):
         self._verbosity_level = options.verbosity_level
         self._directories = options.directories
+        self._databases = options.databases
         self._charcols = options.charcols
         self._tomoe = options.tomoe
+        self._kuchibue = options.kuchibue
         self._list = options.list
+        self._include = options.include
+        self._exclude = options.exclude
+        self._max_samples = options.max_samples
+
         if not self._list:
             self._recognizer = args[0]
             self._model = args[1]
@@ -67,25 +73,22 @@
                                  in avail_recognizers])
 
     def _recognize(self):
-        charcol = CharacterCollection()
-
-        # add the directories provided
-        for directory in self._directories:
-            charcol += CharacterCollection.from_character_directory(directory)
-
-        # add the character collections provided
-        for charcol_path in self._charcols:
-            _charcol = CharacterCollection()
-            gzip = False; bz2 = False
-            if charcol_path.endswith(".gz"): gzip = True
-            if charcol_path.endswith(".bz2"): bz2 = True
-            _charcol.read(charcol_path, gzip=gzip, bz2=bz2)
-            charcol += _charcol
-
-        # add tomoe dictionaries provided
-        for tomoe in self._tomoe:
-            charcol += tomoe_dict_to_character_collection(tomoe)
+        charcol = get_aggregated_charcol(
+                        ((TYPE_CHARCOL, self._charcols),
+                         (TYPE_CHARCOL_DB, self._databases),
+                         (TYPE_DIRECTORY, self._directories),
+                         (TYPE_TOMOE, self._tomoe),
+                         (TYPE_KUCHIBUE, self._kuchibue)))
+
+
+        charcol.include_characters_from_files(self._include)
+        charcol.exclude_characters_from_files(self._exclude)
+
+        # max samples
+        if self._max_samples:
+            charcol.remove_samples(keep_at_most=self._max_samples)
 
+        # FIXME: don't load all characters in memory
         all_chars = charcol.get_all_characters()
 
         if len(all_chars) == 0:
@@ -279,6 +282,8 @@
 parser.add_option("-v", "--verbosity-level",
                   type="int", dest="verbosity_level", default=0,
                   help="verbosity level between 0 and 2")
+
+
 parser.add_option("-d", "--directory",
                   action="append", type="string", dest="directories",
                   default=[],
@@ -287,14 +292,40 @@
                   action="append", type="string", dest="charcols",
                   default=[],
                   help="character collection XML files")
+parser.add_option("-b", "--db",
+                  action="append", type="string", dest="databases",
+                  default=[],
+                  help="character collection XML files")
 parser.add_option("-t", "--tomoe-dict",
                   action="append", type="string", dest="tomoe",
                   default=[],
                   help="Tomoe XML dictionary files")
+parser.add_option("-k", "--kuchibue",
+                  action="append", type="string", dest="kuchibue",
+                  default=[],
+                  help="Kuchibue unipen database")
+
+
 parser.add_option("-l", "--list",
                   action="store_true",dest="list", default=False,
                   help="List available recognizers and models")
 
+
+parser.add_option("-i", "--include",
+                  action="append", type="string", dest="include",
+                  default=[],
+                  help="File containing characters to include")
+parser.add_option("-e", "--exclude",
+                  action="append", type="string", dest="exclude",
+                  default=[],
+                  help="File containing characters to exclude")
+parser.add_option("-m", "--max-samples",
+                  type="int", dest="max_samples",
+                  help="Maximum number of samples per character")
+
+
+
 (options, args) = parser.parse_args()
 
 try:
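harmonic_mean itself is untouched here; only its first two lines appear as context. As a reminder of what such a helper computes, a sketch consistent with the zero guard above (the exact upstream body is not shown in this diff):

    def harmonic_mean(x1, x2):
        if x1 == 0.0 and x2 == 0.0:
            return 0.0
        return 2 * x1 * x2 / (x1 + x2)

    print harmonic_mean(0.9, 0.5)   # 0.642857..., an F1-style combination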
diff -Nru tegaki-tools-0.3/src/tegaki-stats tegaki-tools-0.3.1/src/tegaki-stats
--- tegaki-tools-0.3/src/tegaki-stats 2009-10-20 05:48:16.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegaki-stats 2010-03-23 07:07:56.000000000 +0000
@@ -24,11 +24,10 @@
 import os
 from optparse import OptionParser
 
-from tegaki.character import CharacterCollection
-
+from tegaki.charcol import CharacterCollection
 from tegakitools.charcol import *
 
-VERSION = '0.3'
+VERSION = '0.3.1'
 
 class TegakiStatsError(Exception):
     pass
@@ -41,6 +40,7 @@
     def __init__(self, options, args):
         self._directories = options.directories
         self._charcols = options.charcols
+        self._databases = options.databases
         self._tomoe = options.tomoe
         self._kuchibue = options.kuchibue
         self._include = options.include
@@ -51,6 +51,7 @@
     def run(self):
         charcol = get_aggregated_charcol(
                 ((TYPE_CHARCOL, self._charcols),
+                 (TYPE_CHARCOL_DB, self._databases),
                  (TYPE_DIRECTORY, self._directories),
                  (TYPE_TOMOE, self._tomoe),
                  (TYPE_KUCHIBUE, self._kuchibue)))
@@ -76,14 +77,16 @@
 
         print "Total number of samples: ", charcol.get_total_n_characters()
 
-        n_samples = [len(samp_by_class[k]) for k in samp_by_class.keys()]
+        print "Total number of strokes: ", charcol.get_total_n_strokes()
+
+        n_samples = samp_by_class.values()
         avg = float(sum(n_samples)) / len(n_samples)
         print "Average number of samples per character/class: %0.2f" % avg
 
         if self._verbosity_level >= 2:
             print "\nNumber of samples for each character:"
-            for utf8, chars in samp_by_class.items():
-                print "%s: %d" % (utf8, len(chars))
+            for utf8, n_chars in samp_by_class.items():
+                print "%s: %d" % (utf8, n_chars)
             print "\n"
 
         classes_by_sc = self._get_classes_by_stroke_count(charcol)
@@ -112,17 +115,17 @@
     def _get_samples_by_class(self, charcol):
         d = {}
         for set_name in charcol.get_set_list():
-            for char in charcol.get_characters(set_name):
-                utf8 = char.get_utf8()
-                d[utf8] = d.get(utf8, []) + [char]
+            for row in charcol.get_character_rows(set_name):
+                utf8 = row['utf8'].encode("utf8")
+                d[utf8] = d.get(utf8, 0) + 1
         return d
 
     def _get_classes_by_stroke_count(self, charcol):
         d = {}
         for set_name in charcol.get_set_list():
-            for char in charcol.get_characters(set_name):
-                n_strokes = char.get_writing().get_n_strokes()
-                utf8 = char.get_utf8()
+            for row in charcol.get_character_rows(set_name):
+                n_strokes = row['n_strokes']
+                utf8 = row['utf8'].encode("utf8")
                 d[n_strokes] = d.get(n_strokes, [])
                 if not utf8 in d[n_strokes]:
                     d[n_strokes].append(utf8)
@@ -131,9 +134,9 @@
     def _get_stroke_counts_by_class(self, charcol):
         d = {}
         for set_name in charcol.get_set_list():
-            for char in charcol.get_characters(set_name):
-                n_strokes = char.get_writing().get_n_strokes()
-                utf8 = char.get_utf8()
+            for row in charcol.get_character_rows(set_name):
+                n_strokes = row['n_strokes']
+                utf8 = row['utf8'].encode("utf8")
                 d[utf8] = d.get(utf8, [])
                 if not n_strokes in d[utf8]:
                     d[utf8].append(n_strokes)
@@ -152,6 +155,10 @@
                   action="append", type="string", dest="charcols",
                   default=[],
                   help="character collection XML files")
+parser.add_option("-b", "--db",
+                  action="append", type="string", dest="databases",
+                  default=[],
+                  help="character collection XML files")
 parser.add_option("-t", "--tomoe-dict",
                   action="append", type="string", dest="tomoe",
                   default=[],
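_get_samples_by_class now returns {utf8: count} built from lightweight get_character_rows() rows instead of {utf8: [Character, ...]} built from fully parsed characters, so the summary statistics come straight from values(). The reporting math with hypothetical counts:

    samp_by_class = {"A": 3, "B": 1}   # hypothetical per-class counts
    n_samples = samp_by_class.values()
    avg = float(sum(n_samples)) / len(n_samples)
    print "Average number of samples per character/class: %0.2f" % avg  # 2.00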
diff -Nru tegaki-tools-0.3/src/tegakitools/charcol.py tegaki-tools-0.3.1/src/tegakitools/charcol.py
--- tegaki-tools-0.3/src/tegakitools/charcol.py 2009-10-19 08:26:23.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegakitools/charcol.py 2010-02-27 04:52:04.000000000 +0000
@@ -20,43 +20,73 @@
 # Contributors to this file:
 # - Mathieu Blondel
 
-from tegaki.character import CharacterCollection
+import os
+
+from tegaki.charcol import CharacterCollection
 
 from tegakitools.tomoe import tomoe_dict_to_character_collection
 from tegakitools.kuchibue import kuchibue_to_character_collection
 
-TYPE_CHARCOL, TYPE_DIRECTORY, TYPE_TOMOE, TYPE_KUCHIBUE = range(4)
+TYPE_CHARCOL, TYPE_CHARCOL_DB, TYPE_DIRECTORY, TYPE_TOMOE, TYPE_KUCHIBUE = \
+range(5)
+
+def _get_charcol(charcol_type, charcol_path):
+    if charcol_type == TYPE_DIRECTORY:
+        # charcol_path is actually a directory here
+        return CharacterCollection.from_character_directory(charcol_path)
+
+    elif charcol_type in (TYPE_CHARCOL, TYPE_CHARCOL_DB):
+        return CharacterCollection(charcol_path)
+
+    elif charcol_type == TYPE_TOMOE:
+        return tomoe_dict_to_character_collection(charcol_path)
 
-def get_aggregated_charcol(tuples):
+    elif charcol_type == TYPE_KUCHIBUE:
+        return kuchibue_to_character_collection(charcol_path)
+
+
+def get_aggregated_charcol(tuples, dbpath=None):
     """
     Create a character collection out of other character collections,
     character directories, tomoe dictionaries or kuchibue databases.
 
-    tuples: a list of tuples (TYPE, list)
+    tuples: a list of tuples (TYPE, path list)
     """
 
-    charcol = CharacterCollection()
-    for typ, files in tuples:
-        if typ == TYPE_DIRECTORY:
-            # files should actually contain a list of directories
-            for d in files:
-                charcol += CharacterCollection.from_character_directory(d)
-
-        elif typ == TYPE_CHARCOL:
-            for charcol_path in files:
-                _charcol = CharacterCollection()
-                gzip = False; bz2 = False
-                if charcol_path.endswith(".gz"): gzip = True
-                if charcol_path.endswith(".bz2"): bz2 = True
-                _charcol.read(charcol_path, gzip=gzip, bz2=bz2)
-                charcol += _charcol
-
-        elif typ == TYPE_TOMOE:
-            for tomoe in files:
-                charcol += tomoe_dict_to_character_collection(tomoe)
-
-        elif typ == TYPE_KUCHIBUE:
-            for kuchibue in files:
-                charcol += kuchibue_to_character_collection(kuchibue)
+    # number of files for each character collection type
+    n_files = [len(t[1]) for t in tuples]
+
+    # we don't need to merge character collections if only one is provided
+    # this can save a lot of time for large collections
+    if sum(n_files) == 1 and dbpath is None:
+        idx = n_files.index(1)
+        return _get_charcol(tuples[idx][0], tuples[idx][1][0])
+
+    if dbpath is not None and dbpath.endswith(".chardb"):
+        if os.path.exists(dbpath):
+            print "%s exists already." % dbpath
+            print "Continuing will modify it..."
+            answer = raw_input("Continue anyway? (y/N)")
+            if answer == "y":
+                print "Overwrite to concatenate collections together " + \
+                      "in a new database"
+                print "Don't overwrite to append new characters or " + \
+                      "filter (-i,-e,-m) existing database"
+                answer = raw_input("Overwrite it? (y/N)")
+                if answer == "y":
+                    os.unlink(dbpath)
+            else:
+                exit()
+
+        charcol = CharacterCollection(dbpath)
+        #charcol.WRITE_BACK = False
+        #charcol.AUTO_COMMIT = True
+    else:
+        charcol = CharacterCollection() # in memory db
+
+    charcols = [_get_charcol(typ, path) \
+                    for typ, paths in tuples for path in paths]
+
+    charcol.merge(charcols)
 
     return charcol
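Two behaviors of the new get_aggregated_charcol are worth spelling out: with exactly one input and no target database the collection is returned directly, skipping the merge, and with a dbpath ending in .chardb the inputs are aggregated straight into an on-disk database, after prompting if the file already exists. A usage sketch with hypothetical paths:

    from tegakitools.charcol import get_aggregated_charcol, TYPE_TOMOE

    # Single input, no dbpath: returned as-is, no merge cost.
    charcol = get_aggregated_charcol(((TYPE_TOMOE, ["handwriting-ja.xml"]),))

    # Same input aggregated into an on-disk SQLite-backed database.
    charcol = get_aggregated_charcol(((TYPE_TOMOE, ["handwriting-ja.xml"]),),
                                     dbpath="all.chardb")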
diff -Nru tegaki-tools-0.3/src/tegakitools/kuchibue.py tegaki-tools-0.3.1/src/tegakitools/kuchibue.py
--- tegaki-tools-0.3/src/tegakitools/kuchibue.py 2009-09-18 09:32:19.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegakitools/kuchibue.py 2010-02-27 04:52:04.000000000 +0000
@@ -26,8 +26,8 @@
 import re
 import os
 
-from tegaki.character import Point, Stroke, Writing, Character, \
-                             CharacterCollection
+from tegaki.character import Point, Stroke, Writing, Character
+from tegaki.charcol import CharacterCollection
 
 from unipen import UnipenParser
 from shiftjis import SHIFT_JIS_TABLE
diff -Nru tegaki-tools-0.3/src/tegakitools/tomoe.py tegaki-tools-0.3.1/src/tegakitools/tomoe.py
--- tegaki-tools-0.3/src/tegakitools/tomoe.py 2009-07-10 11:45:20.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegakitools/tomoe.py 2010-02-27 04:52:04.000000000 +0000
@@ -20,8 +20,8 @@
 # Contributors to this file:
 # - Mathieu Blondel
 
-from tegaki.character import Point, Stroke, Writing, Character, \
-                             CharacterCollection, _XmlBase
+from tegaki.character import Point, Stroke, Writing, Character, _XmlBase
+from tegaki.charcol import CharacterCollection
 
 class TomoeXmlDictionaryReader(_XmlBase):
diff -Nru tegaki-tools-0.3/src/tegakitools/unipen.py tegaki-tools-0.3.1/src/tegakitools/unipen.py
--- tegaki-tools-0.3/src/tegakitools/unipen.py 2009-09-08 13:22:25.000000000 +0100
+++ tegaki-tools-0.3.1/src/tegakitools/unipen.py 2010-02-27 04:52:04.000000000 +0000
@@ -26,8 +26,8 @@
 import re
 import os
 
-from tegaki.character import Point, Stroke, Writing, Character, \
-                             CharacterCollection
+from tegaki.character import Point, Stroke, Writing, Character
+from tegaki.charcol import CharacterCollection
 
 class UnipenEventParser(object):
     """SAX-like event-based parser"""
@@ -147,8 +147,17 @@
     def get_character_collection(self):
         charcol = CharacterCollection()
         assert(len(self._labels) == len(self._characters))
+
+        # group characters with the same label into sets
+        sets = {}
         for i in range(len(self._characters)):
-            self._characters[i].set_utf8(self._labels[i])
-            charcol.add_set(self._labels[i])
-            charcol.append_character(self._labels[i], self._characters[i])
+            utf8 = self._labels[i]
+            self._characters[i].set_utf8(utf8)
+            sets[utf8] = sets.get(utf8, []) + [self._characters[i]]
+
+        charcol.add_sets(sets.keys())
+
+        for set_name, characters in sets.items():
+            charcol.append_characters(set_name, characters)
+
         return charcol
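The unipen change batches the collection API: samples are grouped by label first, then committed with one add_sets() call plus one append_characters() call per distinct label, rather than one add_set/append_character round trip per sample. The grouping step in isolation, with hypothetical stand-in data:

    # labels[i] is the class of characters[i], as in UnipenParser above
    labels = ["a", "b", "a"]
    characters = ["char1", "char2", "char3"]   # stand-ins for Character objects

    sets = {}
    for label, char in zip(labels, characters):
        sets[label] = sets.get(label, []) + [char]

    print sets   # {'a': ['char1', 'char3'], 'b': ['char2']}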