Merge lp:~max-rabkin/ibid/unihan-simp-trad into lp:~ibid-core/ibid/old-trunk-1.6

Proposed by Max Rabkin
Status: Merged
Approved by: Stefano Rivera
Approved revision: 928
Merged at revision: 928
Proposed branch: lp:~max-rabkin/ibid/unihan-simp-trad
Merge into: lp:~ibid-core/ibid/old-trunk-1.6
Diff against target: 123 lines (+35/-14)
1 file modified
ibid/plugins/conversions.py (+35/-14)
To merge this branch: bzr merge lp:~max-rabkin/ibid/unihan-simp-trad
Reviewer Review Type Date Requested Status
Keegan Carruthers-Smith Approve
Jonathan Hitchcock Approve
Stefano Rivera Approve
Review via email: mp+23995@code.launchpad.net

Commit message

Add traditional and simplified variants to unihan information.
Clean up white-space style in unihan.

Description of the change

In the People's Republic of China and Singapore, some Han characters have been simplified. This can cause confusion when these characters are discussed if some participants are familiar with only one version. The Unihan database records the simplified and traditional variants of each character, so this patch adds that information to Ibid's Unicode character information.

Some examples for testing:
U+56FD (国) and U+570B (國) are respectively simplified and traditional versions of each other.
U+4E00 (一) and U+65E5 (日) are each the same in the simplified and traditional scripts, though the former has other variants (which we ignore for now).

To post a comment you must log in.
Revision history for this message
Stefano Rivera (stefanor) wrote :

+ def variant (self):

Whitespace style, please.

+ variant, _ = variant.contents[0].split(None, 1)

I'm not mad about using _ for an ignore as it's often used as a translation function.

lp:~max-rabkin/ibid/unihan-simp-trad updated
928. By Max Rabkin

fix style issues (blame tumbleweed for unrelated changes)

Revision history for this message
Stefano Rivera (stefanor) :
review: Approve
Revision history for this message
Jonathan Hitchcock (vhata) :
review: Approve
Revision history for this message
Keegan Carruthers-Smith (keegan-csmith) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'ibid/plugins/conversions.py'
2--- ibid/plugins/conversions.py 2010-04-13 11:22:13 +0000
3+++ ibid/plugins/conversions.py 2010-04-25 21:17:24 +0000
4@@ -454,7 +454,7 @@
5 return syllable
6
7 class Unihan(object):
8- def __init__ (self, char):
9+ def __init__(self, char):
10 self.char = char
11 url = 'http://www.unicode.org/cgi-bin/GetUnihanData.pl?'
12 params = {'codepoint': self.char.encode('utf8'),
13@@ -466,6 +466,9 @@
14 self.defn = self.soup.find(text='Other Dictionary Data') \
15 .findNext('table')('tr')[1]('td')[0] \
16 .contents[0].strip()
17+ self.variants = self.soup.find(text='Variants')
18+ if self.variants is not None:
19+ self.variants = self.variants.findNext('table')('tr')[1]('td')
20 self.other_data = defaultdict(unicode,
21 ((row('td')[0].contents[0].strip(),
22 row('td')[1].code.contents[0].strip())
23@@ -473,33 +476,51 @@
24 self.soup.find(text='Other Data')
25 .findNext('table')('tr')[1:]))
26
27- def pinyin (self):
28+ def pinyin(self):
29 return map(fix_pinyin_tone, self.phonetic[1].contents[0].lower().split())
30
31- def hangul (self):
32+ def hangul(self):
33 return self.other_data['kHangul'].split()
34
35- def korean_yale (self):
36+ def korean_yale(self):
37 return self.phonetic[5].contents[0].lower().split()
38
39- def korean (self):
40+ def korean(self):
41 return [u'%s [%s]' % (h, y) for h, y in
42 zip(self.hangul(), self.korean_yale())]
43
44- def japanese_on (self):
45+ def japanese_on(self):
46 return self.phonetic[3].contents[0].lower().split()
47
48- def japanese_kun (self):
49+ def japanese_kun(self):
50 return self.phonetic[4].contents[0].lower().split()
51
52- def definition (self):
53+ def definition(self):
54 return self.defn
55
56- def __unicode__ (self):
57+ def variant(self):
58+ if self.variants is None:
59+ return []
60+
61+ msgs = []
62+ for variant, name in ((0, 'simplified'),
63+ (1, 'traditional')):
64+ variant = self.variants[variant].contents[0]
65+ if not isinstance(variant, basestring):
66+ variant, rest = variant.contents[0].split(None, 1)
67+
68+ msgs.append(u'the %(name)s form is %(var)s' %
69+ {'name': name,
70+ 'var': unichr(int(variant[2:], 16))})
71+ return msgs
72+
73+ def __unicode__(self):
74 msgs = []
75 if self.definition():
76 msgs = [u'it means %s' % self.definition()]
77
78+ msgs += self.variant()
79+
80 prons = []
81 for reading, lang in ((self.pinyin, 'pinyin'),
82 (self.korean, 'Korean'),
83@@ -562,7 +583,7 @@
84 r'^(?:unicode|unihan|ascii)\s+'
85 r'([0-9a-f]*(?:[0-9][a-f]|[a-f][0-9])[0-9a-f]*)$|'
86 r'^(?:unicode|unihan|ascii)\s+#?(\d{2,})$')
87- def unichr (self, event, hexcode, hexcode2, deccode):
88+ def unichr(self, event, hexcode, hexcode2, deccode):
89 if hexcode or hexcode2:
90 code = int(hexcode or hexcode2, 16)
91 else:
92@@ -586,7 +607,7 @@
93 info)
94
95 @match(r'^uni(?:code|han)\s+(.)$', 'deaddressed')
96- def ord (self, event, char):
97+ def ord(self, event, char):
98 try:
99 info = self.info(char)
100 except UnassignedCharacter:
101@@ -603,7 +624,7 @@
102 info)
103
104 @match(r'^uni(?:code|han)\s+([a-z][a-z0-9 -]+)$')
105- def fromname (self, event, name):
106+ def fromname(self, event, name):
107 try:
108 char = unicodedata.lookup(name.upper())
109 except KeyError:
110@@ -619,11 +640,11 @@
111
112 # Match any string that can't be a character name or a number.
113 @match(r'^unicode\s+(.*[^0-9a-z#+\s-].+|.+[^0-9a-z#+\s-].*)$', 'deaddressed')
114- def characters (self, event, string):
115+ def characters(self, event, string):
116 event.addresponse(human_join('U+%(code)s %(name)s' % self.info(c)
117 for c in string))
118
119- def info (self, char):
120+ def info(self, char):
121 cat = unicodedata.category(char)
122 if cat == 'Cn':
123 raise UnassignedCharacter

Subscribers

People subscribed via source and target branches