Check program for the transliteration mapping data of Unidecode
This program analyzes the quality of the code-mapping data shipped with Unidecode 0.04.1. Change the directory path that is hard-coded in the program, then run it by typing "python ./unicheck.py" in your shell.
unicheck.py
—
Python Source,
6Kb
ファイルコンテンツ
#!/usr/bin/python
import sys
import os
import glob
# Unicode block listing, one block per line: "<start>-<end> <block name>".
# Start and end are four-digit hex code points, so the listing covers
# U+0000..U+FFFF only.  Code points that fall between two listed blocks have
# no entry here and are reported by the checker as "RANGE NOT FOUND".
rangeText = """
0000-007F Basic Latin
0080-00FF Latin-1 Supplement
0100-017F Latin Extended-A
0180-024F Latin Extended-B
0250-02AF IPA Extensions
02B0-02FF Spacing Modifier Letters
0300-036F Combining Diacritical Marks
0370-03FF Greek and Coptic
0400-04FF Cyrillic
0500-052F Cyrillic Supplement
0530-058F Armenian
0590-05FF Hebrew
0600-06FF Arabic
0700-074F Syriac
0750-077F Arabic Supplement
0780-07BF Thaana
07C0-07FF NKo
0800-083F Samaritan
0900-097F Devanagari
0980-09FF Bengali
0A00-0A7F Gurmukhi
0A80-0AFF Gujarati
0B00-0B7F Oriya
0B80-0BFF Tamil
0C00-0C7F Telugu
0C80-0CFF Kannada
0D00-0D7F Malayalam
0D80-0DFF Sinhala
0E00-0E7F Thai
0E80-0EFF Lao
0F00-0FFF Tibetan
1000-109F Myanmar
10A0-10FF Georgian
1100-11FF Hangul Jamo
1200-137F Ethiopic
1380-139F Ethiopic Supplement
13A0-13FF Cherokee
1400-167F Unified Canadian Aboriginal Syllabics
1680-169F Ogham
16A0-16FF Runic
1700-171F Tagalog
1720-173F Hanunoo
1740-175F Buhid
1760-177F Tagbanwa
1780-17FF Khmer
1800-18AF Mongolian
18B0-18FF Unified Canadian Aboriginal Syllabics Extended
1900-194F Limbu
1950-197F Tai Le
1980-19DF New Tai Lue
19E0-19FF Khmer Symbols
1A00-1A1F Buginese
1A20-1AAF Tai Tham
1B00-1B7F Balinese
1B80-1BBF Sundanese
1C00-1C4F Lepcha
1C50-1C7F Ol Chiki
1CD0-1CFF Vedic Extensions
1D00-1D7F Phonetic Extensions
1D80-1DBF Phonetic Extensions Supplement
1DC0-1DFF Combining Diacritical Marks Supplement
1E00-1EFF Latin Extended Additional
1F00-1FFF Greek Extended
2000-206F General Punctuation
2070-209F Superscripts and Subscripts
20A0-20CF Currency Symbols
20D0-20FF Combining Diacritical Marks for Symbols
2100-214F Letterlike Symbols
2150-218F Number Forms
2190-21FF Arrows
2200-22FF Mathematical Operators
2300-23FF Miscellaneous Technical
2400-243F Control Pictures
2440-245F Optical Character Recognition
2460-24FF Enclosed Alphanumerics
2500-257F Box Drawing
2580-259F Block Elements
25A0-25FF Geometric Shapes
2600-26FF Miscellaneous Symbols
2700-27BF Dingbats
27C0-27EF Miscellaneous Mathematical Symbols-A
27F0-27FF Supplemental Arrows-A
2800-28FF Braille Patterns
2900-297F Supplemental Arrows-B
2980-29FF Miscellaneous Mathematical Symbols-B
2A00-2AFF Supplemental Mathematical Operators
2B00-2BFF Miscellaneous Symbols and Arrows
2C00-2C5F Glagolitic
2C60-2C7F Latin Extended-C
2C80-2CFF Coptic
2D00-2D2F Georgian Supplement
2D30-2D7F Tifinagh
2D80-2DDF Ethiopic Extended
2DE0-2DFF Cyrillic Extended-A
2E00-2E7F Supplemental Punctuation
2E80-2EFF CJK Radicals Supplement
2F00-2FDF Kangxi Radicals
2FE0-2FEF NOT SPECIFIED
2FF0-2FFF Ideographic Description Characters
3000-303F CJK Symbols and Punctuation
3040-309F Hiragana
30A0-30FF Katakana
3100-312F Bopomofo
3130-318F Hangul Compatibility Jamo
3190-319F Kanbun
31A0-31BF Bopomofo Extended
31C0-31EF CJK Strokes
31F0-31FF Katakana Phonetic Extensions
3200-32FF Enclosed CJK Letters and Months
3300-33FF CJK Compatibility
3400-4DBF CJK Unified Ideographs Extension A
4DC0-4DFF Yijing Hexagram Symbols
4E00-9FFF CJK Unified Ideographs
A000-A48F Yi Syllables
A490-A4CF Yi Radicals
A4D0-A4FF Lisu
A500-A63F Vai
A640-A69F Cyrillic Extended-B
A6A0-A6FF Bamum
A700-A71F Modifier Tone Letters
A720-A7FF Latin Extended-D
A800-A82F Syloti Nagri
A830-A83F Common Indic Number Forms
A840-A87F Phags-pa
A880-A8DF Saurashtra
A8E0-A8FF Devanagari Extended
A900-A92F Kayah Li
A930-A95F Rejang
A960-A97F Hangul Jamo Extended-A
A980-A9DF Javanese
AA00-AA5F Cham
AA60-AA7F Myanmar Extended-A
AA80-AADF Tai Viet
ABC0-ABFF Meetei Mayek
AC00-D7AF Hangul Syllables
D7B0-D7FF Hangul Jamo Extended-B
D800-DB7F High Surrogates
DB80-DBFF High Private Use Surrogates
DC00-DFFF Low Surrogates
E000-F8FF Private Use Area
F900-FAFF CJK Compatibility Ideographs
FB00-FB4F Alphabetic Presentation Forms
FB50-FDFF Arabic Presentation Forms-A
FE00-FE0F Variation Selectors
FE10-FE1F Vertical Forms
FE20-FE2F Combining Half Marks
FE30-FE4F CJK Compatibility Forms
FE50-FE6F Small Form Variants
FE70-FEFF Arabic Presentation Forms-B
FF00-FFEF Halfwidth and Fullwidth Forms
FFF0-FFFF Specials
"""


def parseRangeTable(text):
    """Turn a block listing into a list of mutable per-block records.

    Each non-blank line of ``text`` must look like ``SSSS-EEEE name``.  The
    result holds one list per line, ``[start, end, name, 0, 0, 0, 0]``:
    ``start`` and ``end`` are the lower-cased hex strings, ``name`` is the
    block description, and the four trailing zeros are counter slots that
    the rest of the script fills in (entries marked '[?]', entries defined,
    block width, bad percentage).
    """
    table = []
    for line in text.splitlines():
        # Split on whitespace instead of slicing fixed columns: a hard-coded
        # description offset silently eats the first letter of the block
        # name whenever the separator is a single space.
        fields = line.split(None, 1)
        if not fields:
            continue  # blank line, e.g. the one right after the opening quotes
        startLoc, _, endLoc = fields[0].lower().partition('-')
        desc = fields[1].strip() if len(fields) > 1 else ''
        table.append([startLoc, endLoc, desc, 0, 0, 0, 0])
    return table


uniTable = parseRangeTable(rangeText)
# Directory holding Unidecode's per-page tables (x00.py, x01.py, ...).
# The built-in default can be overridden by passing a path as the first
# command-line argument.
dirName = '/Users/retsu/Downloads/Unidecode-0.04.1/unidecode/'
if len(sys.argv) > 1:
    dirName = sys.argv[1]
os.chdir(dirName)
fileNames = [item for item in glob.glob('*.py') if item[0] == 'x']

# Convert the hex bounds once up front instead of once per table line.
rangeBounds = [(int(uniLine[0], 16), int(uniLine[1], 16), uniLine)
               for uniLine in uniTable]

countUnknownTotal = 0   # '[?]' entries that fell inside a known block
countRangeNotFound = 0  # entries whose code point is in no known block
rangeNotFound = []      # the hex codes of those entries
for fileName in fileNames:
    # Everything between the leading 'x' and '.py' is the high part of the
    # code point; taking the whole stem keeps names longer than two hex
    # digits from being truncated.
    pageHex = os.path.splitext(fileName)[0][1:]
    with open(fileName, 'r') as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        # Skip the tuple's opening and closing lines.
        if line.startswith(('data = (', ')')):
            continue
        # Each entry ends with a "# 0xNN" comment giving its low byte.  A
        # line without that marker (blank line, stray comment) is not an
        # entry and must not be counted under a bogus code point.
        marker = line.rfind('# 0x')
        if marker < 0:
            continue
        uCode = pageHex + line[marker + 4:marker + 6]
        vCode = int(uCode, 16)
        rangeFound = False
        for vStart, vEnd, uniLine in rangeBounds:
            if vStart <= vCode <= vEnd:
                rangeFound = True
                uniLine[4] += 1          # entry defined in this block
                if '[?]' in line:
                    uniLine[3] += 1      # ... but marked as unknown
                    countUnknownTotal += 1
        if not rangeFound:
            # Single pre-formatted string: same output on Python 2 and 3.
            print('RANGE NOT FOUND: %s %d %s'
                  % (fileName, i + 1, line.rstrip('\r\n')))
            rangeNotFound.append(uCode)
            countRangeNotFound += 1
# Per-block report.  For every entry of uniTable the columns are:
#   NoDef - table lines in the block that carry the '[?]' placeholder
#   Defnd - table lines whose code point fell inside the block
#   Width - number of code points the block spans
#   Bad%  - NoDef as a percentage of Defnd (100 when nothing is defined)
# Every print call receives one pre-formatted string, which produces the
# same text under Python 2 and is also valid syntax under Python 3.
print('')
print('COUNTING LIST FOR [?] UNKNOWN MARKED CODE')
print('NoDef Defnd Width Bad% Code Range Range Description')
print('----- ----- ----- ---- --------- ' + '-' * 40)
countRangeFound = 0
# The next two totals are accumulated but not printed by this report.
countRangeDefined = 0
countRangeWidth = 0
for uniLine in uniTable:
    uniLine[5] = 1 + int(uniLine[1], 16) - int(uniLine[0], 16)
    if uniLine[4] > 0:
        uniLine[6] = 100 * float(uniLine[3]) / uniLine[4]
    else:
        # Nothing defined for this block at all: report it as fully bad.
        uniLine[6] = 100
    print('%5d %5d %5d %3d%% %s-%s %s' % (
        uniLine[3], uniLine[4], uniLine[5], uniLine[6],
        uniLine[0], uniLine[1], uniLine[2][:40]))
    countRangeFound += uniLine[3]
    countRangeDefined += uniLine[4]
    countRangeWidth += uniLine[5]
print('UNKNOWN TOTAL: %s' % (countUnknownTotal,))
# Sum of the NoDef column, i.e. '[?]' entries located inside a listed block.
print('RANGE FOUND: %s' % (countRangeFound,))
print('RANGE UNKNOWN: %s' % (countRangeNotFound,))
print('RANGE DESCRIPTION NOT FOUND FOR FOLLOWING CODE: %s' % (rangeNotFound,))
# Report which of the 256 possible two-hex-digit page files (x00.py through
# xff.py) were not found in the scanned directory.
allFiles = set('x%02x.py' % page for page in range(256))
missingFiles = allFiles.difference(fileNames)
# One pre-formatted string keeps the output identical on Python 2 and 3.
print('MISSING FILES: %d %s' % (len(missingFiles), sorted(missingFiles)))
