Personal tools
現在位置: ホーム Users retsu My Utilities Check program for transliteration mapping data Unidecode

Check program for transliteration mapping data Unidecode

This program analyzes the quality of the code mapping data in Unidecode 0.04.1. Change the directory path that is hard-coded in the program, then run it by typing "python ./unicheck.py" in your shell.

unicheck.py — Python Source, 6Kb

ファイルコンテンツ

#!/usr/bin/python

import sys
import os
import glob

# Unicode block table, one block per line in the form
# "<start hex>-<end hex>  <block name>".  The listed ranges stop at FFFF and
# have gaps; code points that fall in no listed range are reported as
# "RANGE NOT FOUND" by the scan that follows.
rangeText = """
0000-007F  Basic Latin
0080-00FF  Latin-1 Supplement
0100-017F  Latin Extended-A
0180-024F  Latin Extended-B
0250-02AF  IPA Extensions
02B0-02FF  Spacing Modifier Letters
0300-036F  Combining Diacritical Marks
0370-03FF  Greek and Coptic
0400-04FF  Cyrillic
0500-052F  Cyrillic Supplement
0530-058F  Armenian
0590-05FF  Hebrew
0600-06FF  Arabic
0700-074F  Syriac
0750-077F  Arabic Supplement
0780-07BF  Thaana
07C0-07FF  NKo
0800-083F  Samaritan
0900-097F  Devanagari
0980-09FF  Bengali
0A00-0A7F  Gurmukhi
0A80-0AFF  Gujarati
0B00-0B7F  Oriya
0B80-0BFF  Tamil
0C00-0C7F  Telugu
0C80-0CFF  Kannada
0D00-0D7F  Malayalam
0D80-0DFF  Sinhala
0E00-0E7F  Thai
0E80-0EFF  Lao
0F00-0FFF  Tibetan
1000-109F  Myanmar
10A0-10FF  Georgian
1100-11FF  Hangul Jamo
1200-137F  Ethiopic
1380-139F  Ethiopic Supplement
13A0-13FF  Cherokee
1400-167F  Unified Canadian Aboriginal Syllabics
1680-169F  Ogham
16A0-16FF  Runic
1700-171F  Tagalog
1720-173F  Hanunoo
1740-175F  Buhid
1760-177F  Tagbanwa
1780-17FF  Khmer
1800-18AF  Mongolian
18B0-18FF  Unified Canadian Aboriginal Syllabics Extended
1900-194F  Limbu
1950-197F  Tai Le
1980-19DF  New Tai Lue
19E0-19FF  Khmer Symbols
1A00-1A1F  Buginese
1A20-1AAF  Tai Tham
1B00-1B7F  Balinese
1B80-1BBF  Sundanese
1C00-1C4F  Lepcha
1C50-1C7F  Ol Chiki
1CD0-1CFF  Vedic Extensions
1D00-1D7F  Phonetic Extensions
1D80-1DBF  Phonetic Extensions Supplement
1DC0-1DFF  Combining Diacritical Marks Supplement
1E00-1EFF  Latin Extended Additional
1F00-1FFF  Greek Extended
2000-206F  General Punctuation
2070-209F  Superscripts and Subscripts
20A0-20CF  Currency Symbols
20D0-20FF  Combining Diacritical Marks for Symbols
2100-214F  Letterlike Symbols
2150-218F  Number Forms
2190-21FF  Arrows
2200-22FF  Mathematical Operators
2300-23FF  Miscellaneous Technical
2400-243F  Control Pictures
2440-245F  Optical Character Recognition
2460-24FF  Enclosed Alphanumerics
2500-257F  Box Drawing
2580-259F  Block Elements
25A0-25FF  Geometric Shapes
2600-26FF  Miscellaneous Symbols
2700-27BF  Dingbats
27C0-27EF  Miscellaneous Mathematical Symbols-A
27F0-27FF  Supplemental Arrows-A
2800-28FF  Braille Patterns
2900-297F  Supplemental Arrows-B
2980-29FF  Miscellaneous Mathematical Symbols-B
2A00-2AFF  Supplemental Mathematical Operators
2B00-2BFF  Miscellaneous Symbols and Arrows
2C00-2C5F  Glagolitic
2C60-2C7F  Latin Extended-C
2C80-2CFF  Coptic
2D00-2D2F  Georgian Supplement
2D30-2D7F  Tifinagh
2D80-2DDF  Ethiopic Extended
2DE0-2DFF  Cyrillic Extended-A
2E00-2E7F  Supplemental Punctuation
2E80-2EFF  CJK Radicals Supplement
2F00-2FDF  Kangxi Radicals
2FE0-2FEF  NOT SPECIFIED
2FF0-2FFF  Ideographic Description Characters
3000-303F  CJK Symbols and Punctuation
3040-309F  Hiragana
30A0-30FF  Katakana
3100-312F  Bopomofo
3130-318F  Hangul Compatibility Jamo
3190-319F  Kanbun
31A0-31BF  Bopomofo Extended
31C0-31EF  CJK Strokes
31F0-31FF  Katakana Phonetic Extensions
3200-32FF  Enclosed CJK Letters and Months
3300-33FF  CJK Compatibility
3400-4DBF  CJK Unified Ideographs Extension A
4DC0-4DFF  Yijing Hexagram Symbols
4E00-9FFF  CJK Unified Ideographs
A000-A48F  Yi Syllables
A490-A4CF  Yi Radicals
A4D0-A4FF  Lisu
A500-A63F  Vai
A640-A69F  Cyrillic Extended-B
A6A0-A6FF  Bamum
A700-A71F  Modifier Tone Letters
A720-A7FF  Latin Extended-D
A800-A82F  Syloti Nagri
A830-A83F  Common Indic Number Forms
A840-A87F  Phags-pa
A880-A8DF  Saurashtra
A8E0-A8FF  Devanagari Extended
A900-A92F  Kayah Li
A930-A95F  Rejang
A960-A97F  Hangul Jamo Extended-A
A980-A9DF  Javanese
AA00-AA5F  Cham
AA60-AA7F  Myanmar Extended-A
AA80-AADF  Tai Viet
ABC0-ABFF  Meetei Mayek
AC00-D7AF  Hangul Syllables
D7B0-D7FF  Hangul Jamo Extended-B
D800-DB7F  High Surrogates
DB80-DBFF  High Private Use Surrogates
DC00-DFFF  Low Surrogates
E000-F8FF  Private Use Area
F900-FAFF  CJK Compatibility Ideographs
FB00-FB4F  Alphabetic Presentation Forms
FB50-FDFF  Arabic Presentation Forms-A
FE00-FE0F  Variation Selectors
FE10-FE1F  Vertical Forms
FE20-FE2F  Combining Half Marks
FE30-FE4F  CJK Compatibility Forms
FE50-FE6F  Small Form Variants
FE70-FEFF  Arabic Presentation Forms-B
FF00-FFEF  Halfwidth and Fullwidth Forms
FFF0-FFFF  Specials
"""

# uniTable holds one mutable row per block.  Columns:
#   [0] first code point, lowercase hex string
#   [1] last code point, lowercase hex string
#   [2] block name
#   [3] number of scanned entries in the block marked '[?]'
#   [4] number of scanned entries in the block
#   [5] width of the block (filled in by the report)
#   [6] percentage of '[?]' entries (filled in by the report)
uniTable = []
rangeText = rangeText.splitlines()
for line in rangeText:
    line = line.strip()
    if not line:
        # The literal starts with a newline, so the first line is empty.
        continue
    # Split on whitespace rather than fixed columns so that padding or
    # trailing blanks in the table cannot leak into the fields.
    codeRange, desc = line.split(None, 1)
    startLoc, endLoc = codeRange.lower().split('-')
    # Validate now: a malformed bound raises ValueError here, next to the
    # table, instead of deep inside the scanning loop.
    int(startLoc, 16)
    int(endLoc, 16)
    uniTable.append([startLoc, endLoc, desc, 0, 0, 0, 0])

# Directory holding the Unidecode x??.py mapping tables.  An optional first
# command-line argument overrides the built-in default path.
if len(sys.argv) > 1:
    dirName = sys.argv[1]
else:
    dirName = '/Users/retsu/Downloads/Unidecode-0.04.1/unidecode/'
os.chdir(dirName)
# Only files whose name starts with 'x' are mapping tables.  Sorted so the
# diagnostics below are printed in a stable order.
fileNames = sorted(item for item in glob.glob('*.py') if item.startswith('x'))

# Convert the hex bounds once, not once per data line per range.
rangeBounds = [(int(uniLine[0], 16), int(uniLine[1], 16), uniLine)
               for uniLine in uniTable]
hexDigits = set('0123456789abcdefABCDEF')

countUnknownTotal = 0      # '[?]' entries that fell inside a known range
countRangeNotFound = 0     # entries whose code is in no range of uniTable
rangeNotFound = []         # hex codes of those entries
for fileName in fileNames:
    countUnknown = 0
    inFile = fileName
    # The file name without the leading 'x' and the extension is the high
    # part of every code in the file (e.g. 'x4e.py' -> '4e').
    section = os.path.splitext(inFile)[0][1:]
    if not section or not hexDigits.issuperset(section):
        print('SKIPPING NON-TABLE FILE: %s' % inFile)
        continue
    with open(inFile, 'r') as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        if line[:8] == 'data = (' or line[:1] == ')':
            continue
        # A data line ends with a comment of the form "# 0xNN"; the two
        # digits NN start four characters after the last '#'.
        uPos = line.rfind('#')
        lowByte = line[uPos + 4:uPos + 6]
        if uPos < 0 or len(lowByte) != 2 or not hexDigits.issuperset(lowByte):
            # Blank line, plain comment or anything else without a code
            # marker: not a table entry, so it must not be counted.
            continue
        uCode = section + lowByte
        vCode = int(uCode, 16)
        for vStart, vEnd, uniLine in rangeBounds:
            if vStart <= vCode <= vEnd:
                uniLine[4] = uniLine[4] + 1
                if '[?]' in line:
                    uniLine[3] = uniLine[3] + 1
                    countUnknown = countUnknown + 1
                # The ranges in the table do not overlap, so the first
                # match is the only one.
                break
        else:
            print('RANGE NOT FOUND: %s %s %s'
                  % (inFile, i + 1, line.rstrip('\r\n')))
            rangeNotFound.append(uCode)
            countRangeNotFound = countRangeNotFound + 1
    countUnknownTotal = countUnknownTotal + countUnknown
# Per-range report.  Every print below is a single-argument call, which is
# valid syntax and prints the same text on both Python 2 and Python 3.
#   NoDef = entries marked '[?]'      (uniLine[3])
#   Defnd = entries seen in the files (uniLine[4])
#   Width = number of code points in the range (uniLine[5])
#   Bad%  = 100 * NoDef / Defnd, or 100 when the range has no entries
print('')
print('COUNTING LIST FOR [?] UNKNOWN MARKED CODE')
print('NoDef Defnd Width Bad% Code Range  Range Description')
print('----- ----- ----- ---- --------- ' + '-' * 40)
countRangeFound = 0
countRangeDefined = 0
countRangeWidth = 0
for uniLine in uniTable:
    uniLine[5] = 1 + int(uniLine[1], 16) - int(uniLine[0], 16)
    if uniLine[4] > 0:
        uniLine[6] = 100 * float(uniLine[3]) / uniLine[4]
    else:
        uniLine[6] = 100
    print('%5d %5d %5d %3d%% %s-%s %s'
          % (uniLine[3], uniLine[4], uniLine[5], uniLine[6],
             uniLine[0], uniLine[1], uniLine[2][:40]))
    countRangeFound = countRangeFound + uniLine[3]
    countRangeDefined = countRangeDefined + uniLine[4]
    countRangeWidth = countRangeWidth + uniLine[5]

# NOTE(review): 'RANGE FOUND' prints the sum of the NoDef column, i.e. the
# '[?]' entries that fell inside a listed range.  countRangeDefined and
# countRangeWidth are accumulated above but never printed.
print('UNKNOWN TOTAL: %s' % (countUnknownTotal,))
print('RANGE FOUND: %s' % (countRangeFound,))
print('RANGE UNKNOWN: %s' % (countRangeNotFound,))
print('RANGE DESCRIPTION NOT FOUND FOR FOLLOWING CODE: %s' % (rangeNotFound,))

# One table file is expected per high byte: x00.py ... xff.py.
allFiles = set('x%02x.py' % highByte for highByte in range(256))
missingFiles = allFiles.difference(fileNames)
print('MISSING FILES: %s %s' % (len(missingFiles), sorted(missingFiles)))
ドキュメントアクション