#!/usr/bin/python

import sys
import os
import glob

# Unicode block table for code points 0000-FFFF.  One block per line:
#   <start hex>-<end hex>  <description>
# Both bounds are inclusive.  The table has gaps (e.g. nothing is listed
# between 083F and 0900); codes falling in a gap are reported by the scan
# further down as "RANGE NOT FOUND".
rangeText = """
0000-007F  Basic Latin
0080-00FF  Latin-1 Supplement
0100-017F  Latin Extended-A
0180-024F  Latin Extended-B
0250-02AF  IPA Extensions
02B0-02FF  Spacing Modifier Letters
0300-036F  Combining Diacritical Marks
0370-03FF  Greek and Coptic
0400-04FF  Cyrillic
0500-052F  Cyrillic Supplement
0530-058F  Armenian
0590-05FF  Hebrew
0600-06FF  Arabic
0700-074F  Syriac
0750-077F  Arabic Supplement
0780-07BF  Thaana
07C0-07FF  NKo
0800-083F  Samaritan
0900-097F  Devanagari
0980-09FF  Bengali
0A00-0A7F  Gurmukhi
0A80-0AFF  Gujarati
0B00-0B7F  Oriya
0B80-0BFF  Tamil
0C00-0C7F  Telugu
0C80-0CFF  Kannada
0D00-0D7F  Malayalam
0D80-0DFF  Sinhala
0E00-0E7F  Thai
0E80-0EFF  Lao
0F00-0FFF  Tibetan
1000-109F  Myanmar
10A0-10FF  Georgian
1100-11FF  Hangul Jamo
1200-137F  Ethiopic
1380-139F  Ethiopic Supplement
13A0-13FF  Cherokee
1400-167F  Unified Canadian Aboriginal Syllabics
1680-169F  Ogham
16A0-16FF  Runic
1700-171F  Tagalog
1720-173F  Hanunoo
1740-175F  Buhid
1760-177F  Tagbanwa
1780-17FF  Khmer
1800-18AF  Mongolian
18B0-18FF  Unified Canadian Aboriginal Syllabics Extended
1900-194F  Limbu
1950-197F  Tai Le
1980-19DF  New Tai Lue
19E0-19FF  Khmer Symbols
1A00-1A1F  Buginese
1A20-1AAF  Tai Tham
1B00-1B7F  Balinese
1B80-1BBF  Sundanese
1C00-1C4F  Lepcha
1C50-1C7F  Ol Chiki
1CD0-1CFF  Vedic Extensions
1D00-1D7F  Phonetic Extensions
1D80-1DBF  Phonetic Extensions Supplement
1DC0-1DFF  Combining Diacritical Marks Supplement
1E00-1EFF  Latin Extended Additional
1F00-1FFF  Greek Extended
2000-206F  General Punctuation
2070-209F  Superscripts and Subscripts
20A0-20CF  Currency Symbols
20D0-20FF  Combining Diacritical Marks for Symbols
2100-214F  Letterlike Symbols
2150-218F  Number Forms
2190-21FF  Arrows
2200-22FF  Mathematical Operators
2300-23FF  Miscellaneous Technical
2400-243F  Control Pictures
2440-245F  Optical Character Recognition
2460-24FF  Enclosed Alphanumerics
2500-257F  Box Drawing
2580-259F  Block Elements
25A0-25FF  Geometric Shapes
2600-26FF  Miscellaneous Symbols
2700-27BF  Dingbats
27C0-27EF  Miscellaneous Mathematical Symbols-A
27F0-27FF  Supplemental Arrows-A
2800-28FF  Braille Patterns
2900-297F  Supplemental Arrows-B
2980-29FF  Miscellaneous Mathematical Symbols-B
2A00-2AFF  Supplemental Mathematical Operators
2B00-2BFF  Miscellaneous Symbols and Arrows
2C00-2C5F  Glagolitic
2C60-2C7F  Latin Extended-C
2C80-2CFF  Coptic
2D00-2D2F  Georgian Supplement
2D30-2D7F  Tifinagh
2D80-2DDF  Ethiopic Extended
2DE0-2DFF  Cyrillic Extended-A
2E00-2E7F  Supplemental Punctuation
2E80-2EFF  CJK Radicals Supplement
2F00-2FDF  Kangxi Radicals
2FE0-2FEF  NOT SPECIFIED
2FF0-2FFF  Ideographic Description Characters
3000-303F  CJK Symbols and Punctuation
3040-309F  Hiragana
30A0-30FF  Katakana
3100-312F  Bopomofo
3130-318F  Hangul Compatibility Jamo
3190-319F  Kanbun
31A0-31BF  Bopomofo Extended
31C0-31EF  CJK Strokes
31F0-31FF  Katakana Phonetic Extensions
3200-32FF  Enclosed CJK Letters and Months
3300-33FF  CJK Compatibility
3400-4DBF  CJK Unified Ideographs Extension A
4DC0-4DFF  Yijing Hexagram Symbols
4E00-9FFF  CJK Unified Ideographs
A000-A48F  Yi Syllables
A490-A4CF  Yi Radicals
A4D0-A4FF  Lisu
A500-A63F  Vai
A640-A69F  Cyrillic Extended-B
A6A0-A6FF  Bamum
A700-A71F  Modifier Tone Letters
A720-A7FF  Latin Extended-D
A800-A82F  Syloti Nagri
A830-A83F  Common Indic Number Forms
A840-A87F  Phags-pa
A880-A8DF  Saurashtra
A8E0-A8FF  Devanagari Extended
A900-A92F  Kayah Li
A930-A95F  Rejang
A960-A97F  Hangul Jamo Extended-A
A980-A9DF  Javanese
AA00-AA5F  Cham
AA60-AA7F  Myanmar Extended-A
AA80-AADF  Tai Viet
ABC0-ABFF  Meetei Mayek
AC00-D7AF  Hangul Syllables
D7B0-D7FF  Hangul Jamo Extended-B
D800-DB7F  High Surrogates
DB80-DBFF  High Private Use Surrogates
DC00-DFFF  Low Surrogates
E000-F8FF  Private Use Area
F900-FAFF  CJK Compatibility Ideographs
FB00-FB4F  Alphabetic Presentation Forms
FB50-FDFF  Arabic Presentation Forms-A
FE00-FE0F  Variation Selectors
FE10-FE1F  Vertical Forms
FE20-FE2F  Combining Half Marks
FE30-FE4F  CJK Compatibility Forms
FE50-FE6F  Small Form Variants
FE70-FEFF  Arabic Presentation Forms-B
FF00-FFEF  Halfwidth and Fullwidth Forms
FFF0-FFFF  Specials
"""

# uniTable holds one mutable list per block:
#   [0] start code, lowercase hex string
#   [1] end code (inclusive), lowercase hex string
#   [2] block description, without surrounding whitespace
#   [3] number of entries in the block marked '[?]'   (filled in by the scan)
#   [4] number of entries seen in the block           (filled in by the scan)
#   [5] width of the block in code points             (filled in by the report)
#   [6] percentage of entries marked '[?]'            (filled in by the report)
uniTable = []
# rangeText is rebound from the raw string to its list of lines, as before.
rangeText = rangeText.splitlines()
for line in rangeText:
    # Strip first so stray padding in the table never leaks into the
    # description, and so the blank line after the opening quotes is skipped.
    line = line.strip()
    if not line:
        continue
    # Split on whitespace / '-' instead of fixed columns; a malformed row
    # raises ValueError here instead of producing a silently wrong entry.
    codeRange, desc = line.split(None, 1)
    startLoc, endLoc = codeRange.lower().split('-')
    uniTable.append([startLoc, endLoc, desc, 0, 0, 0, 0])

# Scan every x*.py table in the data directory and, for each entry line,
# find the block of uniTable that contains its code and update that block's
# counters: [4] counts every entry, [3] counts entries whose line contains
# the '[?]' placeholder.
#
# The directory may be given as the first command-line argument; without
# one the original hard-coded location is used.
dirName = sys.argv[1] if len(sys.argv) > 1 \
    else '/Users/retsu/Downloads/Unidecode-0.04.1/unidecode/'
os.chdir(dirName)
fileNames = [item for item in glob.glob('*.py') if item[0] == 'x']

# Convert the hex bounds once instead of once per scanned line.
rangeBounds = [(int(uniLine[0], 16), int(uniLine[1], 16), uniLine)
               for uniLine in uniTable]

countUnknownTotal = 0
countRangeNotFound = 0
rangeNotFound = []
for fileName in fileNames:
    countUnknown = 0
    # Characters 1-2 of the file name supply the leading hex digits of
    # every code in that file.
    pagePrefix = fileName[1:3]
    with open(fileName, 'r') as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        # Opening and closing lines of the tuple literal carry no entry.
        if line[:8] == 'data = (' or line[:1] == ')':
            continue
        # The trailing two hex digits are read from the last '#' comment on
        # the line, four characters past the '#'.
        hashPos = line.rfind('#')
        if hashPos < 0:
            # No comment at all: not an entry line.  Without this guard
            # rfind's -1 would make the slice below pick up unrelated
            # characters and count a bogus code.
            continue
        uCode = pagePrefix + line[hashPos + 4:hashPos + 6]
        try:
            vCode = int(uCode, 16)
        except ValueError:
            # A comment that does not carry a hex code; skip it rather
            # than abort the whole scan.
            continue
        for vStart, vEnd, uniLine in rangeBounds:
            if vStart <= vCode <= vEnd:
                uniLine[4] += 1
                if '[?]' in line:
                    uniLine[3] += 1
                    countUnknown += 1
                # The blocks in the table do not overlap, so the first
                # match is the only one.
                break
        else:
            # Loop finished without a match: the code lies in a gap of
            # the block table.
            print('RANGE NOT FOUND: %s %s %s' % (fileName, i + 1, line[:-1]))
            rangeNotFound.append(uCode)
            countRangeNotFound += 1
    countUnknownTotal += countUnknown
# Print one report row per block and accumulate the column totals.
# Side effect: stores the block width in row[5] and the '[?]' percentage
# in row[6] of every uniTable row.
print('')
print('COUNTING LIST FOR [?] UNKNOWN MARKED CODE')
print('NoDef Defnd Width Bad% Code Range  Range Description')
print('----- ----- ----- ---- --------- ' + '-' * 40)
countRangeFound = countRangeDefined = countRangeWidth = 0
rowFormat = '%5d %5d %5d %3d%% %s-%s %s'
for row in uniTable:
    firstHex, lastHex, label, noDef, defined = row[:5]
    # Both bounds are inclusive, hence the +1.
    width = int(lastHex, 16) - int(firstHex, 16) + 1
    # A block with no entries at all is reported as 100% bad.
    badPercent = (100 * float(noDef) / defined) if defined > 0 else 100
    row[5] = width
    row[6] = badPercent
    print(rowFormat % (noDef, defined, width, badPercent,
                       firstHex, lastHex, label[:40]))
    countRangeFound += noDef
    countRangeDefined += defined
    countRangeWidth += width

# Summary totals, followed by the list of xNN.py tables (NN = 00..ff) that
# were not present among the scanned files.
print('UNKNOWN TOTAL: %s' % (countUnknownTotal,))
print('RANGE FOUND: %s' % (countRangeFound,))
print('RANGE UNKNOWN: %s' % (countRangeNotFound,))
print('RANGE DESCRIPTION NOT FOUND FOR FOLLOWING CODE: %s' % (rangeNotFound,))
digit = list('0123456789abcdef')
allFiles = set()
for highDigit in digit:
    for lowDigit in digit:
        allFiles.add('x%s%s.py' % (highDigit, lowDigit))
missingFiles = allFiles - set(fileNames)
print('MISSING FILES: %s %s' % (len(missingFiles), sorted(missingFiles)))

