Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import re
import sys
import unicodedata
# Lower-case Vietnamese letters grouped by the ASCII letter each group
# collapses to.  The vowel groups begin with the plain letter itself, so the
# resulting tables also contain identity mappings such as "a" -> "a".
_LOWER_GROUPS = (
    (u"aáàảãạâấầẩẫậăắằẳẵặ", "a"),
    (u"eéèẻẽẹêếềểễệ", "e"),
    (u"iíìỉĩị", "i"),
    (u"oóòỏõọôốồổỗộơớờởỡợ", "o"),
    (u"uúùủũụưứừửữự", "u"),
    (u"yýỳỷỹỵ", "y"),
    (u"đ", "d"),
)

# INTAB: every source character; OUTTAB: its replacement at the same index.
INTAB = u"".join(variants for variants, _ in _LOWER_GROUPS)
OUTTAB = "".join(base * len(variants) for variants, base in _LOWER_GROUPS)

# Append the upper-case half of both tables (positions stay aligned).
INTAB += INTAB.upper()
OUTTAB += OUTTAB.upper()

# Lookup table from source character to replacement character.
replaces_dict = dict(zip(INTAB, OUTTAB))

# Regex alternation matching any single character listed in INTAB.
r = re.compile("|".join(INTAB))
def remove_diacritic(utf8_str):
    """Return *utf8_str* with Vietnamese accented letters made plain.

    Every character matched by the module-level pattern ``r`` is replaced by
    its entry in ``replaces_dict``; all other characters are left as they
    are.

    Text input is first normalized to Unicode NFC.  The lookup tables only
    contain precomposed characters, so without this step a letter written
    with separate combining marks (for example ``e`` + U+0302 + U+0323, or
    ``ê`` + U+0323) would keep its combining marks in the output.  As a side
    effect, any other decomposed sequence in the input is composed as well.

    Args:
        utf8_str: The text to process (a unicode/text string).

    Returns:
        The text with the mapped characters replaced.
    """
    # Only normalize real text strings: unicodedata.normalize() rejects
    # byte strings, which the plain regex substitution tolerated on Python 2.
    if isinstance(utf8_str, type(u"")):
        utf8_str = unicodedata.normalize("NFC", utf8_str)
    return r.sub(lambda m: replaces_dict[m.group(0)], utf8_str)
def _main(argv):
    """Read a UTF-8 file, strip diacritics, and write the result as UTF-8.

    Args:
        argv: Command-line arguments; ``argv[1]`` is the input path and
            ``argv[2]`` is the output path.  Extra arguments are ignored.

    Returns:
        Process exit status: 0 on success, 2 when a path is missing.
    """
    if len(argv) < 3:
        sys.stderr.write("usage: %s INPUT_FILE OUTPUT_FILE\n" % argv[0])
        return 2

    # Read the whole input before opening the output: opening with 'w'
    # truncates the file, which would wipe the data when both paths are the
    # same.  The ``with`` blocks close the files even if an error occurs.
    with codecs.open(argv[1], 'r', 'utf-8') as fi:
        text = fi.read()

    # Note: tone marks stored as separate combining characters
    # (sac U+0301, huyen U+0300, hoi U+0309, nga U+0303, nang U+0323)
    # get no extra handling in this script body; the text is handed to
    # remove_diacritic() exactly as it was read.
    loai_dau_unicode = remove_diacritic(text)

    with codecs.open(argv[2], 'w', 'utf-8') as fo:
        fo.write(loai_dau_unicode)
    return 0


if __name__ == '__main__':
    sys.exit(_main(sys.argv))