Files
Python_CookBook_repo/2.字符串和文本/9.将unicode文本进行统一规范表示.py
2025-09-10 16:12:45 +08:00

38 lines
1.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import unicodedata
def _strip_combining_marks(text):
    """Return *text* with every combining mark (tilde, accent, ...) removed.

    The text is first decomposed with NFD so that each precomposed character
    (for example U+00F1) becomes a base character followed by its combining
    marks. ``unicodedata.combining()`` returns a non-zero canonical combining
    class for such marks, so every character with a truthy result is dropped.
    """
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))


def main():
    """Print a walkthrough of the Unicode normalization forms."""
    # The same visible text can have more than one Unicode representation:
    # s1 uses the precomposed U+00F1, s2 uses 'n' followed by the combining
    # tilde U+0303.
    s1 = 'Spicy Jalape\u00f1o'
    s2 = 'Spicy Jalapen\u0303o'
    print(s1, s2)
    print(s1 == s2)

    # That is a problem when comparing strings, so the text has to be brought
    # into one canonical representation first. There are two canonical forms:
    # NFC (fully composed) and NFD (decomposed into combining characters).
    t1 = unicodedata.normalize('NFD', s1)
    t2 = unicodedata.normalize('NFD', s2)
    print(t1, t2)
    print(t1 == t2)
    print(ascii(t1))
    t3 = unicodedata.normalize('NFC', s1)
    print(ascii(t3))

    # unicodedata also offers NFKC and NFKD. These add compatibility
    # decomposition, which can split characters such as the ligature below;
    # plain NFD leaves it untouched.
    ligature = '\ufb01'
    print(ligature)
    print(unicodedata.normalize('NFD', ligature))
    print(unicodedata.normalize('NFKC', ligature))
    print(unicodedata.normalize('NFKD', ligature))

    # To strip diacritical marks such as '~', decompose with NFD and then
    # drop every character that unicodedata.combining() reports as a
    # combining character.
    print(unicodedata.normalize('NFD', s1))
    print(_strip_combining_marks(s1))


if __name__ == '__main__':
    main()