2025-09-10:仓库迁移
This commit is contained in:
37
2.字符串和文本/9.将unicode文本进行统一规范表示.py
Normal file
37
2.字符串和文本/9.将unicode文本进行统一规范表示.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import unicodedata
|
||||
|
||||
def strip_combining_marks(text):
    """Return *text* with every Unicode combining character removed.

    The text is first normalized to NFD so that precomposed characters
    (for example U+00F1, n with tilde) are split into a base character
    followed by its combining mark; characters for which
    ``unicodedata.combining()`` reports a non-zero class are then dropped.

    Args:
        text: The string to process.

    Returns:
        The NFD-normalized string without combining characters.
    """
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))


def main():
    """Print a step-by-step demonstration of Unicode normalization."""
    s1 = 'Spicy Jalape\u00f1o'
    s2 = 'Spicy Jalapen\u0303o'

    # The same visible text can have more than one code point
    # representation in Unicode, so the two strings compare unequal.
    print(s1, s2)
    print(s1 == s2)

    # That is a problem when comparing strings, so the text has to be
    # normalized to one canonical representation first.

    # Two canonical forms exist: NFC (fully composed) and NFD (decomposed,
    # using combining characters).
    t1 = unicodedata.normalize('NFD', s1)
    t2 = unicodedata.normalize('NFD', s2)
    print(t1, t2)
    print(t1 == t2)
    print(ascii(t1))

    t3 = unicodedata.normalize('NFC', s1)
    print(ascii(t3))

    # unicodedata also offers NFKC and NFKD. These add compatibility
    # handling and can split a character such as the ligature below, which
    # the canonical form NFD leaves untouched.
    s = '\ufb01'
    print(s)
    print(unicodedata.normalize('NFD', s))
    print(unicodedata.normalize('NFKC', s))
    print(unicodedata.normalize('NFKD', s))

    # To remove the tilde, decompose with NFD and then drop the combining
    # characters.
    decomposed = unicodedata.normalize('NFD', s1)
    print(decomposed)
    # strip_combining_marks() uses unicodedata.combining() to decide
    # whether a character is a combining character.
    print(strip_combining_marks(s1))


if __name__ == '__main__':
    main()
|
Reference in New Issue
Block a user