用戶:P-bot/字詞轉換代碼標準化
/pywikipedia/user-fixes.py
# -*- coding: utf-8 -*-
# @filename:user-fixes.py
import re
# Pattern for a variant rule list: "code:text" pairs separated by ';', with an
# optional trailing ';' (for example "zh-cn:foo; zh-tw:bar;").
# NOTE(review): getOrderVariants uses convreg.match(), which is not anchored at
# the end, so any string that merely starts with one "code:" pair is accepted
# -- confirm this is intended.
convreg=re.compile(ur"\s*[a-zA-Z-]+\s*:\s*[^;]*(;\s*[a-zA-Z-]+\s*:\s*[^;]+)*;?\s*")
# Variant codes in canonical output order; other language codes are listed
# separately, after these (see getOrderVariants).
orderedvars=['zh','zh-hans','zh-hant','zh-cn','zh-tw','zh-hk','zh-sg']
# @return list [('zh',zhWord),('zh-hans',zhHansWord)......] ordered
def getOrderVariants(varscode):
    """Parse a 'code:text;code:text' rule list into ordered (code, text) pairs.

    If varscode does not start like a rule list (convreg does not match), it
    is returned unchanged as a string.  Otherwise the result is a list of
    (code, text) tuples: first the codes named in orderedvars, in that order,
    then every other code in the order it first appeared in varscode.

    Chunks without a ':' are ignored; when a code occurs more than once the
    last text wins.  Codes and texts are stripped of surrounding whitespace.
    """
    if not convreg.match(varscode):
        return varscode
    variants = {}
    # Remember first-appearance order so that codes outside orderedvars come
    # out deterministically instead of in arbitrary dict order.
    appearance = []
    for chunk in varscode.split(';'):
        pair = chunk.split(':', 1)
        if len(pair) != 2:
            continue
        key = pair[0].strip()
        if key not in variants:
            appearance.append(key)
        variants[key] = pair[1].strip()
    ordered = [(key, variants[key]) for key in orderedvars if key in variants]
    ordered.extend((key, variants[key])
                   for key in appearance if key not in orderedvars)
    return ordered
# @return tuple: (string flags, list variants)
# variants see return of getOrderVariants function
def parseRulesCode(text,sep='|'):
    """Split conversion code into its flags and its rules.

    text is cut at the first sep; the part before it (stripped) is the flags
    and the part after it is the rules.  Without a sep the flags are ''.
    Text starting with '\\r\\n{|' and code flagged 'R' are returned with the
    flags 'R' and the rules left as an unparsed string; in every other case
    the rules are passed through getOrderVariants.
    """
    if text.startswith('\r\n{|'):
        return 'R', text
    try:
        pieces = text.split(sep, 1)
    except ValueError:
        pieces = [text]
    if len(pieces) == 2:
        flags = pieces[0].strip()
        rules = pieces[1]
        if flags == 'R':
            return 'R', rules
    else:
        flags, rules = '', text
    return flags, getOrderVariants(rules)
def getConvCodeStr(texts,codes):
    """Re-assemble wikitext from plain-text pieces and conversion codes.

    For every entry of codes, texts[i] is emitted followed by the rendered
    code (see getRulesStr).  Codes whose flag is 'raw' are emitted bare; all
    others are wrapped in '-{' ... '}-'.  Entries of texts beyond len(codes)
    are not emitted.
    """
    pieces = []
    for i, code in enumerate(codes):
        if code[0] == 'raw':
            opener, closer = '', ''
        else:
            opener, closer = '-{', '}-'
        pieces.append(texts[i] + opener + getRulesStr(code) + closer)
    # Join once instead of growing the result string on every iteration.
    return ''.join(pieces)
# split 'xxx -{convcode }- yyy-{convcode }- ......' to a generator:
# @return generator:
# yield <code>('xxx ',CODE1), (' yyy',CODE2),......</code>
# will insert virtual CODE at the end of text
# about CODE format see return of function parseRulesCode
def splitConvCode(text):
    """Yield (plain text, CODE) pairs for each '-{...}-' section of text.

    The plain text is whatever precedes the section; CODE is the parsed
    content of the section (see parseRulesCode).  Text that has no further
    '-{', or whose '-{' is never closed by '}-', is yielded unchanged with
    the virtual code ('raw', '') and ends the generator; the unclosed case
    also prints a syntax-error message.
    """
    while text:
        start = text.find('-{')
        if start < 0:
            # No conversion code left: emit the tail as raw text.
            yield text, ('raw', '')
            return
        inner = text[start + 2:]
        stop = inner.find('}-')
        if stop < 0:
            # Covers both "nothing after '-{'" and "no closing '}-'".
            print (u"有语法错误:-{與}-不匹配")
            yield text, ('raw', '')
            return
        yield text[:start], parseRulesCode(inner[:stop])
        text = inner[stop + 2:]
def getRulesStr(code,sep='|'):
    """Render one parsed CODE tuple (flags, rules) back to its text form.

    * flags == 'raw': the rules are returned as they are.
    * rules is a string: returned prefixed by 'flags' + sep when flags is
      non-empty; the string is stripped unless flags == 'R'.
    * otherwise rules is iterated as (code, text) pairs and rendered as
      'code:text;' items joined by single spaces, with the same prefix.
    """
    flags, rules = code[0], code[1]
    if flags == 'raw':
        return rules
    prefix = flags + sep if flags else ''
    # type(u'') is unicode on Python 2 and str on Python 3; isinstance also
    # accepts subclasses, which the former type(...)== comparison rejected.
    if isinstance(rules, (str, type(u''))):
        return prefix + (rules if flags == 'R' else rules.strip())
    return prefix + ' '.join([vk + ":" + vv + ";" for (vk, vv) in rules])
def findend(sep,str):
    """Return the index of the first top-level occurrence of sep in str.

    An occurrence is skipped while the text before it contains more '{{'
    than '}}' or more '[[' than ']]'; the search then resumes two characters
    further on.  Returns -1 when no such occurrence exists; an occurrence at
    index 0 is returned without any nesting check.
    """
    end = str.find(sep)
    while end > 0:
        head = str[:end]
        in_template = head.count('{{') > head.count('}}')
        in_link = head.count('[[') > head.count(']]')
        if not (in_template or in_link):
            break
        end = str.find(sep, end + 2)
    return end
# @return generator (str paraname, list codes) , ......
def parseTempContent(text):
    """Yield the parsed parameters of a template body.

    text is cut at every top-level '|' (as located by findend); each
    parameter is parsed with parseRulesCode using '=' as the separator, so
    the parameter name ends up in the flags position.
    """
    remaining = text
    while remaining:
        cut = findend('|', remaining)
        if cut >= 0:
            para = remaining[:cut]
            remaining = remaining[cut + 1:]
        else:
            # Last parameter: no further top-level '|'.
            para, remaining = remaining, ''
        yield parseRulesCode(para, '=')
def getTempStr(tname,paras):
    """Render a template call '{{tname|para|...}}' from parsed parameters.

    * paras is a string: returned unchanged.
    * tname == 'none': returns ''.
    * otherwise every entry of paras is rendered with getRulesStr using '='
      as the separator and prefixed by '|'.  For tname == 'noteTA' each
      parameter, the closing '}}' and the end of the template are
      additionally preceded/followed by a newline.
    """
    # type(u'') is unicode on Python 2 and str on Python 3; isinstance also
    # accepts subclasses, which the former type(...)== comparison rejected.
    if isinstance(paras, (str, type(u''))):
        return paras
    if tname == 'none':
        return ''
    cr = u'\n' if tname == 'noteTA' else u''
    params = ''.join(cr + '|' + getRulesStr(code, sep='=') for code in paras)
    return u'{{' + tname + params + cr + '}}' + cr
# split 'xxx {{tempname|tempcode }} yyy {{tempname|tempcode }} ......' to a generator:
# @return generator :
# yield <code>{'text':'xxx ','tname':tempname,'temp':PARAS1} , {'text':'yyy ','tname':tempname,'temp':PARAS2} ,......</code>
# about PARAS format see return of function parseTempContent
def splitTemp(text,cmd):
#根据cmd分派
if cmd.startswith(('noteTA','safemerge','merge')):
creg=re.compile( ur'\{\{\s*[Nn]oteTA\s*\|' )
tname=u'noteTA'
elif cmd==u'CItem':
creg=re.compile( ur'\{\{\s*[Cc]Item\s*\|' )
tname=u'CItem'
else:
yield { 'text':text, 'tname':'none', 'temp':[] }
return
#分派结束
while(text):
hasTemp=False
try:
txt,rest=creg.split(text,1)
except ValueError:
txt,text=text,''
else:
if rest!='':
end=findend("}}",rest)
if end<0:
print ("wiktext有语法错误")
txt,text=text,''
else:
hasTemp=True
parastr,text=rest[:end].strip(),rest[(end+2):]
temp=list(parseTempContent(parastr))
else:
print ("wiktext有语法错误")
txt,text=text,''
#print txt
yield { 'text':txt, 'tname':tname, 'temp':temp } if hasTemp \
else { 'text':txt, 'tname':'none', 'temp':[] }
def getOneConv(rules):
    """Pick a single display text for a conversion rule.

    A string is returned unchanged.  For a list of (code, text) pairs the
    text of the first pair whose code is in orderedvars is returned.  If no
    code is known, the text of the first pair is used, or '' for an empty
    list, so that callers always get a string they can concatenate
    (previously None was returned implicitly in that case).
    """
    # type(u'') is unicode on Python 2 and str on Python 3.
    if isinstance(rules, (str, type(u''))):
        return rules
    fallback = None
    for vk, vv in rules:
        if vk in orderedvars:
            return vv
        if fallback is None:
            fallback = vv
    return '' if fallback is None else fallback
def mergeNoteTA(merge,item):
    """Fold one splitTemp segment into the two-element accumulator merge.

    merge[0] collects the template parameters of every segment and is named
    'noteTA' as soon as one segment carried that template; merge[1] collects
    the plain text.  merge is modified in place and returned, which lets the
    function serve as the reducer passed to reduce().
    """
    collected, body = merge[0], merge[1]
    collected['temp'].extend(item['temp'])
    body['text'] = body['text'] + item['text']
    if item['tname'] == 'noteTA':
        collected['tname'] = 'noteTA'
    return merge
def parseConversion(text,cmd):
    """Normalise the conversion markup of a whole page text.

    cmd is one of:
      'noteTA'    - merge all noteTA templates into one at the top and
                    clean up its rules;
      'safemerge' - additionally move -{A|...}- and -{T|...}- codes into
                    that noteTA;
      'merge'     - like 'safemerge', and also move flag-less -{...}- codes;
      'CItem'     - re-render CItem templates and conversion codes only.
    Any other cmd returns text unchanged.
    """
    if cmd not in ('noteTA','CItem','merge','safemerge'):
        return text
    to_noteta = cmd.startswith(('noteTA','safemerge','merge'))

    # Separate plain text from -{...}- codes; the codes are replaced by a
    # placeholder tag while the templates are processed.
    plain_parts, conv_codes = [], []
    for plain, code in splitConvCode(text):
        plain_parts.append(plain)
        conv_codes.append(code)
    convtag = '<convcode/>'
    segments = splitTemp(convtag.join(plain_parts), cmd)
    if to_noteta:
        # Merge several noteTA templates into one and put it at the top.
        segments = reduce(mergeNoteTA, segments, [
            {'text':'','tname':'none','temp':[]},
            {'text':'','tname':'none','temp':[]} ])
    else:
        segments = list(segments)

    if cmd in ('safemerge', 'merge'):
        # Merge -{A|}- into noteTA (with cmd=='merge' also merge -{}-).
        also_flagless = (cmd == 'merge')
        segments[0]['temp'] += [
            code for code in conv_codes
            if code[0] in ('A', 'T') or (also_flagless and code[0] == '')]
        if segments[0]['temp']:
            segments[0]['tname'] = 'noteTA'
        replaced = []
        for code in conv_codes:
            if code[0] == 'A' or (also_flagless and code[0] == ''):
                replaced.append(('raw', getOneConv(code[1])))
            elif code[0] == 'T':
                replaced.append(('raw', ''))
            else:
                replaced.append(code)
        conv_codes = replaced

    if to_noteta:
        # Remove duplicates / renumber / drop empty parameters.
        cleaned = []
        seen = ['']
        title_pending = True
        counter = 0
        for rule in segments[0]['temp']:
            flag = rule[0]
            if flag == 'T' and title_pending:
                # Only the first title rule is moved to the front.
                title_pending = False
                cleaned.insert(0, rule)
            elif flag == 'A' or flag == '' or re.match(r'\s*\d+\s*', flag):
                key = getRulesStr(('', rule[1])).strip()
                if key not in seen:
                    seen.append(key)
                    counter += 1
                    cleaned.append(('%d' % counter, rule[1]))
            else:
                cleaned.append(rule)
        segments[0]['temp'] = cleaned

    rendered = ''.join(
        [seg['text'].strip() + getTempStr(seg['tname'], seg['temp'])
         for seg in segments])
    return getConvCodeStr(rendered.split(convtag), conv_codes).strip()
# Command dispatcher: the returned callable can be used as the second element
# of an entry in a fix's 'replacements' list.
def fixConversion(cmd):
    """Return a regex replacement callback that runs parseConversion with cmd.

    The callback receives a match object and returns the converted form of
    the matched text.  It uses m.group(0) rather than m.string so that a
    pattern matching only part of the subject does not get the whole subject
    substituted for it; for the whole-page pattern used by the fixes in this
    file both are the same text.
    """
    def replacement(m):
        return parseConversion(m.group(0), cmd)
    return replacement
# Fix 'zh-conversion': passes the whole page text (the '[\s\S]+' pattern) to
# parseConversion with the 'noteTA' command.
fixes['zh-conversion'] = {
'regex': True,
'msg': {
'zh': u'机器人:字词转换代码标准化',
},
'replacements': [
# STEP 1: SORT
(ur'[\s\S]+', fixConversion('noteTA') ),
],
}
fixes['zh-conversion-to-noteta'] = {
'regex': True,
'msg': {
'zh': u'机器人:字词转换代码NoteTA化(全自动安全模式)',
},
'replacements': [
# STEP 1: replace to standard noteTA template
(ur'\{\{\s*([nN]ote[tT]?[aA][tT]?|[tT]A)(?![a-zA-Z0-9]*)\s*', ur'{{NoteTA'),
# STEP 2: to NOTETA
(ur'[\s\S]+', fixConversion('safemerge')),
],
}
# !!! this fix is unsafe!!!
# replace.py -page:珍妮佛·康納莉 -fix:zh-conversion-to-noteta-unsafe
fixes['zh-conversion-to-noteta-unsafe'] = {
'regex': True,
'msg': {
'zh': u'机器人:字词转换代码NoteTA化(半人工非安全模式)',
},
'replacements': [
# STEP 1: replace to standard noteTA template
(ur'\{\{\s*([nN]ote[tT]?[aA][tT]?|[tT]A)(?![a-zA-Z0-9]*)\s*', ur'{{NoteTA'),
# STEP 2: to NOTETA
(ur'[\s\S]+', fixConversion('merge')),
],
}