Utilizador:Giro720/match and split.py
Syntax: python match_and_split.py -page:pagename
Dependencies: pywikibot, the DjVuLibre library (on the system PATH), and the djvu file in the same directory as the script.
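Example (a hedged sketch; "Obra.djvu" and "NomeDaPagina" are placeholder names, not taken from the script): the page to be processed must carry the tag that do_match() looks for, optionally with a step value,

==__MATCH__:[[Página:Obra.djvu/5]]==
==__MATCH__:[[Página:Obra.djvu/5|step=2]]==

followed by the text to align; the script is then invoked as

python match_and_split.py -page:NomeDaPagina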
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Do match and split on Wikisource text.
Based on phetools by thomasv1 at gmx dot de, phe
"""
#
# GPL V2, author thomasv1 at gmx dot de, phe
#
import sys
import re
import difflib
import time
import pywikibot
from pywikibot.tools.djvu import DjVuFile
class MatchSplit:
def __init__(self, page):
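        """Connect to pt.wikisource and fetch the text of the page to process."""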
self.user = "Giro_bot"
self.prefix = u'Página'
self.text = ''
self.codelang = 'pt'
self.djvu_name = '.'
self.mysite = pywikibot.Site(self.codelang, 'wikisource')
self.pagename = page
self.page = pywikibot.Page(self.mysite, page)
        try:
            self.text = self.page.get()
        except pywikibot.Error:
            raise Exception("failed to get page")
def unquote_text_from_djvu(self, text):
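        """Undo the backslash escapes found in a dumped DjVu text layer."""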
text = text.replace('\\n', '\n')
text = text.replace('\\"', '"')
text = text.replace('\\\\', '\\')
text = text.replace('\\037', '\n')
text = text.replace('\\035', '')
text = text.replace('\\013', '')
text = text.rstrip('\n')
return text
def extract_djvu_text(self):
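        """Fill self.cached_text with one string per page, using DjVuFile
        for .djvu files and PyMuPDF (fitz) for .pdf files.
        """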
print("extracting text layer")
if 'djvu' in self.djvu_name.lower():
self.djvu_file = DjVuFile(self.djvu_name.replace(" ", "_"))
            numpages = self.djvu_file.number_of_images()
            self.cached_text = []
            # page numbers are 1-based, so include the last page
            for page_number in range(1, numpages + 1):
text = self.djvu_file.get_page(page_number)
self.cached_text.append(self.unquote_text_from_djvu(text))
                # optionally stop early once enough text has been read:
                # if 1.1 * len(self.text) < sum(len(t) for t in self.cached_text):
                #     break
elif 'pdf' in self.djvu_name.lower():
import fitz # this is pymupdf
self.djvu_file = fitz.open(self.djvu_name.replace(" ", "_"))
self.cached_text = []
for page in self.djvu_file:
                text = page.get_text()  # getText() is the deprecated PyMuPDF spelling
self.cached_text.append(self.unquote_text_from_djvu(text))
                # optionally stop early once enough text has been read:
                # if 1.1 * len(self.text) < sum(len(t) for t in self.cached_text):
                #     break
if not self.cached_text:
raise Exception("unable to read djvu, if the File: exists, please retry")
def match(self):
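        """Align self.text against the OCR text layer.

        Walks the cached pages with difflib.SequenceMatcher, cutting the
        wiki text at each detected page boundary and inserting a
        ==[[Página:file/number]]== separator (wrapped in <poem> tags when
        inside a poem).
        """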
s = difflib.SequenceMatcher()
offset = 0
output = ""
is_poem = False
last_page = self.cached_text[self.number - int((self.step + 1) / 2)]
for pagenum in range(self.number, min(self.number + 1000, len(self.cached_text)), self.step):
if pagenum - self.number == 10 and offset == 0:
raise Exception("error : could not find a text layer.")
page1 = last_page
last_page = page2 = self.cached_text[pagenum + int(self.step / 2)]
text1 = page1 + page2
text2 = self.text[offset:offset + int(1.5 * len(text1))]
p = re.compile(r'[\W]+', re.U)
fp = re.compile(r'([\W]+)', re.U)
ftext1 = fp.split(text1)
ftext2 = fp.split(text2)
page1 = p.split(page1)
text1 = p.split(text1)
text2 = p.split(text2)
s.set_seqs(text1, text2)
mb = s.get_matching_blocks()
if len(mb) < 2:
print("LEN(MB) < 2, breaking")
break
ccc = mb[-2]
ratio = s.ratio()
if ratio < 0.1:
print("low ratio", ratio)
break
mstr = ""
overflow = False
for i in range(ccc[0] + ccc[2]):
matched = False
for m in mb:
if i >= m[0] and i < m[0] + m[2]:
matched = True
if i >= len(page1):
overflow = True
break
if not overflow:
ss = ftext1[2 * i]
if matched:
ss = "\033[1;32m%s\033[0;49m" % ss
if 2 * i + 1 < len(ftext1):
mstr = mstr + ss + ftext1[2 * i + 1]
mstr = ""
no_color = ""
overflow = False
for i in range(ccc[1] + ccc[2]):
matched = False
for m in mb:
if i >= m[1] and i < m[1] + m[2]:
matched = True
if m[0] + i - m[1] >= len(page1):
overflow = True
break
if not overflow:
ss = ftext2[2 * i]
if matched:
ss = "\033[1;31m%s\033[0;49m" % ss
if 2 * i + 1 < len(ftext2):
mstr = mstr + ss + ftext2[2 * i + 1]
no_color = no_color + ftext2[2 * i] + ftext2[2 * i + 1]
if is_poem:
sep = "\n</poem>\n==[[" + self.prefix + ":%s/%d]]==\n<poem>\n" % (self.djvu_name, pagenum)
else:
sep = "\n==[[" + self.prefix + ":%s/%d]]==\n" % (self.djvu_name, pagenum)
            # Move the end of the last page to the start of the next page
            # if the end of the last page looks like a paragraph start; the
            # 16-char width used to detect that is a guessed value.
no_color = no_color.rstrip()
match = re.match("(?ms).*(\n\n.*)$", no_color)
if match and len(match.group(1)) <= 16:
no_color = no_color[:-len(match.group(1))]
else:
match = re.match("(?ms).*(\n\w+\W*)$", no_color)
if match:
no_color = no_color[:-(len(match.group(1)) - 1)]
offset += len(no_color)
if no_color and no_color[0] == '\n':
no_color = no_color[1:]
no_color = no_color.lstrip(' ')
output += sep + no_color
if no_color.rfind("<poem>") > no_color.rfind("</poem>"):
is_poem = True
elif no_color.rfind("<poem>") < no_color.rfind("</poem>"):
is_poem = False
if offset != 0 and self.text[offset:]:
if len(self.text) - offset >= 16:
output += "\n=== no match ===\n"
output += self.text[offset:].lstrip(' ')
if offset == 0:
output = ""
if output == "":
raise Exception("text does not match")
else:
return output
def do_match(self):
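        """Find the ==__MATCH__:[[Página:file/number|step=n]]== tag,
        read the file name, start page and step from it, then run
        match() and save the matched text back to the page.
        """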
p = re.compile(u"==__MATCH__:\[\[" + self.prefix + ":(.*?)/(\d+)(\|step=(\d+))?\]\]==")
m = re.search(p, self.text)
if m:
self.djvu_name = m.group(1)
self.number = int(m.group(2))
pos = self.text.find(m.group(0))
self.head = self.text[:pos]
self.text = self.text[pos + len(m.group(0)):]
if m.group(4):
try:
self.step = int(m.group(4))
                except ValueError:
                    raise Exception("match tag invalid")
else:
self.step = 1
else:
raise Exception("match tag not found")
pywikibot.output(self.djvu_name + " " + str(self.number) + " " + str(self.step))
self.extract_djvu_text()
data = self.match()
safe_put(self.page, self.head + data, self.user + ": match")
def do_split(self):
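        """Split the matched text at each ==[[Página:...]]== heading.

        Saves every chunk to its Page: namespace page (skipping pages
        already proofread at quality >= 3) and rewrites the work page
        with <pages index=.../> transclusions.
        """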
try:
page = pywikibot.Page(self.mysite, self.pagename)
text = self.page.get()
except:
raise Exception("unable to read page")
        p = re.compile(r'==\[\[(' + self.prefix + r':[^=]+)\]\]==\n')
bl = p.split(text)
titles = '\n'
group = ""
fromsection = ""
tosection = ""
fromsection_page = tosection_page = None
for i in range(int(len(bl) / 2)):
title = bl[i * 2 + 1]
content = bl[i * 2 + 2]
pagetitle = title
content = content.rstrip("\n ")
pl = pywikibot.Page(self.mysite, pagetitle)
            m = re.match(self.prefix + r':(.*?)/(\d+)', pagetitle)
if m:
filename = m.group(1)
pagenum = int(m.group(2))
if not group:
group = filename
pfrom = pagenum
pto = pfrom
else:
if filename != group:
titles = titles + "<pages index=\"%s\" from=%d to=%d header=1/>\n" % (group, pfrom, pto)
group = filename
pfrom = pagenum
pto = pfrom
elif pagenum != pto + 1:
titles = titles + "<pages index=\"%s\" from=%d to=%d header=1/>\n" % (group, pfrom, pto)
group = filename
pfrom = pagenum
pto = pfrom
else:
pto = pagenum
else:
if group:
titles = titles + "<pages index=\"%s\" from=%d to=%d header=1/>\n" % (group, pfrom, pto)
                    group = ""
titles = titles + "{{" + pagetitle + "}}\n"
# prepend br
if content and content[0] == '\n':
content = '<nowiki />\n' + content
# if pl.exists():
# old_text = pl.get()
# refs = pl.getReferences(onlyTemplateInclusion=True)
# numrefs = 0
# for ref in refs:
# numrefs += 1
# # first and last pages : check if they are transcluded
# if numrefs > 0:
# m = re.match("<noinclude>(.*?)</noinclude>(.*)<noinclude>(.*?)</noinclude>", old_text, re.MULTILINE | re.DOTALL)
# if m and (i == 0 or i == (len(bl) / 2 - 1)):
# print("creating sections")
# old_text = m.group(2)
# if i == 0:
# first_part = old_text
# second_part = content
# fromsection = "fromsection=s2 "
# fromsection_page = ref
# else:
# first_part = content
# second_part = old_text
# tosection = "tosection=s1 "
# tosection_page = ref
# content = "<noinclude>" + m.group(1) + "</noinclude><section begin=s1/>" + first_part + "<section end=s1/>\n----\n" \
# + "<section begin=s2/>" + second_part + "<section end=s2/><noinclude>" + m.group(3) + "</noinclude>"
# else:
# m = re.match("<noinclude><pagequality level=\"1\" user=\"(.*?)\" />(.*?)</noinclude>(.*)<noinclude>(.*?)</noinclude>",
# old_text, re.MULTILINE | re.DOTALL)
# if m:
# print("ok, quality 1, first try")
# content = "<noinclude><pagequality level=\"1\" user=\"" + \
# m.group(1) + "\" />" + m.group(2) + "</noinclude>" + content + "<noinclude>" + m.group(4) + "</noinclude>"
# m2 = re.match("<noinclude>\{\{PageQuality\|1\|(.*?)\}\}(.*?)</noinclude>(.*)<noinclude>(.*?)</noinclude>",
# old_text, re.MULTILINE | re.DOTALL)
# if m2:
# # FIXME: shouldn't use an hardcoded name here
# print("ok, quality 1, second try")
# content = "<noinclude><pagequality level=\"1\" user=\"Giro_bot\" />" + \
# m2.group(2) + "</noinclude>" + content + "<noinclude>" + m2.group(4) + "</noinclude>"
# else:
header = '<noinclude><pagequality level="1" user="Giro_bot" />\n\n\n</noinclude>'
footer = '<noinclude></noinclude>'
content = header + content + footer
do_put = True
if pl.exists():
if hasattr(pl, '_quality') and int(pl._quality) >= 3:
print("quality >= 3, not saved")
do_put = False
else:
print("can't get quality level")
if do_put:
safe_put(pl, content, self.user + ": split")
titles = titles + "<pages index=\"%s\" from=%d to=%d header=1/>\n" % (group, pfrom, pto)
if fromsection and fromsection_page:
rtext = fromsection_page.get()
m = re.search("<pages index=\"(.*?)\" from=(.*?) to=(.*?) (fromsection=s2 |) header=1/>", rtext)
if m and m.group(1) == group:
rtext = rtext.replace(m.group(0), m.group(0)[:-2] + "tosection=s1 header=1/>")
print("new rtext")
safe_put(fromsection_page, rtext, self.user + ": split")
if tosection and tosection_page:
rtext = tosection_page.get()
m = re.search("<pages index=\"(.*?)\" from=(.*?) to=(.*?) (tosection=s1 |)/>", rtext)
if m and m.group(1) == group:
rtext = rtext.replace(m.group(0), m.group(0)[:-2] + "fromsection=s2 />")
print("new rtext")
safe_put(tosection_page, rtext, self.user + ": split")
header = bl[0]
safe_put(page, header + titles, self.user + ": split")
def safe_put(page, text, comment):
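    """Save text to page, retrying transient errors up to max_retry times."""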
if re.match("^[\s\n]*$", text):
return
max_retry = 5
retry_count = 0
while retry_count < max_retry:
retry_count += 1
try:
page.put(text, comment=comment)
break
        except pywikibot.LockedPage:
            print("put error : Page %s is locked?!" % page.title(asUrl=True), file=sys.stderr)
            break
        except pywikibot.NoPage:
            print("put error : Page does not exist %s" % page.title(asUrl=True), file=sys.stderr)
            break
        except pywikibot.NoUsername:
            print("put error : No user name on wiki %s" % page.title(asUrl=True), file=sys.stderr)
            break
        except pywikibot.PageNotSaved:
            print("put error : Page not saved %s" % page.title(asUrl=True), file=sys.stderr)
            print("text len: ", len(text), file=sys.stderr)
            print("sleeping for:", 10 * retry_count, file=sys.stderr)
            time.sleep(10 * retry_count)
            continue
        except pywikibot.OtherPageSaveError:
            # this can occur for a read-only DB because of slave lag, so
            # retry a few times
            print("put error : Page not saved %s" % page.title(asUrl=True), file=sys.stderr)
            print("retrying in", retry_count, "minute(s)", file=sys.stderr)
            time.sleep(retry_count * 60)
            continue
        except Exception:
            print("put error: unknown exception", file=sys.stderr)
            time.sleep(10)
            break
    if retry_count >= max_retry:
        print("unable to save page after", max_retry, "tries, bailing out", file=sys.stderr)
def main(*args):
"""
Process command line arguments and invoke bot.
If args is an empty list, sys.argv is used.
@param args: command line arguments
@type args: str
"""
# Parse command line arguments.
    local_args = pywikibot.handle_args(args)
    page = None
    for arg in local_args:
        if arg.startswith('-page:'):
            page = arg[len('-page:'):]
        elif arg.startswith('-djvu:'):
            page = arg[len('-djvu:'):]
else:
pywikibot.output('Unknown argument ' + arg)
# index is mandatory.
if not page:
pywikibot.bot.suggest_help(missing_parameters=['-page'])
return
bot = MatchSplit(page)
bot.do_match()
bot.do_split()
if __name__ == '__main__':
try:
main()
except Exception:
pywikibot.error('Fatal error:', exc_info=True)