Saltar para o conteúdo

Utilizador:Giro720/match and split.py

Wikisource, a biblioteca livre

Sintaxe: python match_and_split.py -page:nomedapagina

Dependências: pywikibot, biblioteca DjVuLibre (no PATH do sistema), PyMuPDF (apenas para arquivos PDF), e o arquivo djvu ou pdf no mesmo diretório do script.


#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Do Split and Match on Wikisource text.
Based on phetools by thomasv1 at gmx dot de, phe
"""
#
# GPL V2, author thomasv1 at gmx dot de, phe
#
import sys
import re
import difflib
import time
import pywikibot
from pywikibot.tools.djvu import DjVuFile


class MatchSplit():
    def __init__(self, page):
        """
        Set up the bot for one wiki page and load its current wikitext.

        @param page: title of the wiki page to process
        @type page: str
        @raise Exception: "failed to get page" when the wikitext cannot be
            read; the underlying error is chained as the cause
        """
        self.user = "Giro_bot"
        # Prefix used when building and parsing the ==[[prefix:file/num]]==
        # page headings.
        self.prefix = u'Página'
        self.codelang = 'pt'
        # Placeholder; do_match() replaces it with the name from the match tag.
        self.djvu_name = '.'
        self.mysite = pywikibot.Site(self.codelang, 'wikisource')
        self.pagename = page
        self.page = pywikibot.Page(self.mysite, page)
        self.text = ''
        # Fetch the text exactly once, inside the guard, so every failure is
        # reported with the same message.
        try:
            self.text = self.page.get()
        except Exception as err:
            raise Exception("failed to get page") from err

    def unquote_text_from_djvu(self, text):
        text = text.replace('\\n', '\n')
        text = text.replace('\\"', '"')
        text = text.replace('\\\\', '\\')
        text = text.replace('\\037', '\n')
        text = text.replace('\\035', '')
        text = text.replace('\\013', '')
        text = text.rstrip('\n')
        return text

    def extract_djvu_text(self):
        print("extracting text layer")
        if 'djvu' in self.djvu_name.lower():
            self.djvu_file = DjVuFile(self.djvu_name.replace(" ", "_"))
            numpages = (1, self.djvu_file.number_of_images())

            self.cached_text = []
            for page_number in range(1, max(numpages)):
                text = self.djvu_file.get_page(page_number)
                self.cached_text.append(self.unquote_text_from_djvu(text))
                # if len(1.1 * self.text < sum(len(self.cached_text))):
                #     break
        elif 'pdf' in self.djvu_name.lower():
            import fitz  # this is pymupdf
            self.djvu_file = fitz.open(self.djvu_name.replace(" ", "_"))
            self.cached_text = []
            for page in self.djvu_file:
                text = page.getText()
                self.cached_text.append(self.unquote_text_from_djvu(text))
                # if len(1.1 * self.text < sum(len(self.cached_text))):
                #     break

        if not self.cached_text:
            raise Exception("unable to read djvu, if the File: exists, please retry")

    def match(self):
        s = difflib.SequenceMatcher()
        offset = 0
        output = ""
        is_poem = False

        last_page = self.cached_text[self.number - int((self.step + 1) / 2)]

        for pagenum in range(self.number, min(self.number + 1000, len(self.cached_text)), self.step):

            if pagenum - self.number == 10 and offset == 0:
                raise Exception("error : could not find a text layer.")

            page1 = last_page
            last_page = page2 = self.cached_text[pagenum + int(self.step / 2)]

            text1 = page1 + page2
            text2 = self.text[offset:offset + int(1.5 * len(text1))]

            p = re.compile(r'[\W]+', re.U)
            fp = re.compile(r'([\W]+)', re.U)
            ftext1 = fp.split(text1)
            ftext2 = fp.split(text2)

            page1 = p.split(page1)
            text1 = p.split(text1)
            text2 = p.split(text2)
            s.set_seqs(text1, text2)

            mb = s.get_matching_blocks()
            if len(mb) < 2:
                print("LEN(MB) < 2, breaking")
                break
            ccc = mb[-2]
            ratio = s.ratio()

            if ratio < 0.1:
                print("low ratio", ratio)
                break
            mstr = ""
            overflow = False
            for i in range(ccc[0] + ccc[2]):
                matched = False
                for m in mb:
                    if i >= m[0] and i < m[0] + m[2]:
                        matched = True
                        if i >= len(page1):
                            overflow = True
                        break
                if not overflow:
                    ss = ftext1[2 * i]
                    if matched:
                        ss = "\033[1;32m%s\033[0;49m" % ss
                    if 2 * i + 1 < len(ftext1):
                        mstr = mstr + ss + ftext1[2 * i + 1]

            mstr = ""
            no_color = ""
            overflow = False
            for i in range(ccc[1] + ccc[2]):
                matched = False
                for m in mb:
                    if i >= m[1] and i < m[1] + m[2]:
                        matched = True
                        if m[0] + i - m[1] >= len(page1):
                            overflow = True
                        break

                if not overflow:
                    ss = ftext2[2 * i]
                    if matched:
                        ss = "\033[1;31m%s\033[0;49m" % ss
                    if 2 * i + 1 < len(ftext2):
                        mstr = mstr + ss + ftext2[2 * i + 1]
                        no_color = no_color + ftext2[2 * i] + ftext2[2 * i + 1]

            if is_poem:
                sep = "\n</poem>\n==[[" + self.prefix + ":%s/%d]]==\n<poem>\n" % (self.djvu_name, pagenum)
            else:
                sep = "\n==[[" + self.prefix + ":%s/%d]]==\n" % (self.djvu_name, pagenum)

            # Move the end of the last page to the start of the next page
            # if the end of the last page look like a paragraph start. 16 char
            # width to detect that is a guessed value.
            no_color = no_color.rstrip()
            match = re.match("(?ms).*(\n\n.*)$", no_color)
            if match and len(match.group(1)) <= 16:
                no_color = no_color[:-len(match.group(1))]
            else:
                match = re.match("(?ms).*(\n\w+\W*)$", no_color)
                if match:
                    no_color = no_color[:-(len(match.group(1)) - 1)]

            offset += len(no_color)

            if no_color and no_color[0] == '\n':
                no_color = no_color[1:]
            no_color = no_color.lstrip(' ')
            output += sep + no_color

            if no_color.rfind("<poem>") > no_color.rfind("</poem>"):
                is_poem = True
            elif no_color.rfind("<poem>") < no_color.rfind("</poem>"):
                is_poem = False

        if offset != 0 and self.text[offset:]:
            if len(self.text) - offset >= 16:
                output += "\n=== no match ===\n"
            output += self.text[offset:].lstrip(' ')

        if offset == 0:
            output = ""

        if output == "":
            raise Exception("text does not match")
        else:
            return output

    def do_match(self):
        """
        Locate the match tag, run the matching and save the result.

        The tag has the form ``==__MATCH__:[[prefix:file/number]]==`` with an
        optional ``|step=N`` after the number. Text before the tag is kept as
        self.head, text after it becomes self.text, and the tag's values are
        stored in self.djvu_name, self.number and self.step (default 1). The
        page is then saved as head + matched text.

        @raise Exception: "match tag not found" when the page has no tag,
            "match tag invalid" when the step is smaller than 1
        """
        tag = re.compile(r"==__MATCH__:\[\[" + re.escape(self.prefix)
                         + r":(.*?)/(\d+)(\|step=(\d+))?\]\]==")
        m = tag.search(self.text)
        if not m:
            raise Exception("match tag not found")

        self.djvu_name = m.group(1)
        self.number = int(m.group(2))
        # Group 4 only ever holds digits, so int() cannot fail; the value
        # still has to be positive to be usable as a range() step in match().
        self.step = int(m.group(4)) if m.group(4) else 1
        if self.step < 1:
            raise Exception("match tag invalid")

        self.head = self.text[:m.start()]
        self.text = self.text[m.end():]

        pywikibot.output(self.djvu_name + " " + str(self.number) + " " + str(self.step))

        self.extract_djvu_text()

        data = self.match()
        safe_put(self.page, self.head + data, self.user + ": match")

    def do_split(self):
        """
        Split the matched text into individual pages and save each of them.

        The page text is cut at every ``==[[prefix:...]]==`` heading. Each
        section is saved under the title named in its heading, wrapped in a
        quality-level-1 header and an empty footer. The original page is then
        rewritten as the text before the first heading followed by one
        ``<pages index=... from=... to=... header=1/>`` tag per run of
        consecutive page numbers of the same file.

        @raise Exception: "unable to read page" when the page text cannot be
            fetched, "no page headings found, nothing to split" when the text
            has no heading
        """
        try:
            page = pywikibot.Page(self.mysite, self.pagename)
            # Read through the freshly built Page object so the text saved by
            # do_match() is fetched, whatever self.page still has cached.
            text = page.get()
        except Exception as err:
            raise Exception("unable to read page") from err

        prefix = re.escape(self.prefix)
        heading_re = re.compile(r'==\[\[(' + prefix + r':[^=]+)\]\]==\n')
        title_re = re.compile(prefix + r':(.*?)/(\d+)')

        # split() with one capture group yields:
        # [leading text, title1, content1, title2, content2, ...]
        bl = heading_re.split(text)
        if len(bl) < 3:
            raise Exception("no page headings found, nothing to split")

        def pages_tag(index, first, last):
            # Transclusion tag for one run of consecutive pages.
            return "<pages index=\"%s\" from=%d to=%d header=1/>\n" % (index, first, last)

        titles = ['\n']
        group = ""      # file name of the run being collected, "" if none
        pfrom = pto = None

        header = '<noinclude><pagequality level="1" user="%s" />\n\n\n</noinclude>' % self.user
        footer = '<noinclude></noinclude>'

        # NOTE: merging new text into already-transcluded first/last pages
        # via <section> tags was disabled in the original code and is not
        # carried over here.
        for pagetitle, content in zip(bl[1::2], bl[2::2]):
            content = content.rstrip("\n ")
            pl = pywikibot.Page(self.mysite, pagetitle)

            m = title_re.match(pagetitle)
            if m:
                filename = m.group(1)
                pagenum = int(m.group(2))
                if group and filename == group and pagenum == pto + 1:
                    # Same file, next page number: extend the current run.
                    pto = pagenum
                else:
                    if group:
                        titles.append(pages_tag(group, pfrom, pto))
                    group = filename
                    pfrom = pto = pagenum
            else:
                # Title without a "/number" part: transclude it directly.
                if group:
                    titles.append(pages_tag(group, pfrom, pto))
                    group = ""
                titles.append("{{" + pagetitle + "}}\n")

            # Prepend <nowiki /> so the page body does not start with a bare
            # newline.
            if content and content[0] == '\n':
                content = '<nowiki />\n' + content

            content = header + content + footer

            do_put = True
            if pl.exists():
                # NOTE(review): a plain pywikibot.Page may never carry a
                # `_quality` attribute, in which case existing pages are
                # always overwritten - confirm and consider ProofreadPage.
                if hasattr(pl, '_quality') and int(pl._quality) >= 3:
                    print("quality >= 3, not saved")
                    do_put = False
                else:
                    print("can't get quality level")
            if do_put:
                safe_put(pl, content, self.user + ": split")

        # Flush the last open run only if there is one; an unconditional tag
        # would be emitted with index="False" or fail on unset page numbers.
        if group:
            titles.append(pages_tag(group, pfrom, pto))

        safe_put(page, bl[0] + "".join(titles), self.user + ": split")


def safe_put(page, text, comment):
    """
    Save text to a page, retrying a few times on transient save errors.

    Nothing is saved when text is empty or whitespace only. Locked pages,
    missing pages, a missing user name and unexpected errors are reported on
    stderr and end the attempt at once. Save errors are retried with an
    increasing delay, up to 5 attempts in total.

    @param page: page object offering put(text, comment=...) and title()
    @param text: new wikitext
    @type text: str
    @param comment: edit summary
    @type comment: str
    """
    if not text.strip():
        return

    max_retry = 5

    for retry_count in range(1, max_retry + 1):
        try:
            page.put(text, comment=comment)
            break
        except pywikibot.LockedPage:
            print("put error : Page %s is locked?!" % page.title(asUrl=True), file=sys.stderr)
            break
        except pywikibot.NoPage:
            print("put error : Page does not exist %s" % page.title(asUrl=True), file=sys.stderr)
            break
        except pywikibot.NoUsername:
            print("put error : No user name on wiki %s" % page.title(asUrl=True), file=sys.stderr)
            break
        except pywikibot.OtherPageSaveError:
            # Must be tested before PageNotSaved, which is its base class,
            # otherwise this handler is never reached.
            # this can occur for read-only DB because slave lag, so retry
            # a few time
            print("put error : Page not saved %s" % page.title(asUrl=True), file=sys.stderr)
            print("retrying in", retry_count, "minute(s)", file=sys.stderr)
            time.sleep(retry_count * 60)
        except pywikibot.PageNotSaved:
            print("put error : Page not saved %s" % page.title(asUrl=True), file=sys.stderr)
            print("text len: ", len(text), file=sys.stderr)
            print("sleeping for:", 10 * retry_count, file=sys.stderr)
            time.sleep(10 * retry_count)
        except Exception as err:
            print("put error: unknown exception", repr(err), file=sys.stderr)
            time.sleep(10)
            break
    else:
        # Reached only when every attempt failed with a retryable error; a
        # success or a deliberate break on the last attempt does not land here.
        print("unable to save page after", max_retry, "try, bailing out", file=sys.stderr)


def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    Runs the match step and then the split step on the page given with
    ``-page:``. Without a page title, the help hint is shown and nothing
    is done.

    @param args: command line arguments
    @type args: str
    """
    # Pre-set so a missing -page argument reaches the help hint below instead
    # of failing on an unbound local.
    page = None

    # Parse command line arguments.
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-page:'):
            page = arg[len('-page:'):]
        elif arg.startswith('-djvu:'):
            # NOTE(review): this option also sets the page title, as it did
            # originally; only the off-by-one slice is corrected. Confirm the
            # intended meaning of -djvu.
            page = arg[len('-djvu:'):]
        else:
            pywikibot.output('Unknown argument ' + arg)

    # The page title is mandatory.
    if not page:
        pywikibot.bot.suggest_help(missing_parameters=['-page'])
        return

    bot = MatchSplit(page)
    bot.do_match()
    bot.do_split()


if __name__ == '__main__':
    # Script entry point: run the bot and hand any exception to pywikibot's
    # error logger (with exc_info=True) instead of letting it propagate.
    try:
        main()
    except Exception:
        pywikibot.error('Fatal error:', exc_info=True)