Utilizador:Giro720/OCRtext.py

Wikisource, a biblioteca livre

Este script necessita que os diretórios de instalação das ferramentas do DjVuLibre (no caso de arquivos djvu) e do Xpdf (para PDFs) estejam definidos no "PATH" do sistema operacional. O script deverá ser executado dentro do diretório raiz do pywikibot.

#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
This bot uploads text from djvu or pdf files onto pages in the "Page"
namespace.  It is intended to be used for pt.wikisource.

The following parameters are supported:

    -ask           Ask for confirmation before uploading each page.
                   (Default: ask when overwriting pages)
    -overwrite:no  When asking for confirmation, the answer is no.
    -overwrite:yes When asking for confirmation, the answer is yes.
                   (Default: ask for the answer)
    -file:...      Filename of the djvu or pdf file
    -index:...     Name of the index page
                   (Default: the base name of the djvu or pdf file)
    -pages:<start>-<end> Page range to upload; <end> is optional

Any other parameter is reported as an unknown argument and is otherwise
ignored.
"""
# adapted from 'djvutext.py' by
# (C) Pywikipedia bot team, 2008-2011
#
# Distributed under the terms of the MIT license.
#
#__version__ = '$Id: 526518c4583bdb30d8cfb08039e76939580efa3e $'
import codecs
import os
import re
import subprocess
import sys

import wikipedia as pywikibot
from pywikibot import i18n
import config

# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {}


class FileTextBot:
    """Upload the text layer of a DjVu or PDF file to wiki "Page" pages.

    The external tools ``djvused``, ``djvudump`` and ``djvutxt`` (DjVu) or
    ``pdfinfo`` and ``pdftotext`` (PDF) are run without a shell, so they
    must be reachable through the PATH.
    """

    # Value subtracted from the scan page number to obtain the number that is
    # written into the page header template.
    page_offset = 10

    # Characters accepted as part of a word by the OCR clean-up rules.
    _LETTER = u"a-zA-Z0-9ãáàéêóõíúç"

    # Ordered (regular expression, replacement) pairs applied by
    # _cleanup_ocr().  The order matters: later rules see the output of the
    # earlier ones.  All patterns are unicode so that non-ASCII characters
    # such as "»" are matched as single characters.
    _OCR_FIXES = [
        # convert to wikisyntax
        # this adds a second line feed, which makes a new paragraph
        (u"\x1f", u"\n"),  # US 0x1F
        (u"\x1d", u"\n"),  # GS 0x1D
        (u"\x0c", u"\n"),  # FF 0x0C
        # additional adjustments
        # NOTE(review): the character removed here was unreadable in the copy
        # this was restored from; U+FFFD (produced by the 'replace' decoding
        # mode) is assumed.  It may originally have been another control
        # character such as VT (0x0B) - confirm against the real script.
        (u"\ufffd", u""),
        (u"\r\n", u"\n"),
        (u"  ", u" "),
        (u"[ ](['.!?,;:\"»])", u"\\1"),
        (u"« ", u"«"),
        (u" \n", u"\n"),
        (u"\n ", u"\n"),
        (u"\n\n\n", u"\n\n"),
        # a lone "I", "1" or "l" is treated as a misread "!"
        (u" I\n", u"!\n"),
        (u" 1\n", u"!\n"),
        (u" 1 ", u"! "),
        (u" l\n", u"!\n"),
        (u" l ", u"! "),
        (u"•", u"."),
        # re-join a word that was hyphenated across a line break
        (u"-\n([" + _LETTER + u"-]+)([.!?,;:\"» ])", u"\\1\\2 \n "),
        (u"([" + _LETTER + u"])—([" + _LETTER + u"])", u"\\1 — \\2"),
        (u"\n—([" + _LETTER + u"])", u"\n\\1"),
        (u" c ", u" e "),
        (u"dc", u"de"),
        (u"cm", u"em"),
        (u"A'", u"Á"),
        (u"E'", u"É"),
    ]

    def __init__(self, filename, index, pages, ask=False, overwrite='ask', dry=False):
        """
        Constructor. Parameters:
        filename  : path of the djvu or pdf file
        index     : name of the index page
        pages     : page range ("<start>-<end>", "<start>-" or "<page>"),
                    or None for every page
        ask       : ask for confirmation before saving each page
        overwrite : 'y' or 'n' to answer confirmations automatically;
                    any other value asks the user
        dry       : show the diff only, never save
        """
        self.filename = filename
        # Decide on the real file extension, case-insensitively, instead of
        # looking for ".djvu" anywhere in the path.
        self.djvu = os.path.splitext(filename)[1].lower() == '.djvu'
        self.index = index
        self.pages = pages
        self.dry = dry
        self.ask = ask
        self.overwrite = overwrite

    @staticmethod
    def _encode_args(args):
        """Return args in the form subprocess needs on this Python version."""
        if str is not bytes:
            # Python 3: text arguments are passed through unchanged.
            return list(args)
        # Python 2: unicode arguments must be byte strings.
        encoding = sys.getfilesystemencoding() or 'utf-8'
        return [arg.encode(encoding) if isinstance(arg, type(u'')) else arg
                for arg in args]

    def _tool_output(self, args):
        """Run an external tool (argument list, no shell); return its stdout bytes."""
        process = subprocess.Popen(self._encode_args(args),
                                   stdout=subprocess.PIPE)
        return process.communicate()[0]

    def _read_tool_text(self, args):
        """Run a text extractor writing to "<filename>.out"; return that text.

        The output file name is appended to args.  A stale output file is
        removed first so a failed run cannot return the text of a previous
        page; if the tool writes nothing, opening the file raises IOError.
        """
        out_name = u"%s.out" % self.filename
        if os.path.exists(out_name):
            os.remove(out_name)
        status = subprocess.call(self._encode_args(list(args) + [out_name]))
        if status != 0:
            pywikibot.output(u"Warning: %s exited with status %d"
                             % (args[0], status))
        with codecs.open(out_name, 'r', config.textfile_encoding,
                         'replace') as out_file:
            return out_file.read()

    @staticmethod
    def _parse_page_range(pages, last):
        """Return (start, end) for a page range string.

        pages is "<start>-<end>", "<start>-", "-<end>", "<page>" or empty/None
        (all pages).  The result is clamped to 1..last.  Raises ValueError
        when a bound is not an integer.
        """
        start, end = 1, last
        if pages:
            first, dash, rest = pages.partition('-')
            if dash:
                if first:
                    start = int(first)
                if rest:
                    end = int(rest)
            else:
                start = end = int(pages)
        # Pages outside the file do not exist; do not try to extract them.
        return max(start, 1), min(end, last)

    @classmethod
    def _cleanup_ocr(cls, text):
        """Apply the ordered OCR clean-up substitutions to text."""
        for pattern, replacement in cls._OCR_FIXES:
            text = re.sub(pattern, replacement, text)
        return text

    def _build_page_text(self, filetxt, pageno):
        """Return the wikitext for scan page pageno with extracted text filetxt.

        An empty filetxt gives a quality level 0 page without header.
        Otherwise the page gets quality level 1 and a {{cabeçalho}} header
        carrying pageno - page_offset; that number goes in the third field
        when it is odd and in the first field when it is even.
        """
        if not filetxt:
            return u'<noinclude><pagequality level="0" user="%s" /><div class="pagetext">\n\n\n</noinclude><noinclude><references/></div></noinclude>' % (self.username)

        real_pageno = pageno - self.page_offset
        if real_pageno % 2:
            # odd page
            header = u'{{cabeçalho|||%d}}' % real_pageno
        else:
            # even page
            header = u'{{cabeçalho|%d||}}' % real_pageno
        text = u'<noinclude><pagequality level="1" user="%s" /><div class="pagetext">%s\n\n\n</noinclude>%s<noinclude><references/></div></noinclude>' % (self.username, header, filetxt)
        # The clean-up runs over the complete wikitext, wrapper included, as
        # it always did; among other things this reduces the three line feeds
        # after the header to two.
        return self._cleanup_ocr(text)

    def NoOfImages(self):
        """Return the number of pages in the file.

        Uses "djvused -e n" for DjVu files and the "Pages:" line of pdfinfo
        for PDF files.  Raises ValueError when no page count is found.
        """
        if self.djvu:
            lines = self._tool_output(
                ['djvused', '-e', 'n', self.filename]).splitlines()
            count = lines[0].strip() if lines else b''
        else:
            # Look the line up by its label; its position in the pdfinfo
            # output depends on which metadata fields the PDF has.
            match = re.search(br'^Pages:\s*(\d+)',
                              self._tool_output(['pdfinfo', self.filename]),
                              re.MULTILINE)
            count = match.group(1) if match else b''
        if not count:
            raise ValueError("Could not determine the page count of %r"
                             % (self.filename,))
        count = int(count)
        pywikibot.output("page count = %d" % count)
        return count

    def PagesGenerator(self):
        """Return the scan page numbers to process, from self.pages."""
        start, end = self._parse_page_range(self.pages, self.NoOfImages())
        pywikibot.output(u"Processing pages %d-%d" % (start, end))
        return range(start, end + 1)

    def run(self):
        """Process every selected page of the file."""
        # Set the edit summary message
        pywikibot.setAction(i18n.twtranslate(pywikibot.getSite(),
                                             'djvutext-creating'))

        linkingPage = pywikibot.Page(pywikibot.getSite(), self.index)
        self.prefix = linkingPage.title(withNamespace=False)
        if self.prefix[0:6] == 'Liber:':
            self.prefix = self.prefix[6:]
        pywikibot.output(u"Using prefix %s" % self.prefix)
        gen = self.PagesGenerator()

        site = pywikibot.getSite()
        self.username = config.usernames[site.family.name][site.lang]

        for pageno in gen:
            pywikibot.output("Processing page %d" % pageno)
            self.treat(pageno)

    def has_text(self):
        """Return True if the file has a text layer.

        DjVu: the djvudump output mentions a "TXTz" chunk.
        PDF: pdftotext extracts something other than whitespace (it emits
        form feeds even for pages without any text).
        """
        if self.djvu:
            return b'TXTz' in self._tool_output(['djvudump', self.filename])
        # pdftotext needs an explicit output file; without one it writes to
        # "<name>.txt" rather than to standard output.
        extracted = self._read_tool_text(
            ['pdftotext', '-enc', 'UTF-8', self.filename])
        return bool(extracted.strip())

    def get_page(self, pageno):
        """Return the text of scan page pageno as extracted by the tool."""
        pywikibot.output(u"fetching page %d" % pageno)
        if self.djvu:
            args = ['djvutxt', '--page=%d' % pageno, self.filename]
        else:
            args = ['pdftotext', '-enc', 'UTF-8',
                    '-f', str(pageno), '-l', str(pageno), self.filename]
        return self._read_tool_text(args)

    def treat(self, pageno):
        """
        Builds the wikitext for scan page pageno, shows the diff and saves it.

        Confirmation is asked when -ask was given or the page already
        exists, unless self.overwrite is 'y' or 'n'.  In dry mode nothing
        is saved.
        """
        site = pywikibot.getSite()
#        page_namespace = site.mediawiki_message('Proofreadpage namespace')
        page_namespace = site.family.namespaces[106][site.lang]
        page = pywikibot.Page(site, u'%s:%s/%d'
                              % (page_namespace, self.prefix, pageno))
        exists = page.exists()
        text = self._build_page_text(self.get_page(pageno), pageno)

        # only save if something was changed
        # automatically ask if overwriting an existing page
        ask = self.ask

        if exists:
            ask = True
            old_text = page.get()
            if old_text == text:
                pywikibot.output(u"No changes were needed on %s"
                                 % page.title(asLink=True))
                return
        else:
            old_text = ''
        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        pywikibot.showDiff(old_text, text)
        if self.dry:
            pywikibot.inputChoice(u'Dry mode... Press enter to continue', [],
                                  [], 'dummy')
            return
        if ask: # True either when the -ask flag is used or if the page exists
            if self.overwrite == 'n':
                choice = 'n'
                pywikibot.output(u"You did not accept these changes")
            elif self.overwrite == 'y':
                choice = 'y'
                pywikibot.output(u"You accepted these changes")
            else:
                choice = pywikibot.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
        else:
            choice = 'y'
        if choice == 'y':
            try:
                # Save the page
                page.put_async(text)
            except pywikibot.LockedPage:
                pywikibot.output(u"Page %s is locked; skipping."
                                 % page.title(asLink=True))
            except pywikibot.EditConflict:
                pywikibot.output(u'Skipping %s because of edit conflict' % (page.title()))
            except pywikibot.SpamfilterError as error:
                pywikibot.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))


def main():
    """Parse the command line, check the file and index page, run the bot.

    Recognised arguments are -ask, -overwrite:, -file:, -index: and -pages:;
    anything else is reported as unknown.  The help text is shown when no
    file (or no index name) is available.
    """
    index = filename = pages = None
    ask = False
    overwrite = 'ask'

    # Parse command line arguments
    for arg in pywikibot.handleArgs():
        if arg.startswith("-ask"):
            ask = True
        elif arg.startswith("-overwrite:"):
            # only the first character of the value is significant
            overwrite = arg[len("-overwrite:"):][:1]
            if overwrite not in ('y', 'n'):
                pywikibot.output(u"Unknown argument %s; will ask before overwriting" % arg)
                overwrite = 'ask'
        elif arg.startswith("-file:"):
            filename = arg[len("-file:"):]
        elif arg.startswith("-index:"):
            index = arg[len("-index:"):]
        elif arg.startswith("-pages:"):
            pages = arg[len("-pages:"):]
        else:
            pywikibot.output(u"Unknown argument %s" % arg)

    if filename:
        # os.stat raises when the file cannot be accessed
        os.stat(filename)
        # the index name defaults to the base name of the file
        index = index or os.path.basename(filename)

    if not (filename and index):
        pywikibot.showHelp()
        return

    site = pywikibot.getSite()
    index_page = pywikibot.Page(site, index)

    if site.family.name != 'wikisource':
        raise pywikibot.PageNotFound(u"Found family '%s'; Wikisource required." % site.family.name)

    if not index_page.exists() and index_page.namespace() == 0:
        # retry the lookup inside the ProofreadPage index namespace
        index_namespace = site.mediawiki_message('Proofreadpage index namespace')
        index_page = pywikibot.Page(pywikibot.getSite(),
                                    u"%s:%s" % (index_namespace, index))
    if not index_page.exists():
        raise pywikibot.NoPage(u"Page '%s' does not exist" % index)

    pywikibot.output(u"uploading text from %s to %s"
                     % (filename, index_page.title(asLink=True)) )
    bot = FileTextBot(filename, index, pages, ask, overwrite, pywikibot.simulate)
    if not bot.has_text():
        raise ValueError("No text layer in the file")
    bot.run()

# Script entry point: run main() and call pywikibot.stopme() afterwards,
# whether main() returned normally or raised.
if __name__ == "__main__":
    try:
        main()
    finally:
        pywikibot.stopme()