Utilizador:Giro720/OCRtext.py
Aspeto
Este script necessita que os diretórios de instalação das ferramentas do DjVuLibre (no caso de arquivos DjVu) e do Xpdf (para PDFs) estejam definidos no "PATH" do sistema operacional. O script deverá ser executado dentro do diretório raiz do pywikibot.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot uploads text from djvu or pdf files onto pages in the "Page"
namespace. It is intended to be used for pt.wikisource.
The following parameters are supported:
-ask Ask for confirmation before uploading each page.
(Default: ask when overwriting pages)
-overwrite:no When asking for confirmation, the answer is no.
-overwrite:yes When asking for confirmation, the answer is yes.
(Default: ask for the answer)
-file:... Filename of the djvu or pdf file
-index:... Name of the index page
(Default: the base name of the djvu or pdf file)
-pages:<start>-<end> Page range to upload; <end> is optional
Any other parameter is reported as an unknown argument and ignored.
"""
# adapted from 'djvutext.py' by
# (C) Pywikipedia bot team, 2008-2011
#
# Distributed under the terms of the MIT license.
#
#__version__ = '$Id: 526518c4583bdb30d8cfb08039e76939580efa3e $'
import wikipedia as pywikibot
from pywikibot import i18n
import os, sys
import config, codecs
import re
# This is required for the text that is shown when you run this script
# with the parameter -help.
# The mapping is empty in this script: no replacement entries are defined.
docuReplacements = {
}
class FileTextBot:
    """Bot that copies the text layer of a DjVu or PDF file to wiki pages.

    For every selected page of the file the text is extracted with the
    DjVuLibre (``djvused``, ``djvudump``, ``djvutxt``) or Xpdf (``pdfinfo``,
    ``pdftotext``) command line tools, wrapped in the proofreading markup,
    cleaned up and saved as ``<page namespace>:<prefix>/<page number>``.

    The extracted text is always written to ``<filename>.out`` next to the
    input file; that scratch file is overwritten on every call and is not
    removed afterwards.
    """

    # Value subtracted from the page index inside the file to obtain the
    # number written into the {{cabeçalho}} header template.
    page_offset = 10

    # Characters treated as part of a word by the hyphenation and dash rules
    # (kept exactly as in the original character classes).
    _WORD_CHARS = u"a-zA-Z10-9ãáàéêóõíúç"

    def __init__(self, filename, index, pages, ask=False, overwrite='ask',
                 dry=False):
        """
        Constructor. Parameters:
            filename  : path of the djvu or pdf file
            index     : name of the index page
            pages     : page range ("<start>-<end>", "<start>-", "<page>")
                        or None for the whole file
            ask       : ask for confirmation before saving every page
            overwrite : 'y' or 'n' answers the confirmation automatically;
                        any other value asks the user
            dry       : only show the diff, never save
        """
        self.filename = filename
        # Decide on the real extension, ignoring case, instead of looking for
        # '.djvu' anywhere in the name.
        extension = os.path.splitext(filename)[1]
        self.djvu = extension.lower() == '.djvu'
        self.index = index
        self.pages = pages
        self.dry = dry
        self.ask = ask
        self.overwrite = overwrite

    @staticmethod
    def _encode_command(cmd):
        """Encode a shell command line for os.system / os.popen."""
        # sys.stdout.encoding is None when the output is not a terminal.
        encoding = (sys.stdout.encoding or sys.getfilesystemencoding()
                    or 'utf-8')
        return cmd.encode(encoding)

    @staticmethod
    def _parse_pdf_page_count(lines):
        """Return the page count found in the lines printed by pdfinfo.

        Raises ValueError when no "Pages:" line is present.
        """
        label = 'Pages:'
        for line in lines:
            if line.startswith(label):
                return int(line[len(label):].strip())
        raise ValueError("pdfinfo did not report a page count")

    def _read_output(self):
        """Return the content of the scratch file ``<filename>.out``."""
        f = codecs.open(u"%s.out" % self.filename, 'r',
                        config.textfile_encoding, 'replace')
        try:
            return f.read()
        finally:
            f.close()

    def NoOfImages(self):
        """Return the number of pages in the file and report it."""
        if self.djvu:
            cmd = u"djvused -e 'n' \"%s\"" % (self.filename)
            count = os.popen(self._encode_command(cmd)).readline().rstrip()
        else:
            cmd = u"pdfinfo \"%s\"" % (self.filename)
            # Search for the "Pages:" field instead of relying on it being
            # the ninth line of the pdfinfo output.
            count = self._parse_pdf_page_count(
                os.popen(self._encode_command(cmd)).readlines())
        count = int(count)
        pywikibot.output("page count = %d" % count)
        return count

    def PagesGenerator(self):
        """Return the list of page numbers selected by ``self.pages``.

        Without a range every page from 1 to the page count is returned.
        A missing start ("-5") defaults to 1, a missing end ("5-") to the
        page count; a single number selects just that page.
        """
        start = 1
        end = self.NoOfImages()
        if self.pages:
            first, dash, last = self.pages.partition('-')
            if dash:
                if first:
                    start = int(first)
                if last:
                    end = int(last)
            else:
                start = int(self.pages)
                end = start
        pywikibot.output(u"Processing pages %d-%d" % (start, end))
        return range(start, end+1)

    def run(self):
        """Upload every selected page of the file."""
        site = pywikibot.getSite()
        # Set the edit summary message
        pywikibot.setAction(i18n.twtranslate(site, 'djvutext-creating'))
        linkingPage = pywikibot.Page(site, self.index)
        self.prefix = linkingPage.title(withNamespace=False)
        # Drop a leading 'Liber:' that is still part of the title.
        if self.prefix[0:6] == 'Liber:':
            self.prefix = self.prefix[6:]
        pywikibot.output(u"Using prefix %s" % self.prefix)
        gen = self.PagesGenerator()
        self.username = config.usernames[site.family.name][site.lang]
        for pageno in gen:
            pywikibot.output("Processing page %d" % pageno)
            self.treat(pageno)

    def has_text(self):
        """Return True if the file appears to contain a text layer.

        DjVu: the output of ``djvudump`` must contain the marker 'TXTz'.
        PDF: ``pdftotext`` must produce something other than whitespace.
        """
        if self.djvu:
            cmd = u"djvudump \"%s\" > \"%s\".out" % (self.filename,
                                                      self.filename)
            os.system(self._encode_command(cmd))
            return self._read_output().find('TXTz') >= 0
        # Name the output file explicitly: without it pdftotext writes to
        # <name>.txt, so redirecting stdout only produced an empty file.
        cmd = u"pdftotext -enc UTF-8 \"%s\" \"%s.out\"" \
              % (self.filename, self.filename)
        os.system(self._encode_command(cmd))
        # The former test "len(s) >= 0" was always true; blank output (for
        # instance only page separators) means there is no text.
        return len(self._read_output().strip()) > 0

    def get_page(self, pageno):
        """Extract and return the text of page ``pageno`` of the file."""
        pywikibot.output(u"fetching page %d" % (pageno))
        if self.djvu:
            cmd = u"djvutxt --page=%d \"%s\" \"%s.out\"" \
                  % (pageno, self.filename, self.filename)
        else:
            cmd = u"pdftotext -enc UTF-8 -l %d -f %d \"%s\" \"%s.out\"" \
                  % (pageno, pageno, self.filename, self.filename)
        os.system(self._encode_command(cmd))
        return self._read_output()

    def _build_page_text(self, filetxt, pageno):
        """Wrap the extracted text of a page in the proofreading markup.

        Pages without text get quality level 0 and no header; other pages
        get level 1 and a {{cabeçalho}} header carrying the page number in
        its last field on odd pages and in its first field on even pages.
        """
        real_pageno = pageno - self.page_offset
        if not filetxt:
            return u'<noinclude><pagequality level="0" user="%s" /><div class="pagetext">\n\n\n</noinclude><noinclude><references/></div></noinclude>' % (self.username)
        if real_pageno % 2:
            # odd page
            return u'<noinclude><pagequality level="1" user="%s" /><div class="pagetext">{{cabeçalho|||%d}}\n\n\n</noinclude>%s<noinclude><references/></div></noinclude>' % (self.username, real_pageno, filetxt)
        # even page
        return u'<noinclude><pagequality level="1" user="%s" /><div class="pagetext">{{cabeçalho|%d||}}\n\n\n</noinclude>%s<noinclude><references/></div></noinclude>' % (self.username, real_pageno, filetxt)

    @staticmethod
    def _cleanup_text(text):
        """Convert the raw text to wiki syntax and fix common OCR errors.

        The substitutions are order dependent and are applied to the whole
        page text, markup included.
        """
        letters = FileTextBot._WORD_CHARS

        # convert to wikisyntax
        # this adds a second line feed, which makes a new paragraph
        text = text.replace(u'\x1f', u"\n")  # US \x1F
        text = text.replace(u'\x1d', u"\n")  # GS \x1D
        text = text.replace(u'\x0c', u"\n")  # FF \x0C

        # additional adjustments
        # Strip U+FFFD, the character inserted by the 'replace' error
        # handler when the tool output cannot be decoded.
        text = text.replace(u'\ufffd', u"")
        text = text.replace(u'\r\n', u"\n")
        # NOTE(review): the available copy of this line replaced one space
        # by one space (a no-op); presumably it collapsed doubled spaces --
        # confirm against the original script.
        text = re.sub(u"  ", u" ", text)
        # no space before closing punctuation, none after an opening quote
        text = re.sub(u"[ ](['\\.!?,;:\"»])", u"\\1", text)
        text = re.sub(u"« ", u"«", text)
        # whitespace around line breaks, surplus blank lines
        text = re.sub(u" \n", u"\n", text)
        text = re.sub(u"\n ", u"\n", text)
        text = re.sub(u"\n\n\n", u"\n\n", text)
        # isolated "I", "1" and "l" are read as exclamation marks
        text = re.sub(u" I\n", u"!\n", text)
        text = re.sub(u" 1\n", u"!\n", text)
        text = re.sub(u" 1 ", u"! ", text)
        text = re.sub(u" l\n", u"!\n", text)
        text = re.sub(u" l ", u"! ", text)
        text = text.replace(u'•', u".")
        # rejoin a word hyphenated across a line break
        text = re.sub(u"-\n([" + letters + u"\\-]+)([\\.!?,;:\"» ])",
                      u"\\1\\2 \n ", text)
        # spaces around dashes
        text = re.sub(u"([" + letters + u"])—([" + letters + u"])",
                      u"\\1 — \\2", text)
        text = re.sub(u"\n—([" + letters + u"])", u"\n— \\1", text)
        # frequent misreadings: "c" for "e", apostrophe for an acute accent
        text = re.sub(u" c ", u" e ", text)
        text = re.sub(u"dc", u"de", text)
        text = re.sub(u"cm", u"em", text)
        text = re.sub(u"A'", u"Á", text)
        text = re.sub(u"E'", u"É", text)
        return text

    def treat(self, pageno):
        """
        Loads the given page, does some changes, and saves it.
        """
        site = pywikibot.getSite()
        # Namespace id 106 is looked up in the family's namespace table.
        page_namespace = site.family.namespaces[106][site.lang]
        page = pywikibot.Page(site, u'%s:%s/%d'
                              % (page_namespace, self.prefix, pageno))
        exists = page.exists()
        filetxt = self.get_page(pageno)
        text = self._cleanup_text(self._build_page_text(filetxt, pageno))

        # only save if something was changed
        # automatically ask if overwriting an existing page
        ask = self.ask
        if exists:
            ask = True
            old_text = page.get()
            if old_text == text:
                pywikibot.output(u"No changes were needed on %s"
                                 % page.title(asLink=True))
                return
        else:
            old_text = ''
        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        pywikibot.showDiff(old_text, text)
        if self.dry:
            pywikibot.inputChoice(u'Dry mode... Press enter to continue', [],
                                  [], 'dummy')
            return
        if ask:  # True either when the -ask flag is used or if the page exists
            if self.overwrite == 'n':
                choice = 'n'
                pywikibot.output(u"You did not accept these changes")
            elif self.overwrite == 'y':
                choice = 'y'
                pywikibot.output(u"You accepted these changes")
            else:
                choice = pywikibot.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
        else:
            choice = 'y'
        if choice == 'y':
            try:
                # Save the page
                page.put_async(text)
            except pywikibot.LockedPage:
                pywikibot.output(u"Page %s is locked; skipping."
                                 % page.title(asLink=True))
            except pywikibot.EditConflict:
                pywikibot.output(u'Skipping %s because of edit conflict' % (page.title()))
            except pywikibot.SpamfilterError as error:
                pywikibot.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
def main():
    """Parse the command line, validate the inputs and run the bot.

    Recognised arguments are -ask, -overwrite:, -file:, -index: and -pages:;
    anything else is reported as unknown. Help is shown unless both a file
    and an index name are available (the index name defaults to the base
    name of the file).
    """
    import os.path

    ask = False
    overwrite = 'ask'
    filename = None
    index = None
    pages = None

    # Parse command line arguments
    for arg in pywikibot.handleArgs():
        if arg.startswith("-ask"):
            ask = True
            continue
        if arg.startswith("-overwrite:"):
            overwrite = arg[11:12]
            if overwrite not in ('y', 'n'):
                pywikibot.output(u"Unknown argument %s; will ask before overwriting" % arg)
                overwrite = 'ask'
            continue
        if arg.startswith("-file:"):
            filename = arg[6:]
            continue
        if arg.startswith("-index:"):
            index = arg[7:]
            continue
        if arg.startswith("-pages:"):
            pages = arg[7:]
            continue
        pywikibot.output(u"Unknown argument %s" % arg)

    if filename:
        # os.stat raises if the file does not exist
        os.stat(filename)
        if not index:
            index = os.path.basename(filename)

    if not (filename and index):
        pywikibot.showHelp()
        return

    site = pywikibot.getSite()
    index_page = pywikibot.Page(site, index)

    if site.family.name != 'wikisource':
        raise pywikibot.PageNotFound(u"Found family '%s'; Wikisource required." % site.family.name)

    # A missing main-namespace title is retried inside the index namespace.
    if not index_page.exists() and index_page.namespace() == 0:
        index_namespace = site.mediawiki_message('Proofreadpage index namespace')
        namespaced_title = u"%s:%s" % (index_namespace, index)
        index_page = pywikibot.Page(pywikibot.getSite(), namespaced_title)

    if not index_page.exists():
        raise pywikibot.NoPage(u"Page '%s' does not exist" % index)

    pywikibot.output(u"uploading text from %s to %s"
                     % (filename, index_page.title(asLink=True)))

    bot = FileTextBot(filename, index, pages, ask, overwrite,
                      pywikibot.simulate)
    if not bot.has_text():
        raise ValueError("No text layer in the file")
    bot.run()
if __name__ == "__main__":
    # Run the bot only when executed as a script. stopme() sits in the
    # finally clause, so it is called whether main() returns or raises.
    try:
        main()
    finally:
        pywikibot.stopme()