Utilizador:Giro720/Gutenberg2wiki.py
Aspeto
Este script foi escrito originalmente para o python 2.X, e requer instalada a biblioteca BeautifulSoup.
Tipicamente os livros do Projeto Gutenberg possuem a marcação de páginas através da tag <span class="pagenum">
, embora o nome da classe possa variar conforme o livro.
#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup as bs
import codecs
import urllib2
import os.path
try:
import pypandoc
except:
pass
def read_Gutenberg(url):
    """Read a Project Gutenberg book, caching it as a local file.

    The file name is the last path segment of *url*.  On the first call
    the book is downloaded and saved to disk; subsequent calls reuse the
    local copy.

    Returns the book's HTML as read by ``codecs.open`` with no explicit
    encoding (decoding therefore follows the platform default).
    """
    filename = url.split('/')[-1]
    if os.path.isfile(filename):
        print("Lendo arquivo local...")
    else:
        print("Baixando livro da internet...")
        # Bug fix: the original wrote the download to disk (via the
        # Python 2 `file()` builtin) but never assigned `html`, so this
        # branch raised NameError on return.  Download, save, then fall
        # through to the common read below; close both handles.
        remote = urllib2.urlopen(url)
        try:
            data = remote.read()
        finally:
            remote.close()
        out = open(filename, "wb")
        try:
            out.write(data)
        finally:
            out.close()
    html = codecs.open(filename).read()
    return html
def clean_text(text):
    """Convert a chunk of Gutenberg HTML into MediaWiki markup.

    Prefers pypandoc (imported optionally at module top) for the
    HTML -> mediawiki conversion; when pypandoc is unavailable or the
    conversion fails, falls back to a small hand-rolled set of tag
    replacements.  In both paths, elision apostrophes (d', n', h') are
    normalized to the typographic form first.
    """
    # Python 2 used unicode(); on Python 3 str is already unicode.
    try:
        text = unicode(text)
    except NameError:
        text = str(text)
    # Common normalization, previously duplicated in both branches.
    text = text.replace(u"d\'", u"d’").replace(u"D\'", u"D’").strip()
    text = text.replace(u"n\'", u"n’").replace(u"N\'", u"N’")
    text = text.replace(u"h\'", u"h’")
    try:
        # Preferred path: let pandoc do a proper HTML -> wiki conversion.
        text = pypandoc.convert(text, 'mediawiki', format='html')
        return text.replace(u"\r", u"")
    except Exception:
        # pypandoc missing (its import is optional) or conversion
        # failed: use the manual replacements below instead.
        pass
    text = text.replace(u"\r", u"").replace(u"\n", u" ").strip()
    text = text.replace(u"<b>", u"'''").replace(u"</b>", u"'''")
    text = text.replace(u"<em>", u"''").replace(u"</em>", u"''")
    # Bug fix: the original replaced "</em>" here (already consumed by
    # the line above, so a no-op) instead of "</it>", which left closing
    # <it> tags in the generated wikitext.
    text = text.replace(u"<it>", u"''").replace(u"</it>", u"''")
    text = text.replace(u"<p>", u"\n").replace(u"</p>", u"\n")
    text = text.replace(u"<br />", u"\n").replace(u"<br/>", u"\n")
    text = text.replace(u"―", u" ― ")
    text = text.replace(u"  ", u" ").replace(u"  ", u" ")
    text = text.replace(u"\n ", u"\n")
    text = text.replace(u" \n", u"\n")
    text = text.replace(u"\n\n\n", u"\n\n")
    return text
# --- Main script ------------------------------------------------------
# Downloads one Gutenberg book, slices it page by page on the
# <span class="pagenum"> markers and writes the converted wikitext to
# "<filename>_output.txt".
url = 'http://www.gutenberg.org/files/29120/29120-h/29120-h.htm'
html = read_Gutenberg(url)
outputfile = codecs.open(url.split('/')[-1] + "_output.txt", "w", "utf-8")
soup = bs(html)
# Every page marker except the last: each marker and the following one
# delimit the text of one page.
for tag in soup.find_all('span', {"pagenum"})[0:-1]:
    print(tag)
    # Marker of the next page, found in document order.
    next_tag = tag.find_next('span', {"pagenum"})
    subtag = tag.next_sibling
    text = u""
    # Accumulate every node between this marker and the next one.
    # NOTE(review): next_sibling walks only siblings while find_next
    # walks the whole document; if the next pagenum span is not a
    # sibling of `tag`, subtag eventually becomes None and the loop
    # fails — assumes all pagenum spans share one parent. TODO confirm.
    while next_tag != subtag:
        try:
            # unicode() serializes the node; non-serializable nodes are
            # silently skipped by the bare except below.
            text = text + unicode(subtag)
        except:
            pass
        subtag = subtag.next_sibling
    outputfile.write(clean_text(text))
    # Blank lines separate consecutive pages in the output file.
    outputfile.write(u"\n\n\n\n")
outputfile.close()