"""Download a document, extract one delimited section and save it as text.

Usage: pass the URL as the first command-line argument.

The script downloads the URL, takes the first section matched by
``BODY_PATTERN``, decodes it as UTF-8, resolves HTML entities, removes long
runs of ``=``, ``+`` and ``*``, joins hard-wrapped lines and writes the result
to ``output.txt`` in the current directory.

The code runs unchanged on Python 2 and Python 3.
"""
import io
import re
import sys
import urllib

try:  # Python 2
    from htmlentitydefs import name2codepoint
except ImportError:  # Python 3
    from html.entities import name2codepoint

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

try:  # Python 2
    unichr
except NameError:  # Python 3: chr() already covers the whole Unicode range
    unichr = chr

# Character references ("&#65;", "&#x41;") and named entities ("&amp;").
_ENTITY_RE = re.compile(r"&#?\w+;")

# Delimiters of the section to extract, applied to the raw downloaded bytes.
# NOTE(review): as found, this pattern consists only of newline delimiters.
# The DOTALL/IGNORECASE flags suggest it originally contained HTML tags that
# were lost before this file was captured -- confirm against the intended
# markup and restore the tags if so.
BODY_PATTERN = re.compile(b'\n(.*?)\n', re.DOTALL | re.IGNORECASE)

# Decorative separator runs to delete.  They are applied one after another
# (not as one alternation) because removing one kind of run can join the
# neighbouring characters into a run of another kind.
_RULE_PATTERNS = (r'={5,}', r'\+{5,}', r'\*{5,}')

OUTPUT_PATH = 'output.txt'


def unescape(text):
    """Replace HTML character references and named entities in *text*.

    Decimal (``&#65;``) and hexadecimal (``&#x41;`` or ``&#X41;``) references
    and named entities known to ``name2codepoint`` are replaced by the
    character they denote.  Anything that cannot be resolved -- an unknown
    name, a malformed number, or a code point outside the valid range -- is
    left in the text unchanged.

    Returns the text with all resolvable entities substituted.
    """
    def fixup(match):
        entity = match.group(0)
        if entity[:2] == "&#":
            # Numeric character reference.
            try:
                if entity[:3].lower() == "&#x":
                    return unichr(int(entity[3:-1], 16))
                return unichr(int(entity[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number or not a valid code point.
                # OverflowError: number too large for unichr()/chr().
                pass
        else:
            # Named entity.
            try:
                return unichr(name2codepoint[entity[1:-1]])
            except KeyError:
                pass
        return entity  # leave as is

    return _ENTITY_RE.sub(fixup, text)


def extract_body(raw):
    """Return the first section of *raw* matched by ``BODY_PATTERN``.

    *raw* is the downloaded document as bytes.  The captured group is decoded
    as UTF-8 with undecodable bytes dropped.  Returns ``None`` when the
    pattern does not match at all.
    """
    match = BODY_PATTERN.search(raw)
    if match is None:
        return None
    return match.group(1).decode('utf-8', 'ignore')


def strip_rules(text):
    """Remove every run of five or more ``=``, ``+`` or ``*`` from *text*."""
    for pattern in _RULE_PATTERNS:
        text = re.sub(pattern, '', text)
    return text


def unwrap_lines(text):
    """Join hard-wrapped lines, keeping only some line breaks.

    A ``\\n`` is turned into ``\\r\\n`` when the character right after it is
    ``\\r``, ``\\n`` or a space; every other ``\\n`` (including one at the very
    end of *text*) is dropped without inserting anything.  All ``\\r``
    characters are dropped.  Every other character is copied unchanged.
    """
    pieces = []
    last = len(text) - 1
    for index, char in enumerate(text):
        if char == u'\n':
            if index < last and text[index + 1] in u'\r\n ':
                pieces.append(u'\r\n')
        elif char != u'\r':
            pieces.append(char)
    return u''.join(pieces)


def main(argv=None):
    """Run the download/clean-up pipeline for the URL in ``argv[1]``.

    *argv* defaults to ``sys.argv``.  Exits via ``sys.exit`` when no URL is
    given, when the download fails, or when the document contains no section
    matching ``BODY_PATTERN``.
    """
    if argv is None:
        argv = sys.argv

    try:
        url = argv[1]
    except IndexError:
        print('no argument given')
        sys.exit()

    print('download from %s' % url)
    try:
        response = urlopen(url)
        try:
            raw = response.read()
        finally:
            response.close()
    except Exception:
        # Top-level boundary: any download problem ends the run here instead
        # of continuing without data.
        print('error downloading')
        sys.exit(1)

    print('length of doc: %i' % len(raw))

    body = extract_body(raw)
    if body is None:
        print('no matching section found')
        sys.exit(1)

    cleaned = unwrap_lines(strip_rules(unescape(body)))

    # Write explicit UTF-8 so non-ASCII text cannot fail on an implicit
    # encode; newline='' keeps the '\r\n' sequences exactly as produced.
    with io.open(OUTPUT_PATH, 'w', encoding='utf-8', newline='') as handle:
        handle.write(cleaned)


if __name__ == '__main__':
    main()