#!/usr/bin/env python
import re, time
DEBUG = False
def unescape(text):
    """Replace HTML/XML character references and entities in a string.

    Adapted from Fredrik Lundh's recipe:
    http://effbot.org/zone/re-sub.htm#unescape-html

    Numeric references (``&#65;``, ``&#x41;``) and named entities
    (``&amp;``, ``&eacute;``) are replaced by the character they denote.
    A reference that cannot be resolved (unknown entity name, malformed
    number, code point out of range) is left in the text unchanged.

    @param text The HTML (or XML) source text.
    @return The text with every resolvable reference replaced.
    """
    # Resolve the names that differ between Python 2 and Python 3 locally,
    # so the function does not rely on any module-level import besides re.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    # Kept as plain string literals so that, on Python 2, text containing
    # only these entities is not promoted to a unicode object.
    reserved = {"amp": "&", "gt": ">", "lt": "<"}

    def fixup(m):
        ref = m.group(0)
        try:
            if ref[:2] == "&#":
                # Numeric character reference, decimal or hexadecimal.
                digits = ref[2:-1]
                if digits[:1] in ("x", "X"):
                    return to_char(int(digits[1:], 16))
                return to_char(int(digits))
            # Named entity.
            name = ref[1:-1]
            if name in reserved:
                return reserved[name]
            return to_char(name2codepoint[name])
        except (KeyError, ValueError, OverflowError):
            # Unknown name, malformed number or unrepresentable code point.
            return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
def get_data_line():
    """Return the line of the IMDb Top 250 page that carries the marker text.

    When DEBUG is true the page is read from the local file "top.html";
    otherwise it is downloaded from http://www.imdb.com/chart/top.

    @return The first line containing the sentence
            "Top 250 movies as voted by our users", or None when no line
            of the page contains it.
    """
    if DEBUG:
        f = open("top.html")
    else:
        try:
            from urllib import urlopen  # Python 2
        except ImportError:
            from urllib.request import urlopen  # Python 3
        f = urlopen("http://www.imdb.com/chart/top")
    try:
        data = f.read()
    finally:
        # Release the file or connection even if the read fails.
        f.close()
    if not isinstance(data, str):
        # Python 3 downloads yield bytes; the substring test below needs text.
        # NOTE(review): assumes the page is UTF-8 encoded -- confirm.
        data = data.decode("utf-8", "replace")
    for line in data.splitlines():
        if "Top 250 movies as voted by our users" in line:
            return line
    return None
# Group the titles found on the chart line by release year and print one
# entry per year, newest first, as "<year> (<count>):" followed by the titles.
line = get_data_line()
if line is None:
    # Without the marker line there is nothing to parse; stop with a clear
    # message instead of failing on None.split below.
    raise SystemExit("Could not find the Top 250 chart line in the page.")
# The piece before the first "</tr>" is discarded.
items = line.split("</tr>")[1:]
# Captures the link text (title) and the parenthesised year that follows it.
title_re = re.compile(r'.+<a href=".+">(.+)</a> \((\d+)\)</font>')
try:
    text_type = unicode  # Python 2
except NameError:
    text_type = str  # Python 3
years = {}
this_year = time.localtime().tm_year
earliest_year = this_year
for item in items:
    m = title_re.match(item)
    if not m:
        continue
    year = int(m.group(2))
    earliest_year = min(earliest_year, year)
    years.setdefault(year, []).append(text_type(unescape(m.group(1))))
# Walk from the current year back to the earliest one seen, skipping years
# that have no titles.
for year in range(this_year, earliest_year - 1, -1):
    if year in years:
        print("%d (%d):\n%s" % (year, len(years[year]), ", ".join(years[year])))
# Shared with Droplr