#!/usr/bin/env python

import re, time

DEBUG = False

def unescape(text):
    """Replace HTML or XML character references and entities in a string.

    Numeric references (``&#65;``, ``&#x41;``) and named entities found in
    the standard HTML entity table (``&amp;``, ``&eacute;``, ...) are
    replaced by the character they stand for. Anything that cannot be
    resolved (unknown entity name, malformed or out-of-range number) is
    left in the text exactly as it was.

    Based on Fredrik Lundh's recipe,
    http://effbot.org/zone/re-sub.htm#unescape-html
    The original 2008-01-03 note says the input should be a unicode string.

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string, if necessary.
    """
    # Imported locally because the file never imports the entity table at
    # module level; the second form is the same table's Python 3 location.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint

    # unichr only exists on Python 2; chr is its Python 3 equivalent.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    # The reserved characters are returned as plain literals rather than
    # via the entity table, so that under Python 2 a byte-string input
    # containing only these entities is not promoted to unicode.
    reserved = {"amp": "&", "gt": ">", "lt": "<"}

    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference, hexadecimal or decimal.
            try:
                if ref[:3] == "&#x":
                    return to_char(int(ref[3:-1], 16))
                return to_char(int(ref[2:-1]))
            except (ValueError, OverflowError):
                return ref  # malformed or out of range: leave as is

        # Named entity.
        name = ref[1:-1]
        if name in reserved:
            return reserved[name]
        try:
            return to_char(name2codepoint[name])
        except KeyError:
            return ref  # unknown entity: leave as is

    return re.sub(r"&#?\w+;", fixup, text)

def get_data_line():
    """Fetch the chart page and return the line carrying the marker text.

    When DEBUG is true the page is read from the local file ``top.html``;
    otherwise it is downloaded from http://www.imdb.com/chart/top.

    @return The first line that contains
            "Top 250 movies as voted by our users", or None when no line
            of the page contains it.
    """
    if DEBUG:
        f = open("top.html")
    else:
        import urllib

        f = urllib.urlopen("http://www.imdb.com/chart/top")

    # Release the file or connection even if the read fails part-way.
    try:
        data = f.read()
    finally:
        f.close()

    for line in data.splitlines():
        if "Top 250 movies as voted by our users" in line:
            return line

    # Marker text not present anywhere in the page.
    return None

line = get_data_line()
if line is None:
    # get_data_line() yields None when the marker text is absent; stop with
    # a readable message instead of failing on None.split() below.
    raise SystemExit("Top 250 chart not found in the page data")

# Split the line on row terminators and drop the piece before the first
# </tr>; each remaining piece is tried against title_re.
items = line.split("</tr>")[1:]
# Group 1 is the link text (the title), group 2 the parenthesised year.
title_re = re.compile(r'.+<a href=".+">(.+)</a> \((\d+)\)</font>')

# Maps a year to the list of titles for that year, in page order.
years = {}
this_year = time.localtime().tm_year
earliest_year = this_year

for item in items:
    m = title_re.match(item)
    if not m:
        continue  # piece does not look like a title row

    year = int(m.group(2))
    earliest_year = min(earliest_year, year)

    title = unicode(unescape(m.group(1)))
    years.setdefault(year, []).append(title)

# Walk from the current year back to the earliest year seen, printing only
# the years that actually have titles.
years_range = range(this_year, earliest_year - 1, -1)
for year in years_range:
    if year in years:
        print("%d (%d):\n%s" % (year, len(years[year]), ", ".join(years[year])))

# Shared with Droplr