URL info

raspi 20.10.07 22:36

Hakee tietoa annetusta nettisivusta

 Tekstiversio  Arvo: 5 (7 ääntä)  Äänestä: +  -
#!/usr/bin/python
# -*- coding: ISO-8859-15 -*-
# URL info
# Pekka Järvinen 2007
# Hakee tietoa annetusta nettisivusta


# Python
import urlparse
import sys
import re
import httplib
import StringIO

# lxml
from lxml import etree

# PIL
import Image
import ImageFile

################################################################################
class MimeHandler:
  """
    Base class for handlers that are chosen by the MIME type of a response.

    Stores the request context (domain, query, response headers and body)
    and the list of findings that a concrete handler fills in.
  """

  # Class-level fallback only; __init__ gives every instance its own list.
  pageInfo = []

  def __init__(self, domain, query, headers, data):
    """Remember the request context and start with an empty findings list."""
    self.domain, self.query = domain, query
    self.headers, self.data = headers, data
    self.pageInfo = []

  def getHeaders(self):
    """Return the response headers given to the constructor."""
    return self.headers

  def getData(self):
    """Return the response body given to the constructor."""
    return self.data

  def loadSettings(self):
    """Configuration hook for subclasses; the base class has nothing to load."""
    return None

################################################################################
class HTMLHandler(MimeHandler):
  """
    Handler for HTML responses. Uses lxml.

    getInfo() reports site-specific details (when the domain has a
    registered handler), the page title and a set of element counts.
  """

  # (label, XPath) pairs reported as element counts, in output order.
  _countQueries = (
    ("Link (a) count", "//a"),
    ("Link (link) count", "//link"),
    ("RSS count", "//link[@rel='alternate'][@type='application/rss+xml']"),
    ("Javascript block count", "//script[@type='text/javascript']"),
    ("Image count (non-css)", "//img"),
    ("Flash count", "//object[@type='application/x-shockwave-flash']"),
  )

  def loadSettings(self):
    """Attach the module-level domain -> handler table to this instance."""
    self.settings = HTMLHandlerSettings
    return None

  def getInfo(self):
    """
      Collect information about the page and return it.

      Returns self.pageInfo, a list of single-entry dicts. An
      "Additional info" entry is added for every settings row whose domain
      equals self.domain; then "Title" and the element counts follow.
      Counts of zero are omitted.
    """
    # Site-specific handlers run first, so their output leads the list.
    for cfg in self.settings:
      if self.domain == cfg["domain"]:
        detailedInfo = cfg["handler"]()
        detailedInfo.loadSettings()
        detailedInfo.setData(self.data)
        detailedInfo.setPath(self.query)
        self.pageInfo.append({"Additional info": detailedInfo.getDetailedInfo()})

    parser = etree.HTMLParser()
    tree = etree.parse(StringIO.StringIO(self.data), parser)

    # Only the first <title> element is reported.
    titles = tree.xpath("//title")
    if titles:
      self.pageInfo.append({"Title": titles[0].text})

    for label, query in self._countQueries:
      found = len(tree.xpath(query))
      if found > 0:
        self.pageInfo.append({label: found})

    return self.pageInfo
################################################################################
class IMDBHandler(HTMLHandler):
  """
    Site handler for IMDB pages: picks a page-type handler by URL path.
  """

  def __init__(self):
    # The base constructor is not called; the page body and path are
    # supplied afterwards through setData() and setPath().
    pass

  def loadSettings(self):
    """Attach the module-level path pattern -> handler table."""
    self.settings = IMDBHandlerSettings
    return None

  def setPath(self, value):
    """Store the request path that the settings patterns are matched against."""
    self.path = value

  def setData(self, value):
    """Store the page body that is handed on to the page-type handler."""
    self.data = value

  def getDetailedInfo(self):
    """
      Delegate to the first handler whose pattern matches the path.

      Patterns are matched from the start of self.path. Returns that
      handler's getDetailedInfo() result, or an empty list when no
      pattern matches.
    """
    for entry in self.settings:
      if re.match(entry["path"], self.path) is None:
        continue
      pageHandler = entry["handler"]()
      pageHandler.setData(self.data)
      return pageHandler.getDetailedInfo()
    return []
################################################################################
class IMDBTitleHandler(IMDBHandler):
  """
    Extracts year, director and leading cast from an IMDB title page.
  """

  def __init__(self):
    self.additionalInfo = []

  def setData(self, value):
    """Store the page body to be parsed by getDetailedInfo()."""
    self.data = value

  def getDetailedInfo(self, cast_limit=3):
    """
      Parse the page body and return the collected details.

      cast_limit is the maximum number of cast names reported.

      Returns self.additionalInfo, a list of single-entry dicts with the
      keys "Year", "Director" and "Cast"; an entry is left out when the
      corresponding markup is not found.
    """
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO.StringIO(self.data), parser)

    # Year: text of the first link inside the heading's <span>.
    yearLinks = tree.xpath("//h1/span/a")
    if yearLinks:
      self.additionalInfo.append({"Year": yearLinks[0].text})

    # Director: the name is the first <a> that shares a parent with the
    # "Director:" heading, so it is addressed relative to that heading.
    for heading in tree.xpath("//div[@class='info']/h5"):
      if heading.text == "Director:":
        directorLinks = heading.xpath("../a")
        if directorLinks:
          self.additionalInfo.append({"Director": directorLinks[0].text})
        break

    # Cast: report up to cast_limit names, including the case where the
    # page lists cast_limit names or fewer.
    castLinks = tree.xpath("//td[@class='nm']/a")
    cast = [link.text for link in castLinks[:cast_limit]]
    if cast:
      self.additionalInfo.append({"Cast": cast})

    return self.additionalInfo
################################################################################
# TODO
class IMDBNameHandler(IMDBHandler):
  """
    Placeholder handler (marked TODO); it extracts nothing yet.
  """

  def __init__(self):
    """Nothing to initialise."""

  def setData(self, value):
    """Store the page body."""
    self.data = value

  def getDetailedInfo(self):
    """Not implemented yet; returns None."""
    return None
################################################################################
class ImageHandler(MimeHandler):
  """
    Handler for image responses. Uses PIL.
  """

  def loadSettings(self):
    """Images need no configuration."""
    pass

  def getInfo(self):
    """
      Report the image dimensions and format.

      Appends {"Image": "<width>x<height> <format>"} to self.pageInfo and
      returns self.pageInfo.
    """
    im = Image.open(StringIO.StringIO(self.data))
    width, height = im.size
    self.pageInfo.append({"Image": "%sx%s %s" % (width, height, im.format)})
    return self.pageInfo
################################################################################

# MIME type -> handler class used for responses of that type.
handlers = [
  {"handler": HTMLHandler, "mime": "text/html"},
  {"handler": ImageHandler, "mime": "image/jpeg"},
  {"handler": ImageHandler, "mime": "image/pjpeg"},
  {"handler": ImageHandler, "mime": "image/jpg"},
  {"handler": ImageHandler, "mime": "image/png"},
  {"handler": ImageHandler, "mime": "image/gif"},
  {"handler": ImageHandler, "mime": "image/tiff"},
]

# Domain -> site-specific handler consulted by HTMLHandler.getInfo().
HTMLHandlerSettings = [
  {"domain": "www.imdb.com", "handler": IMDBHandler},
  {"domain": "imdb.com", "handler": IMDBHandler},
  {"domain": "finnish.imdb.com", "handler": IMDBHandler},
]

# Path pattern -> page-type handler consulted by IMDBHandler.getDetailedInfo().
# Patterns are matched from the start of the request path. "\d+" requires
# one or more digits after the "tt" prefix; the earlier "[\d+]" was a
# character class that matched one digit or a literal "+".
IMDBHandlerSettings = [
  {"path": r"/title/tt\d+", "handler": IMDBTitleHandler},
]
# TODO
#IMDBHandlerSettings.append({"path": r"/name/nm\d+", "handler": IMDBNameHandler})

if __name__ == '__main__':
  # Demo: fetch each address and print what the matching handler finds.
  urls = ['http://mureakuha.com/', 'http://mureakuha.com/img/skeletonkuha_small.png', 'http://finnish.imdb.com/title/tt0060666/']

  for addr in urls:
    url = urlparse.urlparse(addr)
    # Request target = everything after the host. An address without a
    # path would give an empty target, so fall back to "/".
    query = urlparse.urlunparse(['', '', url.path or '/', url.params, url.query, url.fragment])

    conn = httplib.HTTPConnection(url.netloc)
    try:
      conn.request("GET", query)
      response = conn.getresponse()
      headers = response.getheaders()
      contenttype_raw = response.getheader('content-type')
      # Read at most 1 GiB of the body.
      data = response.read(1024*1024*1024)
    finally:
      # Close the connection even when the request or read fails.
      conn.close()

    # Drop parameters such as "; charset=..." and normalise the media type.
    # A response without a Content-Type header simply matches no handler.
    contenttype = (contenttype_raw or "").split(";")[0].strip().lower()

    for handler in handlers:
      if contenttype == handler["mime"]:
        client = handler["handler"](url.netloc, query, headers, data)
        client.loadSettings()
        # Single-argument parenthesised print produces the same output as
        # the print statement.
        print(client.getInfo())
        print("")

"""
Tulostaa:
[{'Title': 'mureakuha 3.0  '}, {'Link (a) count': 41}, {'Link (link) count': 6},
 {'RSS count': 3}, {'Javascript block count': 4}, {'Image count (non-css)': 22}]

[{'Image': '125x79 PNG'}]

[{'Additional info': [{'Year': '1966'}, {'Director': 'Harold P. Warren'}, {'Cast
': ['Tom Neyman', 'John Reynolds', 'Diane Mahree']}]}, {'Title': 'Manos: The Han
ds of Fate (1966)'}, {'Link (a) count': 240}, {'Link (link) count': 3}, {'Javasc
ript block count': 2}, {'Image count (non-css)': 33}]
"""
 

ane 13:56 7.11.07 
Näppärä. Varmaan johonki web-softaan tarkotettu? Esim. slashdotin "[tööt.org]" linkkien jälkeen.
raspi 10:19 8.11.07 
IRC-bottiin :P