| Uutiset | Koodikirjasto | Wiki | Keskustelut | FAQ | Info |
URL info — raspi, 20.10.07 22:36 — Hakee tietoa annetusta nettisivusta
#!/usr/bin/python
# -*- coding: ISO-8859-15 -*-
# URL info
# Pekka Järvinen 2007
#
# Fetches a URL and prints basic facts about the resource it points to.
#
# The response's Content-Type selects a handler class from ``handlers``:
#   * HTML pages are parsed with lxml (title and element counts); pages on
#     known domains additionally get site-specific details (IMDb titles).
#   * Images are opened with PIL (dimensions and format).
#
# The code runs on both Python 2 and Python 3.

import re
import sys

try:  # Python 2 module names
    import httplib
    import urlparse
except ImportError:  # Python 3 module names
    import http.client as httplib
    import urllib.parse as urlparse

try:
    from io import BytesIO
except ImportError:  # Python < 2.6 has no io module
    from StringIO import StringIO as BytesIO

# lxml is only needed by the HTML handlers, PIL only by ImageHandler. The
# imports are guarded so a missing library only breaks the handler that
# actually needs it; that handler raises ImportError when it is used.
try:
    from lxml import etree
except ImportError:
    etree = None

try:  # Pillow layout
    from PIL import Image
    from PIL import ImageFile
except ImportError:
    try:  # classic PIL layout
        import Image
        import ImageFile
    except ImportError:
        Image = None
        ImageFile = None

# Upper bound passed to ``response.read()`` for a single response body.
MAX_BODY_BYTES = 1024 * 1024 * 1024

# URLs inspected when the script is run directly.
DEFAULT_URLS = (
    'http://mureakuha.com/',
    'http://mureakuha.com/img/skeletonkuha_small.png',
    'http://finnish.imdb.com/title/tt0060666/',
)


def _parse_html(data):
    """Parse raw HTML bytes with lxml and return the element tree.

    Raises ImportError when lxml is not installed.
    """
    if etree is None:
        raise ImportError("lxml is required to inspect HTML documents")
    return etree.parse(BytesIO(data), etree.HTMLParser())


################################################################################
class MimeHandler(object):
    """Base class for handlers that describe one fetched resource.

    domain  -- network location the resource was fetched from
    query   -- request target (path, params and query string)
    headers -- response headers as returned by the HTTP library
    data    -- raw response body
    """

    def __init__(self, domain, query, headers, data):
        self.headers = headers
        self.data = data
        self.domain = domain
        self.query = query
        # Per-instance state; a class-level list would be shared by every
        # handler that did not run this constructor.
        self.pageInfo = []
        self.settings = []

    def getData(self):
        """Return the raw response body."""
        return self.data

    def getHeaders(self):
        """Return the response headers."""
        return self.headers

    def loadSettings(self):
        """Hook for subclasses that need configuration; no-op here."""
        pass


################################################################################
class HTMLHandler(MimeHandler):
    """Describes an HTML page. Uses lxml."""

    # (label, XPath) pairs reported as element counts, in output order.
    # A count of zero is not reported.
    COUNTED_ELEMENTS = (
        ("Link (a) count", "//a"),
        ("Link (link) count", "//link"),
        ("RSS count",
         "//link[@rel='alternate'][@type='application/rss+xml']"),
        ("Javascript block count", "//script[@type='text/javascript']"),
        ("Image count (non-css)", "//img"),
        ("Flash count", "//object[@type='application/x-shockwave-flash']"),
    )

    def loadSettings(self):
        """Use the module-level domain table ``HTMLHandlerSettings``."""
        self.settings = HTMLHandlerSettings
        return None

    def getInfo(self):
        """Append facts about the page to ``self.pageInfo`` and return it.

        Each fact is a single-entry dict. Domain-specific details (if the
        domain is listed in the settings) come first under the key
        "Additional info", followed by the title and the element counts.
        An empty body yields no facts.
        """
        if not self.data:
            return self.pageInfo

        for cfg in self.settings:
            if self.domain == cfg["domain"]:
                detailedInfo = cfg["handler"]()
                detailedInfo.loadSettings()
                detailedInfo.setData(self.data)
                detailedInfo.setPath(self.query)
                self.pageInfo.append(
                    {"Additional info": detailedInfo.getDetailedInfo()})

        tree = _parse_html(self.data)

        titles = tree.xpath("//title")
        if titles:
            self.pageInfo.append({"Title": titles[0].text})

        for label, xpath in self.COUNTED_ELEMENTS:
            count = len(tree.xpath(xpath))
            if count:
                self.pageInfo.append({label: count})

        return self.pageInfo


################################################################################
class IMDBHandler(HTMLHandler):
    """Dispatches an IMDb page to a path-specific detail handler."""

    def __init__(self):
        # Created without arguments by HTMLHandler.getInfo; the page body and
        # path are supplied afterwards through setData() / setPath().
        self.data = None
        self.path = None
        self.settings = []
        self.pageInfo = []

    def loadSettings(self):
        """Use the module-level path table ``IMDBHandlerSettings``."""
        self.settings = IMDBHandlerSettings
        return None

    def setData(self, value):
        """Set the raw page body."""
        self.data = value

    def setPath(self, value):
        """Set the request path used to pick a detail handler."""
        self.path = value

    def getDetailedInfo(self):
        """Return the facts of the first handler whose pattern matches.

        Patterns are matched against the start of the path. Returns an empty
        list when no pattern matches.
        """
        path = self.path or ""
        for cfg in self.settings:
            if re.match(cfg["path"], path):
                detailedInfo = cfg["handler"]()
                detailedInfo.setData(self.data)
                return detailedInfo.getDetailedInfo()
        return []


################################################################################
class IMDBTitleHandler(IMDBHandler):
    """Extracts year, director and leading cast from an IMDb title page."""

    # Maximum number of cast members reported.
    CAST_LIMIT = 3

    def __init__(self):
        IMDBHandler.__init__(self)
        self.additionalInfo = []

    def setData(self, value):
        """Set the raw page body."""
        self.data = value

    def getDetailedInfo(self):
        """Append the title's facts to ``self.additionalInfo`` and return it.

        Facts are single-entry dicts with the keys "Year", "Director" and
        "Cast"; a fact is omitted when the page has no matching element.
        """
        tree = _parse_html(self.data)

        # Year
        years = tree.xpath("//h1/span/a")
        if years:
            self.additionalInfo.append({"Year": years[0].text})

        # Director: the link is a sibling of the "Director:" heading, so it
        # is looked up relative to the heading's parent element.
        for heading in tree.xpath("//div[@class='info']/h5"):
            if heading.text == "Director:":
                links = heading.xpath("../a")
                if links:
                    self.additionalInfo.append({"Director": links[0].text})
                    break

        # Cast: report up to CAST_LIMIT names, including for pages that list
        # fewer names than the limit.
        cast_links = tree.xpath("//td[@class='nm']/a")
        cast = [link.text for link in cast_links[:self.CAST_LIMIT]]
        if cast:
            self.additionalInfo.append({"Cast": cast})

        return self.additionalInfo


################################################################################
# TODO
class IMDBNameHandler(IMDBHandler):
    """Placeholder for IMDb person pages; not registered in the settings."""

    def __init__(self):
        IMDBHandler.__init__(self)

    def setData(self, value):
        """Set the raw page body."""
        self.data = value

    def getDetailedInfo(self):
        """Return no facts (not implemented yet)."""
        return []


################################################################################
class ImageHandler(MimeHandler):
    """Describes an image. Uses PIL."""

    def loadSettings(self):
        """No configuration is needed for images."""
        pass

    def getInfo(self):
        """Append "<width>x<height> <format>" to ``self.pageInfo``.

        Raises ImportError when PIL is not installed.
        """
        if Image is None:
            raise ImportError("PIL is required to inspect images")
        im = Image.open(BytesIO(self.data))
        width, height = im.size
        self.pageInfo.append(
            {"Image": "%sx%s %s" % (width, height, im.format)})
        return self.pageInfo


################################################################################
# MIME type -> handler class.
handlers = [
    {"handler": HTMLHandler, "mime": "text/html"},
    {"handler": ImageHandler, "mime": "image/jpeg"},
    {"handler": ImageHandler, "mime": "image/pjpeg"},
    {"handler": ImageHandler, "mime": "image/jpg"},
    {"handler": ImageHandler, "mime": "image/png"},
    {"handler": ImageHandler, "mime": "image/gif"},
    {"handler": ImageHandler, "mime": "image/tiff"},
]

# Domain -> handler class providing extra details for HTML pages.
HTMLHandlerSettings = [
    {"domain": "www.imdb.com", "handler": IMDBHandler},
    {"domain": "imdb.com", "handler": IMDBHandler},
    {"domain": "finnish.imdb.com", "handler": IMDBHandler},
]

# Path pattern (matched at the start of the path) -> IMDb detail handler.
IMDBHandlerSettings = [
    {"path": r"/title/tt\d+", "handler": IMDBTitleHandler},
    # TODO: {"path": r"/name/nm\d+", "handler": IMDBNameHandler},
]


def find_handler(content_type):
    """Return the handler class for a Content-Type header value, or None.

    Parameters after ";" (such as the charset) are ignored and the MIME type
    is compared case-insensitively. A missing header yields None.
    """
    if not content_type:
        return None
    mime = content_type.split(";")[0].strip().lower()
    for entry in handlers:
        if entry["mime"] == mime:
            return entry["handler"]
    return None


def request_target(url):
    """Build the HTTP request target from a parsed URL.

    An empty path becomes "/" so the request line is valid, and the fragment
    is left out because it is never sent to the server.
    """
    return urlparse.urlunparse(
        ("", "", url.path or "/", url.params, url.query, ""))


def fetch(addr):
    """GET ``addr`` and return (netloc, target, headers, content_type, data).

    ``content_type`` is the raw Content-Type header value, or None when the
    response has none. The connection is closed even if the request fails.
    """
    url = urlparse.urlparse(addr)
    target = request_target(url)
    if url.scheme == "https":
        connection_class = httplib.HTTPSConnection
    else:
        connection_class = httplib.HTTPConnection

    conn = connection_class(url.netloc)
    try:
        conn.request("GET", target)
        response = conn.getresponse()
        headers = response.getheaders()
        content_type = response.getheader('content-type')
        data = response.read(MAX_BODY_BYTES)
    finally:
        conn.close()
    return url.netloc, target, headers, content_type, data


def main(urls=DEFAULT_URLS):
    """Fetch each URL and print the facts reported by its handler.

    URLs whose content type has no registered handler print nothing.
    """
    for addr in urls:
        domain, target, headers, content_type, data = fetch(addr)
        handler_class = find_handler(content_type)
        if handler_class is None:
            continue
        client = handler_class(domain, target, headers, data)
        client.loadSettings()
        print(client.getInfo())


if __name__ == '__main__':
    main()

# Example output recorded by the author:
#
# [{'Title': 'mureakuha 3.0 '}, {'Link (a) count': 41},
#  {'Link (link) count': 6}, {'RSS count': 3},
#  {'Javascript block count': 4}, {'Image count (non-css)': 22}]
# [{'Image': '125x79 PNG'}]
# [{'Additional info': [{'Year': '1966'}, {'Director': 'Harold P. Warren'},
#   {'Cast': ['Tom Neyman', 'John Reynolds', 'Diane Mahree']}]},
#  {'Title': 'Manos: The Hands of Fate (1966)'}, {'Link (a) count': 240},
#  {'Link (link) count': 3}, {'Javascript block count': 2},
#  {'Image count (non-css)': 33}]

# Reader comments from the original page (translated from Finnish):
#   ane, 13:56 7.11.07: Handy. Presumably meant for some web software?
#       E.g. after links, like slashdot's "[tööt.org]".
#   raspi, 10:19 8.11.07: For an IRC bot :P
![]() Haku
|