path: root/planet/discovery.py
blob: 5beeeb9b2a782162e76c82779fedf282214ec294
#!/usr/bin/env python
"""PostgreSQL Planet Aggregator

This file contains the functions to suck down RSS/Atom feeds
(using feedparser), determine the actual blog URL (for the
HTML posts), and update the database with them.

Copyright (C) 2008 PostgreSQL Global Development Group
"""

import psycopg2
import feedparser
import datetime
import socket
import ConfigParser
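
# The queries below assume a feeds table shaped roughly like this (a
# sketch for orientation only, not the project's actual DDL):
#
#   CREATE TABLE planet.feeds (
#       id      serial PRIMARY KEY,
#       feedurl text NOT NULL,
#       name    text NOT NULL,
#       blogurl text NOT NULL DEFAULT ''
#   )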

class Aggregator:
	def __init__(self, db):
		self.db = db
		self.stored = 0
		# Don't let a slow or unreachable feed host hang the whole run
		socket.setdefaulttimeout(20)

	def Update(self):
		# Only process feeds whose blog URL hasn't been discovered yet,
		# skipping feeds that are themselves planet aggregators
		feeds = self.db.cursor()
		feeds.execute("SELECT id,feedurl,name,blogurl FROM planet.feeds WHERE blogurl='' AND feedurl NOT LIKE '%planet%'")
		for feed in feeds.fetchall():
			self.DiscoverFeed(feed)
		self.db.commit()

	def DiscoverFeed(self, feedinfo):
		# feedinfo is an (id, feedurl, name, blogurl) row from Update()
		feed = feedparser.parse(feedinfo[1])

		# feedparser leaves 'status' unset when the fetch itself fails,
		# so don't assume the attribute exists
		if getattr(feed, 'status', None) != 200:
			# not ok!
			print "Feed %s status %s" % (feedinfo[1], getattr(feed, 'status', 'unknown'))
			return

		try:
			if feed.feed.link:
				print "Setting feed for %s to %s" % (feedinfo[2], feed.feed.link)
				c = self.db.cursor()
				# Parameterized query, so a URL containing a quote can't
				# break (or inject into) the statement
				c.execute("UPDATE planet.feeds SET blogurl=%s WHERE id=%s",
						(feed.feed.link, feedinfo[0]))
		except Exception:
			print "Exception when processing feed for %s" % (feedinfo[2])
			print feed

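# A minimal planet.ini sketch; the [planet] section and its 'db' option are
# what the block below reads. The DSN value is a placeholder, assuming a
# standard psycopg2 connection string:
#
#   [planet]
#   db=dbname=planet user=planet host=localhost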
if __name__=="__main__":
	# Read the database connection string from planet.ini and run one pass
	c = ConfigParser.ConfigParser()
	c.read('planet.ini')
	Aggregator(psycopg2.connect(c.get('planet','db'))).Update()