RSS link checker
Rev. | e88a86ffb402b632adfa18eb11f4ff63f22a76ea |
---|---|
Size | 2,615 bytes |
Date | 2014-10-22 16:32:31 |
Author | hylom |
Log message | add scripts |
#!/usr/bin/python
# -*- coding: utf-8 -*-
'link checker for sourceforge.jp/magazine'
import urllib2
import argparse
import xml.etree.ElementTree as ET
import os.path
import cPickle as pickle
import httplib
from urlparse import urlparse
import mailtohirom
RSS_URL = 'http://sourceforge.jp/magazine/rss.noad'
CACHE_FILE = '~/.linkchecker.cache'
CACHE_SIZE = 100
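# The cache is a dict mapping URL -> bool; True means the URL returned
# 200 on the last run and can be skipped. Note that CACHE_SIZE is defined
# but never enforced; the cache is rebuilt from scratch on every run.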
RSS_NAMESPACES = {
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
}
def main(args):
    # get RSS
    rss = retrive_rss()
    ET.register_namespace('rdf', RSS_NAMESPACES['rdf'])
    root = ET.fromstring(rss)
    items = root.findall('{http://purl.org/rss/1.0/}item')
    urls = [x.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about') for x in items]
    error = check_urls(urls, args)
    if error:
        # send mail to hirom
        subj = 'SourceForge.JP Magazine alert'
        body = 'Error occurred: \n\n'
        for url in error:
            body = body + url + ' - ' + error[url] + '\n'
        mailtohirom.send(subj, body)
def read_cache():
    realpath = os.path.expanduser(CACHE_FILE)
    if os.path.isfile(realpath):
        # read cache (binary mode for pickle)
        f = open(realpath, 'rb')
        cache = pickle.load(f)
        f.close()
        return cache
    return {}
def write_cache(cache):
    realpath = os.path.expanduser(CACHE_FILE)
    f = open(realpath, 'wb')
    pickle.dump(cache, f)
    f.close()
def check_url(url):
    # issue an HTTP HEAD request and return the response object
    # (plain HTTP only; https URLs would need httplib.HTTPSConnection)
    parsed = urlparse(url)
    conn = httplib.HTTPConnection(parsed.hostname)
    conn.request('HEAD', parsed.path or '/')
    res = conn.getresponse()
    return res
def check_urls(urls, args=None):
    # HEAD-check every URL, consulting and rebuilding the cache as we go
    cache = read_cache()
    new_cache = {}
    mesg = ''
    error_flag = False
    error_url = {}
    for url in urls:
        if url in cache and cache[url]:
            # URL was fine on the last run; skip the network round trip
            mesg = 'ok (cached)'
            new_cache[url] = True
        else:
            res = check_url(url)
            if res.status == 200:
                mesg = 'ok'
                new_cache[url] = True
            else:
                mesg = str(res.status)
                new_cache[url] = False
                error_flag = True
                error_url[url] = str(res.status) + ': ' + res.reason
        if args and args.verbose:
            print url + ' : ' + mesg
    write_cache(new_cache)
    if error_flag:
        return error_url
    else:
        return None
def retrive_rss():
    # fetch the raw RSS feed body
    url = urllib2.urlopen(RSS_URL)
    return url.read()
# call main function
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='check links in RSS')
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args()
    main(args)
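The script depends on a local `mailtohirom` module that is not part of this revision; only its `send(subj, body)` entry point is visible above. Below is a minimal hypothetical sketch of such a module, assuming a local SMTP server and placeholder addresses (the real host and recipients are not shown anywhere in this file):

# mailtohirom.py -- hypothetical sketch; the real module is not in this revision.
import smtplib
from email.mime.text import MIMEText

SMTP_HOST = 'localhost'               # assumption: a local MTA is listening
MAIL_FROM = 'linkchecker@example.jp'  # placeholder address
MAIL_TO = 'hirom@example.jp'          # placeholder address

def send(subj, body):
    # build a plain-text message and hand it to the local SMTP server
    msg = MIMEText(body, 'plain', 'utf-8')
    msg['Subject'] = subj
    msg['From'] = MAIL_FROM
    msg['To'] = MAIL_TO
    s = smtplib.SMTP(SMTP_HOST)
    s.sendmail(MAIL_FROM, [MAIL_TO], msg.as_string())
    s.quit()

Assuming the script is saved as linkchecker.py (the filename is not shown on this page), a scheduled run is just `python linkchecker.py`; adding `-v` prints a `URL : status` line for each feed item as it is checked, with `ok (cached)` for URLs skipped via the cache.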