RSS link checker


File Information

Rev. e88a86ffb402b632adfa18eb11f4ff63f22a76ea
Size 2,615 bytes
Date 2014-10-22 16:32:31
Author hylom
Log Message

add scripts

Contents

#!/usr/bin/python
# -*- coding: utf-8 -*-

'link checker for sourceforge.jp/magazine'

import urllib2
import argparse
import xml.etree.ElementTree as ET
import os.path
import cPickle as pickle
import httplib
from urlparse import urlparse 
import sys

import mailtohirom

RSS_URL = 'http://sourceforge.jp/magazine/rss.noad'
CACHE_FILE = '~/.linkchecker.cache'
CACHE_SIZE = 100
RSS_NAMESPACES = {
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
    }

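# Fetch the feed, pull each item's URL, and mail an alert for any link
# that fails the check (mailtohirom is a local helper module).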
def main(args):
    # get RSS
    rss = retrieve_rss()
    root = ET.fromstring(rss)
    # RSS 1.0 items expose the article URL in their rdf:about attribute
    items = root.findall('{http://purl.org/rss/1.0/}item')
    urls = [x.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about') for x in items]
    error = check_urls(urls, args)
    if error:
        # send mail to hirom
        subj = 'SourceForge.JP Magazine alert'
        body = 'Error occurred: \n\n'
        for url in error:
            body = body + url + ' - ' + error[url] + '\n'
        mailtohirom.send(subj, body)

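# The cache is a pickled dict mapping URL -> bool; True marks links that
# were reachable on a previous run, so they are not re-checked.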
def read_cache():
    realpath = os.path.expanduser(CACHE_FILE)
    if os.path.isfile(realpath):
        # read cache (binary mode for pickle)
        f = open(realpath, 'rb')
        cache = pickle.load(f)
        f.close()
        return cache
    return {}

def write_cache(cache):
    realpath = os.path.expanduser(CACHE_FILE)
    f = open(realpath, 'wb')
    pickle.dump(cache, f)
    f.close()

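# Validate a single link with a HEAD request so no body is downloaded.
# Note: this assumes a plain-HTTP URL on the default port.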
def check_url(url):
    parsed = urlparse(url)
    conn = httplib.HTTPConnection(parsed.hostname)
    # fall back to '/' for URLs with an empty path
    conn.request('HEAD', parsed.path or '/')
    res = conn.getresponse()
    return res

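# Check every URL, consulting the cache first. Returns a dict of
# URL -> 'status: reason' for the failures, or None if all links are ok.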
def check_urls(urls, args=None):
    cache = read_cache()
    new_cache = {}
    mesg = ''
    error_flag = False
    error_url = {}
    for url in urls:
        if url in cache and cache[url]:
            mesg = 'ok (cached)'
            new_cache[url] = True
        else:
            res = check_url(url)
            if res.status == 200:
                mesg = 'ok'
                new_cache[url] = True
            else:
                mesg = str(res.status)
                new_cache[url] = False
                error_flag = True
                error_url[url] = str(res.status) + ': ' + res.reason
        if args is not None and args.verbose:
            print url + ' : ' + mesg
    write_cache(new_cache)
    if error_flag:
        return error_url
    else:
        return None

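# Download the raw RSS XML for the magazine feed.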
def retrieve_rss():
    url = urllib2.urlopen(RSS_URL)
    return url.read()

# call main function
if __name__ == '__main__':
    parser = argparse.ArgumentParser('check links in RSS')
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args()
    main(args)
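
# Usage (assuming the script is saved as, e.g., linkchecker.py):
#   python linkchecker.py -v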