news自動集積ツール
リビジョン | f850a28ae86e3bb1209f361b7297e63c5f9c5d03 (tree) |
---|---|
日時 | 2013-05-28 04:27:59 |
作者 | hylom <hylom@user...> |
コミッター | hylom |
initial commit
@@ -0,0 +1,35 @@ | ||
1 | += README = | |
2 | + | |
3 | +== gnewsとは? == | |
4 | + ニュース記事を収集・分類するニュースアグリゲーションサイトを構築するためのツールです。静的にHTMLを作成するのが特徴です。Planetにインスパイアされ、Google Newsクローンを目指しています。 | |
5 | + | |
6 | +== 必要なライブラリ == | |
7 | + 利用には下記のPythonモジュールおよびGNU makeが必要です。 | |
8 | + | |
 * mako (python-mako、Mako template library for Python)
10 | + * feedparser (python-feedparser、feed library for Python) http://code.google.com/p/feedparser | |
11 | + * python-dateutil | |
12 | + | |
13 | +== 設定ファイルの作成 == | |
14 | + gnewsでサイトを生成するために、下記の設定ファイルが必要です。 | |
15 | + | |
16 | +=== config.py === | |
17 | + HTMLの生成先やサイト名などを設定するファイルです。PythonのDictionaryおよびArray形式で記述されています。 | |
18 | + config.py.sampleをコピーしてconfig.pyを作成し、編集します。 | |
19 | + | |
20 | +=== sources.ini === | |
21 | + RSSの取得先を設定するファイルです。ini形式で記述されています。セクション名が表示されるサイト名、urlパラメータがサイトのURL、sourcesパラメータがRSSの取得先、filtersが使用するフィルタ一覧(カンマ区切り)となります。 | |
22 | + | |
23 | +=== install.conf === | |
24 | + 関連ファイルのコピー先を指定するファイルです。 | |
 install.conf.sampleをコピーしてinstall.confを作成し、編集します。
26 | + | |
27 | +=== keywords.txt === | |
28 | + 記事の分類を行うためのキーワードを指定するファイルです。 | |
29 | + keywords.txt.sampleをコピーしてkeywords.txtを作成し、編集します。 | |
30 | + | |
31 | +== 関連ファイルのインストール == | |
32 | + 上記の設定ファイルを用意したうえで、make installを実行します。 | |
33 | + | |
== サイトの作成・更新 ==
35 | + makeを実行します。 |
@@ -0,0 +1,42 @@ | ||
1 | +# config.py | |
2 | +# -*- coding: utf-8 -*- | |
3 | +# configuration for gnews.py | |
4 | + | |
# Global gnews configuration dictionary, read by gnews.py and logger.py.
config = {
    # Template/output/filter/plugin locations, relative to the working dir.
    'template_directory': 'templates',
    'output_directory': 'outputs',
    'filter_directory': 'filters',
    'plugin_directory': 'plugins',
    # Number of entries per generated page.
    'pagination_unit': 20,
    # 0 = quiet; any value > 0 enables log() output.
    'log_level': 0,
    # Per-page-type template and output settings (keyed by page type).
    'index': {
        'template': 'index.tmpl.html',
        'output_directory': 'outputs/',
    },
    'tags': {
        'template': 'index.tmpl.html',
        'output_directory': 'outputs/tag',
    },
    # Values exposed to templates as the "site" object.
    'site_parameter': {
        'name': 'SourceForge.JP Antenna',
        'css_directory': '/css',
        'img_directory': '/img',
        'js_directory': '/js',
        'tag_directory': '/tag',
        'root': '/',
    },
    # Filters applied to every feed before the per-feed filters.
    'pre_filters': [
        'pr_block',
        'img_block',
    ],
    # Filters applied after the per-feed filters.
    'post_filters': [
        'cleanup',
        'trimming',
        'remove_tracker'
    ],
    # Plugin modules loaded from plugin_directory.
    'plugins': [
        'indexing'
#        'hatebu_counter'
    ]
}
42 | + |
@@ -0,0 +1,27 @@ | ||
1 | +# configloader.py | |
2 | +# -*- config: utf-8 -*- | |
3 | + | |
4 | +import ConfigParser | |
5 | + | |
6 | +CONFIG_FILE = 'sources.ini' | |
7 | + | |
def load(config_file=CONFIG_FILE):
    """Parse the sources .ini file and return a list of source dicts.

    Each section becomes a dict with keys:
      name    -- section name (display name of the site)
      source  -- RSS feed URL ('source' option)
      url     -- site URL ('url' option)
      filters -- optional list of filter names (comma-separated 'filters' option)
    """
    config = ConfigParser.SafeConfigParser()
    # 'with' guarantees the file is closed even if parsing raises,
    # which the original open/readfp/close sequence did not.
    with open(config_file, 'r') as fp:
        config.readfp(fp)
    sources = []
    for section in config.sections():
        source = {
            "name": section,
            "source": config.get(section, 'source'),
            "url": config.get(section, 'url'),
        }
        if config.has_option(section, 'filters'):
            raw = config.get(section, 'filters').split(',')
            source["filters"] = [x.strip() for x in raw]
        sources.append(source)
    return sources
26 | + | |
27 | + |
@@ -0,0 +1,123 @@ | ||
1 | +#-*- coding: utf-8 -*- | |
2 | +'fetcher.py - RSS fetcher' | |
3 | + | |
4 | +import re | |
5 | + | |
6 | +import feedparser | |
7 | +import dateutil.parser | |
8 | +#from config import config as config | |
9 | +from logger import log | |
10 | + | |
11 | +def _get_attr(obj, attr, default=""): | |
12 | + try: | |
13 | + return obj.__getattr__(attr) | |
14 | + except AttributeError: | |
15 | + return default | |
16 | + | |
class FeedFetcher(object):
    """Fetches one RSS/Atom feed and returns filtered entry dicts.

    ``feed`` is one source dict from configloader (keys: name, source,
    url, optional filters); ``config`` is the global configuration dict.
    """
    def __init__(self, feed, config):
        self._feed = feed
        self._config = config

    def _fetch(self):
        """Download and parse the feed; return a list of entry dicts.

        Entries missing a link, a title, or any parseable date are dropped.
        """
        # The module header only imports dateutil.parser; make the tz
        # submodule explicitly available for the naive-timestamp fixup below.
        import dateutil.tz
        f = feedparser.parse(self._feed["source"])
        entries = []
        for e in f['entries']:
            entry = {
                'feed': self._feed,
                'tags': [],
            }
            entry["title"] = _get_attr(e, "title", "(no title)")
            entry["url"] = _get_attr(e, "link")
            entry["body"] = _get_attr(e, "description")
            entry["date"] = _get_attr(e, "updated", None)

            # skip entries without a usable link or title
            if len(entry["url"]) == 0:
                continue
            if len(entry["title"]) == 0:
                continue

            # fall back from "updated" to "published"
            if entry["date"] is None:
                entry["date"] = _get_attr(e, "published", None)
            if entry["date"] is None:
                continue

            try:
                entry["date"] = dateutil.parser.parse(entry["date"])
            except ValueError:
                # date field contains an illegal format
                continue

            # normalize naive timestamps to UTC so entries sort consistently
            if entry['date'].tzinfo is None:
                entry['date'] = entry['date'].replace(tzinfo=dateutil.tz.tzutc())

            entries.append(entry)
        return entries

    def _embeded_filter(self, entry):
        """Drop advertisement entries (titles starting with PR:/AD:,
        ASCII or full-width colon); duplicates the pr_block filter."""
        if re.search(u'^(PR|AD)(:|:)', entry['title']):
            log('delete PR entry - %s' % entry['title'])
            return None
        return entry

    def _apply_filters(self, filters, entries):
        """Run each named filter over the entries; a filter rejects an
        entry by returning None, and rejected entries are removed."""
        for f in filters:
            entry_filter = self._get_filter(f)
            entries = [entry_filter(x) for x in entries]
            entries = [x for x in entries if x]
        return entries

    def _apply_pre_filters(self, entries):
        return self._apply_filters(self._config['pre_filters'], entries)

    def _apply_post_filters(self, entries):
        return self._apply_filters(self._config['post_filters'], entries)

    def get_entries(self):
        """Fetch the feed and return entries after pre / per-feed / post filters."""
        entries = self._fetch()
        entries = self._apply_pre_filters(entries)

        # BUGFIX: copy the list. The original extended the list object
        # stored in the config ("default_filters") in place, so per-feed
        # filters accumulated onto every subsequently fetched feed.
        filters = list(self._config.get("default_filters", []))
        per_feed = self._feed.get('filters', [])
        if len(per_feed) > 0:
            filters.extend(per_feed)

        entries = self._apply_filters(filters, entries)
        entries = self._apply_post_filters(entries)
        return entries

    def _get_filter(self, filter_name):
        """Load the filter module named *filter_name* from the configured
        filter package and return its entry_filter function.

        Raises FilterError when the module cannot be resolved.
        """
        # fallback when filter isn't defined: identity filter
        if filter_name is None:
            return lambda x: x

        filter_name = filter_name.encode('utf-8')
        # import the filter module as a submodule of filter_directory
        mods = __import__(self._config['filter_directory'],
                          globals(),
                          locals(),
                          [filter_name,])
        try:
            mod = mods.__getattribute__(filter_name)
        except AttributeError:
            raise FilterError(filter_name)

        # return module's entry_filter function
        return mod.entry_filter
118 | + | |
class FilterError(Exception):
    """Raised when a named entry filter module cannot be resolved."""

    def __init__(self, value):
        # name of the filter that failed to load
        self.value = value

    def __str__(self):
        return ''.join(['filter "', self.value, '" is not found.'])
@@ -0,0 +1,1 @@ | ||
1 | +# __init__.py stub |
@@ -0,0 +1,27 @@ | ||
1 | +# filter for Image extraction | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import re | |
5 | + | |
6 | +re_blank = re.compile(r'<\s*(\w+)[^>]*>\s*</\s*\1\s*>') | |
7 | +re_br = re.compile(r'<\s*br\s*/?>') | |
8 | + | |
9 | +def _replace_all(rex, text): | |
10 | + m = rex.search(text) | |
11 | + while(m): | |
12 | + text = rex.sub('', text) | |
13 | + m = rex.search(text) | |
14 | + return text | |
15 | + | |
16 | +def entry_filter(entry): | |
17 | + body = entry['body'] | |
18 | + | |
19 | + # 空のタグを削除 | |
20 | + body = _replace_all(re_blank, body) | |
21 | + | |
22 | + # brタグを削除 | |
23 | + body = _replace_all(re_br, body) | |
24 | + | |
25 | + entry['body'] = body | |
26 | + return entry | |
27 | + |
@@ -0,0 +1,30 @@ | ||
1 | +# filter for slashdot.jp | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import re | |
5 | + | |
# Pattern for the "すべて読む" (read all) footer paragraph appended by slashdot.jp.
re_read_all = re.compile(ur'''<p>\s*<a href=['"][^'"]+['"]>\s*すべて読む\s*</a>.*?</p>''')
# Pattern for the "関連ストーリー" (related stories) paragraph.
re_related = re.compile(ur'''<p>\s*関連ストーリー:.*?</p>''')
# Captures topic names from links to slashdot.jp story pages.
re_topics = re.compile(ur'''<a href="http://slashdot.jp/stories/\w+">(.*?)</a>''')

def entry_filter(entry):
    # Strip the "read all" / "related stories" boilerplate from the body,
    # recycling the topic links found in that boilerplate as entry tags.
    body = entry['body']
    topics = []
    m = re_read_all.search(body)
    if m:
        s = m.group(0)
        itr = re_topics.findall(s)
        for items in itr:
            topics.append(items)

    body = re_read_all.sub('', body)
    body = re_related.sub('', body)
    # merge into existing tags if another filter already set them
    if 'tags' in entry:
        entry['tags'].extend(topics)
    else:
        entry['tags'] = topics
    entry['body'] = body

    return entry
30 | + |
@@ -0,0 +1,27 @@ | ||
1 | +# filter for Image extraction | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import re | |
5 | + | |
# Captures the src URL of an <img> tag (single- or double-quoted).
re_imgtag = re.compile(ur'''<img[^>]*src=["']([^'"]*?)["'].*?>''')

def entry_filter(entry):
    # Strip <img> tags from the body; the collected image URLs are kept in
    # entry['images'] so later stages (e.g. remove_tracker) can inspect them.
    body = entry['body']
    images = []
    m = re_imgtag.search(body)
    if m:
        itr = re_imgtag.findall(body)
        for url in itr:
            images.append(url)
        body = re_imgtag.sub('', body)

    if len(images) > 0:
        # merge into existing images if another filter already set them
        if 'images' in entry:
            entry['images'].extend(images)
        else:
            entry['images'] = images
        # body only changed when images were found, so update it here
        entry['body'] = body

    return entry
27 | + |
@@ -0,0 +1,19 @@ | ||
1 | +# filter for slashdot.jp | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import MeCab | |
5 | +tagger = MeCab.Tagger() | |
6 | + | |
def entry_filter(entry):
    """Extract general nouns from the body with MeCab and store them as
    the entry's keyword set (entry['keywords'])."""
    text = entry['body'].encode("utf-8")
    node = tagger.parseToNode(text)
    found = set()
    while node:
        features = node.feature.split(',')
        # keep only part-of-speech "noun, general"
        if features[0] == '名詞' and features[1] == '一般':
            found.add(node.surface.decode('utf-8'))
        node = node.next

    entry['keywords'] = found
    return entry
19 | + |
@@ -0,0 +1,15 @@ | ||
1 | +# filter for slashdot.jp | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import re | |
5 | + | |
# Matches the "[ITmedia ...]" label that ITmedia prefixes to its titles.
re_title = re.compile(r'\[ITmedia.*?\]')

def entry_filter(entry):
    """Strip the "[ITmedia ...]" label from the entry title."""
    entry['title'] = re_title.sub('', entry['title']).strip()
    return entry
15 | + |
@@ -0,0 +1,7 @@ | ||
1 | +# filter for Trimming | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
def entry_filter(entry):
    """Headline-only trimming: blank out the entry body entirely."""
    entry['body'] = ""
    return entry
7 | + |
@@ -0,0 +1,12 @@ | ||
1 | +# PR filter | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import re | |
5 | + | |
def entry_filter(entry):
    """Drop advertisement entries whose title starts with PR or AD
    followed by an ASCII or full-width colon; returns None to reject."""
    title = entry['title']
    if re.search(u'^(PR|AD)(:|:)', title) is None:
        return entry
    #print 'delete PR entry - %s' % title
    return None
11 | + | |
12 | + |
@@ -0,0 +1,15 @@ | ||
1 | +# remove images for tracking | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import re | |
5 | + | |
# Tracking images are served from rss.rssad.jp (dots escaped so e.g.
# "rssXrssadYjp" cannot match).
re_rssad_url = re.compile(r'^http://rss\.rssad\.jp/')

def entry_filter(entry):
    """Remove tracking-image URLs (rss.rssad.jp) from entry['images'].

    BUGFIX: the original popped elements while iterating over
    range(len(...)), which skips the element after each removal and can
    raise IndexError once the list shrinks. Rebuild the list instead;
    slice assignment keeps the same list object in case it is shared.
    """
    if "images" in entry:
        entry["images"][:] = [url for url in entry["images"]
                              if not re_rssad_url.search(url)]

    return entry
15 | + |
@@ -0,0 +1,35 @@ | ||
1 | +# filter for slashdot.jp | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import re | |
5 | + | |
# Pattern for the "すべて読む" (read all) footer paragraph appended by slashdot.jp.
re_read_all = re.compile(ur'''<p>\s*<a href=['"][^'"]+['"]>\s*すべて読む\s*</a>.*?</p>''')
# Pattern for the "関連ストーリー" (related stories) paragraph.
re_related = re.compile(ur'''<p>\s*関連ストーリー:.*?</p>''')
# Captures topic names from links to slashdot.jp story pages.
re_topics = re.compile(ur'''<a href="http://slashdot.jp/stories/\w+">(.*?)</a>''')

# Blank-line separated plain-text block (used to synthesize <p> markup).
re_break = re.compile(r'''\n\n(.*?)\n''')

def entry_filter(entry):
    # Strip the "read all" / "related stories" boilerplate, convert
    # blank-line paragraph breaks to <p> markup, and recycle the topic
    # links found in the boilerplate as entry tags.
    body = entry['body']
    topics = []
    m = re_read_all.search(body)
    if m:
        s = m.group(0)
        itr = re_topics.findall(s)
        for items in itr:
            topics.append(items)

    # wrap each blank-line separated chunk in </p><p>...</p><p>
    while re_break.search(body):
        body = re_break.sub(r'</p><p>\1</p><p>', body)

    body = re_read_all.sub('', body)
    body = re_related.sub('', body)
    # merge into existing tags if another filter already set them
    if 'tags' in entry:
        entry['tags'].extend(topics)
    else:
        entry['tags'] = topics
    entry['body'] = body

    return entry
35 | + |
@@ -0,0 +1,26 @@ | ||
1 | +# filter for slashdot.jp | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +#import re | |
5 | +from keywords import keywords | |
6 | + | |
def entry_filter(entry):
    """Tag the entry with every keyword group whose terms appear in the
    body or title.

    Each element of ``keywords`` is a tuple of synonyms; the first term
    serves as the canonical tag name.
    """
    body = entry['body']
    title = entry['title']
    matched = []
    for group in keywords:
        canonical = group[0]
        for term in group:
            if term in body or term in title:
                matched.append(canonical)
                break

    # merge into existing tags if another filter already set them
    if 'tags' in entry:
        entry['tags'].extend(matched)
    else:
        entry['tags'] = matched
    entry['body'] = body

    return entry
26 | + |
@@ -0,0 +1,17 @@ | ||
1 | +# filter for Trimming | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import re | |
5 | + | |
# First <p>...</p> paragraph of the body (non-greedy).
re_first_para = re.compile(r'<p>(.*?)</p>')

def entry_filter(entry):
    """Trim the body down to its first non-empty <p> paragraph,
    suffixed with "..." to signal truncation. The body is left alone
    when no paragraph is found or the first one is blank."""
    match = re_first_para.search(entry['body'])
    if match and match.group(1).strip():
        entry['body'] = "<p>" + match.group(1) + "...</p>"
    return entry
17 | + |
@@ -0,0 +1,22 @@ | ||
1 | +#!/usr/bin/python | |
2 | + | |
3 | +import sys | |
4 | +import re | |
5 | + | |
# Header written before the keyword tuples; the trailing "[" opens the
# generated "keywords" list literal.
HEADER = """#
# -*- coding: utf-8 -*-

keywords = ["""

# Footer closing the generated list literal.
FOOTER = """]"""

print HEADER

# Each stdin line holds comma-separated synonyms for one keyword group;
# emit them as a tuple of unicode string literals.
for l in sys.stdin:
    l = l.strip()
    terms = l.split(',')
    t = "    (u'" + "',u'".join(terms) + "',),"
    print t

print FOOTER
22 | + |
@@ -0,0 +1,157 @@ | ||
1 | +#!/usr/bin/python | |
2 | +'gnews.py - google news clone' | |
3 | + | |
4 | +import sys | |
5 | +#from config import config as config | |
6 | +import renderer | |
7 | +import fetcher | |
8 | +import os.path | |
9 | +import urllib | |
10 | +from logger import log | |
11 | +import configloader | |
12 | + | |
13 | +import json | |
14 | + | |
15 | +USAGE = "%s <config_json>" % sys.argv[0] | |
16 | + | |
class GNews(object):
    """Driver: fetch feeds, aggregate tags, and render the static site."""

    def main(self):
        "gnews's main function"
        # The JSON config file path is the single required CLI argument.
        try:
            config_json = sys.argv[1]
        except IndexError:
            sys.exit(USAGE)

        f = open(config_json, "r")
        self.config = json.load(f)
        f.close()

        # config["sources"] names the sources.ini file listing the feeds
        self.sources = configloader.load(self.config["sources"])

        # fetch RSS feed
        entries = []
        for feed in self.sources:
            f = fetcher.FeedFetcher(feed, self.config)
            e = f.get_entries()
            entries.extend(e)

        # aggregate tags: tag name -> {'entry': [...], 'quoted_name': str}
        tags = {}
        for entry in entries:
            for tag in entry['tags']:
                if tag in tags:
                    tags[tag]['entry'].append(entry)
                else:
                    tags[tag] = {}
                    tags[tag]['entry'] = [entry,]
                    tags[tag]['quoted_name'] = urllib.quote(tag.encode('utf-8'))

        # count stories for each tags
        for tag in tags:
            tags[tag]['count'] = len(tags[tag]['entry'])

        # sort tag by count, descending (Python 2 cmp-style sort)
        sorted_tags = tags.keys()
        sorted_tags.sort(lambda x,y: cmp(tags[y]['count'], tags[x]['count']))

        # sort entries by date, newest first
        cmp_entries = (lambda x,y: 1 if (x["date"] < y["date"]) else -1)
        entries.sort(cmp_entries)

        for e in entries:
            log(e["date"])

        # do rendering: shared parameter dict passed to every page template
        params = {
            'tags':tags,
            'page':{},
            'sorted_tags':sorted_tags,
        }

        # let plugins (e.g. indexing) enrich params before rendering
        self.call_plugins("pre_render", entries, params)

        # render index page
        self.do_rendering('index', 'index%s.html', entries, params)

        # render tag page
        for tag in tags:
            subentries = tags[tag]['entry']
            self.do_rendering('tags', tag + '%s.html', subentries, params)

        # rendering keyword page (only when the indexing plugin filled params)
        if "sorted_keywords" in params:
            for keyword in params["sorted_keywords"]:
                d = params["keywords"][keyword]
                if d["count"] > self.config["site_parameter"]["keyword_threshold"]:
                    subentries = d["entry"]
                    self.do_rendering('keywords', keyword + '%s.html', subentries, params)

    def _get_plugin(self, plugin_name):
        'load plugin by config settings'

        # fallback when filter isn't defined
        if plugin_name is None:
            return lambda x:x

        # import the plugin module from the configured plugin package
        names = [plugin_name.encode('utf-8'),]
        mods = __import__(self.config['plugin_directory'],
                          globals(),
                          locals(),
                          names)
        try:
            mod = mods.__getattribute__(plugin_name)
        except AttributeError:
            raise PluginError(plugin_name)
        return mod

    def call_plugins(self, function_name, *args):
        """Invoke function_name(*args) on every configured plugin's
        "export" object; plugins lacking the hook are skipped.

        NOTE(review): an AttributeError raised *inside* a hook is also
        swallowed by this except clause — confirm that is intended.
        """
        for plugin in self.config['plugins']:
            mod = self._get_plugin(plugin)
            plugin = mod.__getattribute__("export")
            try:
                f = plugin.__getattribute__(function_name)
                f(*args)
            except AttributeError:
                continue

    def do_rendering(self, page_type, filename, entries, params):
        "rendering page"
        # page_type selects the template/output block in the config;
        # filename must contain one %s slot for the page number suffix.

        r = renderer.Renderer(self.sources, self.config)
        tmpl = self.config[page_type]['template']
        output_dir = self.config[page_type]['output_directory']

        # do pagination (Python 2 integer division)
        # NOTE(review): an empty entries list yields total_page == 0 here,
        # so nothing is written — presumably never hit in practice; confirm.
        pageunit = self.config['pagination_unit']
        total_page = 1 + (len(entries) - 1) / pageunit
        params['page']['total'] = total_page
        params['page']['filename'] = filename

        # render each page; page 1 gets an empty suffix (e.g. index.html)
        for page in range(1, total_page + 1):
            params['page']['current'] = page
            start = pageunit * (page - 1)
            end = pageunit * page
            if page == 1:
                output_fullpath = os.path.join(output_dir, filename % '')
            else:
                output_fullpath = os.path.join(output_dir, filename % page)

            log('generate ' + output_fullpath + '...')
            f = open(output_fullpath, "w")
            html = r.render(tmpl, entries[start:end], params)
            f.write(html)
            f.close()
147 | + | |
class PluginError(Exception):
    """Raised when a configured plugin module cannot be resolved."""

    def __init__(self, value):
        # name of the plugin that failed to load
        self.value = value

    def __str__(self):
        return ''.join(['plugin "', self.value, '" is not found.'])
153 | + | |
154 | + | |
if __name__ == '__main__':
    # Script entry point: build the site once and exit.
    gnews = GNews()
    gnews.main()
@@ -0,0 +1,157 @@ | ||
1 | +# | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +keywords = [ | |
5 | + (u'AMD',), | |
6 | + (u'Amiga',), | |
7 | + (u'Android',), | |
8 | + (u'apache',), | |
9 | + (u'BeOS',), | |
10 | + (u'Blackberry',), | |
11 | + (u'BSD',), | |
12 | + (u'Caldera',), | |
13 | + (u'Chrome',), | |
14 | + (u'Chromium',), | |
15 | + (u'Comdex',), | |
16 | + (u'Compaq',), | |
17 | + (u'Debian',), | |
18 | + (u'Digital',), | |
19 | + (u'DRM',), | |
20 | + (u'EFF',), | |
21 | + (u'enlightenment',), | |
22 | + (u'EU',), | |
23 | + (u'Facebook',), | |
24 | + (u'Firefox',), | |
25 | + (u'GNOME',), | |
26 | + (u'GNU',), | |
27 | + (u'Google',), | |
28 | + (u'GUI',), | |
29 | + (u'HP',), | |
30 | + (u'IBM',), | |
31 | + (u'idle',), | |
32 | + (u'Intel',), | |
33 | + (u'iOS',), | |
34 | + (u'iPhone',), | |
35 | + (u'IT',), | |
36 | + (u'Java',), | |
37 | + (u'JAXA',), | |
38 | + (u'KDE',), | |
39 | + (u'Linux',), | |
40 | + (u'Mandrake',), | |
41 | + (u'Linuxcare',), | |
42 | + (u'Mac OS X',u'MacOS X',), | |
43 | + (u'Mozilla',), | |
44 | + (u'Namazu',), | |
45 | + (u'NASA',), | |
46 | + (u'Novell',), | |
47 | + (u'NTT',), | |
48 | + (u'Opera',), | |
49 | + (u'Oracle',), | |
50 | + (u'OS',), | |
51 | + (u'Perl',), | |
52 | + (u'PHP',), | |
53 | + (u'Python',), | |
54 | + (u'Quake',), | |
55 | + (u'Ruby',), | |
56 | + (u'Safari',), | |
57 | + (u'SGI',), | |
58 | + (u'SNS',), | |
59 | + (u'Sony',u'ソニー',), | |
60 | + (u'spam',), | |
61 | + (u'SuSE',), | |
62 | + (u'Gimp',), | |
63 | + (u'Transmeta',), | |
64 | + (u'TRON',), | |
65 | + (u'Twitter',), | |
66 | + (u'Ubuntu',), | |
67 | + (u'UNIX',), | |
68 | + (u'Wikipedia',u'ウィキペディア',), | |
69 | + (u'Windows',), | |
70 | + (u'Windows Azure',), | |
71 | + (u'Wine',), | |
72 | + (u'Ximian',), | |
73 | + (u'Yahoo',u'ヤフー',), | |
74 | + (u'YouTube',), | |
75 | + (u'Apple',u'アップル',), | |
76 | + (u'インターネット',), | |
77 | + (u'Internet Explorer',u'インターネットエクスプローラ',), | |
78 | + (u'Open Source',u'OpenSource',u'オープンソース',), | |
79 | + (u'ガンダム',), | |
80 | + (u'Cloud',u'クラウド',), | |
81 | + (u'game',u'ゲーム',), | |
82 | + (u'Corel',u'コーレル',), | |
83 | + (u'Star Wars',u'スターウオーズ',), | |
84 | + (u'Startrek',u'スタートレック',), | |
85 | + (u'Storage',u'ストレージ',), | |
86 | + (u'スパコン',), | |
87 | + (u'Slashdot',u'スラッシュドット',), | |
88 | + (u'セキュリティ',), | |
89 | + (u'ソフトウェア',), | |
90 | + (u'ターボリナックス',), | |
91 | + (u'TV',u'テレビ',), | |
92 | + (u'データベース',), | |
93 | + (u'Netscape',u'ネットスケープ',), | |
94 | + (u'ネットワーク',), | |
95 | + (u'ノートPC',u'ノートパソコン',), | |
96 | + (u'ハンドヘルド',), | |
97 | + (u'ハードウェア',), | |
98 | + (u'ハードウェアハック',), | |
99 | + (u'バイオテック',), | |
100 | + (u'バグ',), | |
101 | + (u'特許',u'パテント',), | |
102 | + (u'ビジネス',), | |
103 | + (u'ビール',), | |
104 | + (u'プライバシ',), | |
105 | + (u'プリンタ',), | |
106 | + (u'プログラミング',), | |
107 | + (u'ボットネット',), | |
108 | + (u'Microsoft',u'マイクロソフト',), | |
109 | + (u'メディア',), | |
110 | + (u'モニター',), | |
111 | + (u'モバイル',), | |
112 | + (u'リンク',), | |
113 | + (u'Red Hat',u'レッドハット',), | |
114 | + (u'ロボット',), | |
115 | + (u'ワーム',), | |
116 | + (u'中国',), | |
117 | + (u'交通',), | |
118 | + (u'AI',u'人工知能',), | |
119 | + (u'仮想化',), | |
120 | + (u'任天堂',), | |
121 | + (u'入力デバイス',), | |
122 | + (u'医療',), | |
123 | + (u'原子力',), | |
124 | + (u'IIS',u'国際宇宙ステーション',), | |
125 | + (u'地球',), | |
126 | + (u'地震',), | |
127 | + (u'娯楽',), | |
128 | + (u'宇宙',), | |
129 | + (u'広告',), | |
130 | + (u'情報漏洩',), | |
131 | + (u'携帯通信',), | |
132 | + (u'携帯電話',), | |
133 | + (u'政府',), | |
134 | + (u'政治',), | |
135 | + (u'教育',), | |
136 | + (u'数学',), | |
137 | + (u'日本',), | |
138 | + (u'日記',), | |
139 | + (u'映画',), | |
140 | + (u'暗号',), | |
141 | + (u'書籍',), | |
142 | + (u'検閲',), | |
143 | + (u'法廷',), | |
144 | + (u'海賊行為',), | |
145 | + (u'火星',), | |
146 | + (u'犯罪',), | |
147 | + (u'統計',), | |
148 | + (u'英国',), | |
149 | + (u'著作権',), | |
150 | + (u'軍事',), | |
151 | + (u'通信',), | |
152 | + (u'電力',), | |
153 | + (u'音楽',), | |
154 | + (u'スマートフォン',u'スマホ',u'スマートホン',), | |
155 | + (u'グラフィックカード',u'グラフィックスカード',u'GPU',), | |
156 | + (u'NTTドコモ',u'ドコモ',), | |
157 | +] |
@@ -0,0 +1,152 @@ | ||
1 | +AMD | |
2 | +Amiga | |
3 | +Android | |
4 | +apache | |
5 | +BeOS | |
6 | +Blackberry | |
7 | +BSD | |
8 | +Caldera | |
9 | +Chrome | |
10 | +Chromium | |
11 | +Comdex | |
12 | +Compaq | |
13 | +Debian | |
14 | +Digital | |
15 | +DRM | |
16 | +EFF | |
17 | +enlightenment | |
18 | +EU | |
19 | ||
20 | +Firefox | |
21 | +GNOME | |
22 | +GNU | |
23 | ||
24 | +GUI | |
25 | +HP | |
26 | +IBM | |
27 | +idle | |
28 | +Intel | |
29 | +iOS | |
30 | +iPhone | |
31 | +IT | |
32 | +Java | |
33 | +JAXA | |
34 | +KDE | |
35 | +Linux | |
36 | +Mandrake | |
37 | +Linuxcare | |
38 | +Mac OS X,MacOS X | |
39 | +Mozilla | |
40 | +Namazu | |
41 | +NASA | |
42 | +Novell | |
43 | +NTT | |
44 | +Opera | |
45 | +Oracle | |
46 | +OS | |
47 | +Perl | |
48 | +PHP | |
49 | +Python | |
50 | +Quake | |
51 | +Ruby | |
52 | +Safari | |
53 | +SGI | |
54 | +SNS | |
55 | +Sony,ソニー | |
56 | +spam | |
57 | +SuSE | |
58 | +Gimp | |
59 | +Transmeta | |
60 | +TRON | |
61 | ||
62 | +Ubuntu | |
63 | +UNIX | |
64 | +Wikipedia,ウィキペディア | |
65 | +Windows | |
66 | +Windows Azure | |
67 | +Wine | |
68 | +Ximian | |
69 | +Yahoo,ヤフー | |
70 | +YouTube | |
71 | +Apple,アップル | |
72 | +インターネット | |
73 | +Internet Explorer,インターネットエクスプローラ | |
74 | +Open Source,OpenSource,オープンソース | |
75 | +ガンダム | |
76 | +Cloud,クラウド | |
77 | +game,ゲーム | |
78 | +Corel,コーレル | |
79 | +Star Wars,スターウオーズ | |
80 | +Startrek,スタートレック | |
81 | +Storage,ストレージ | |
82 | +スパコン | |
83 | +Slashdot,スラッシュドット | |
84 | +セキュリティ | |
85 | +ソフトウェア | |
86 | +ターボリナックス | |
87 | +TV,テレビ | |
88 | +データベース | |
89 | +Netscape,ネットスケープ | |
90 | +ネットワーク | |
91 | +ノートPC,ノートパソコン | |
92 | +ハンドヘルド | |
93 | +ハードウェア | |
94 | +ハードウェアハック | |
95 | +バイオテック | |
96 | +バグ | |
97 | +特許,パテント | |
98 | +ビジネス | |
99 | +ビール | |
100 | +プライバシ | |
101 | +プリンタ | |
102 | +プログラミング | |
103 | +ボットネット | |
104 | +Microsoft,マイクロソフト | |
105 | +メディア | |
106 | +モニター | |
107 | +モバイル | |
108 | +リンク | |
109 | +Red Hat,レッドハット | |
110 | +ロボット | |
111 | +ワーム | |
112 | +中国 | |
113 | +交通 | |
114 | +AI,人工知能 | |
115 | +仮想化 | |
116 | +任天堂 | |
117 | +入力デバイス | |
118 | +医療 | |
119 | +原子力 | |
120 | +IIS,国際宇宙ステーション | |
121 | +地球 | |
122 | +地震 | |
123 | +娯楽 | |
124 | +宇宙 | |
125 | +広告 | |
126 | +情報漏洩 | |
127 | +携帯通信 | |
128 | +携帯電話 | |
129 | +政府 | |
130 | +政治 | |
131 | +教育 | |
132 | +数学 | |
133 | +日本 | |
134 | +日記 | |
135 | +映画 | |
136 | +暗号 | |
137 | +書籍 | |
138 | +検閲 | |
139 | +法廷 | |
140 | +海賊行為 | |
141 | +火星 | |
142 | +犯罪 | |
143 | +統計 | |
144 | +英国 | |
145 | +著作権 | |
146 | +軍事 | |
147 | +通信 | |
148 | +電力 | |
149 | +音楽 | |
150 | +スマートフォン,スマホ,スマートホン | |
151 | +グラフィックカード,グラフィックスカード,GPU | |
152 | +NTTドコモ,ドコモ |
@@ -0,0 +1,152 @@ | ||
1 | +AMD | |
2 | +Amiga | |
3 | +Android | |
4 | +apache | |
5 | +BeOS | |
6 | +Blackberry | |
7 | +BSD | |
8 | +Caldera | |
9 | +Chrome | |
10 | +Chromium | |
11 | +Comdex | |
12 | +Compaq | |
13 | +Debian | |
14 | +Digital | |
15 | +DRM | |
16 | +EFF | |
17 | +enlightenment | |
18 | +EU | |
19 | ||
20 | +Firefox | |
21 | +GNOME | |
22 | +GNU | |
23 | ||
24 | +GUI | |
25 | +HP | |
26 | +IBM | |
27 | +idle | |
28 | +Intel | |
29 | +iOS | |
30 | +iPhone | |
31 | +IT | |
32 | +Java | |
33 | +JAXA | |
34 | +KDE | |
35 | +Linux | |
36 | +Mandrake | |
37 | +Linuxcare | |
38 | +Mac OS X,MacOS X | |
39 | +Mozilla | |
40 | +Namazu | |
41 | +NASA | |
42 | +Novell | |
43 | +NTT | |
44 | +Opera | |
45 | +Oracle | |
46 | +OS | |
47 | +Perl | |
48 | +PHP | |
49 | +Python | |
50 | +Quake | |
51 | +Ruby | |
52 | +Safari | |
53 | +SGI | |
54 | +SNS | |
55 | +Sony,ソニー | |
56 | +spam | |
57 | +SuSE | |
58 | +Gimp | |
59 | +Transmeta | |
60 | +TRON | |
61 | ||
62 | +Ubuntu | |
63 | +UNIX | |
64 | +Wikipedia,ウィキペディア | |
65 | +Windows | |
66 | +Windows Azure | |
67 | +Wine | |
68 | +Ximian | |
69 | +Yahoo,ヤフー | |
70 | +YouTube | |
71 | +Apple,アップル | |
72 | +インターネット | |
73 | +Internet Explorer,インターネットエクスプローラ | |
74 | +Open Source,OpenSource,オープンソース | |
75 | +ガンダム | |
76 | +Cloud,クラウド | |
77 | +game,ゲーム | |
78 | +Corel,コーレル | |
79 | +Star Wars,スターウオーズ | |
80 | +Startrek,スタートレック | |
81 | +Storage,ストレージ | |
82 | +スパコン | |
83 | +Slashdot,スラッシュドット | |
84 | +セキュリティ | |
85 | +ソフトウェア | |
86 | +ターボリナックス | |
87 | +TV,テレビ | |
88 | +データベース | |
89 | +Netscape,ネットスケープ | |
90 | +ネットワーク | |
91 | +ノートPC,ノートパソコン | |
92 | +ハンドヘルド | |
93 | +ハードウェア | |
94 | +ハードウェアハック | |
95 | +バイオテック | |
96 | +バグ | |
97 | +特許,パテント | |
98 | +ビジネス | |
99 | +ビール | |
100 | +プライバシ | |
101 | +プリンタ | |
102 | +プログラミング | |
103 | +ボットネット | |
104 | +Microsoft,マイクロソフト | |
105 | +メディア | |
106 | +モニター | |
107 | +モバイル | |
108 | +リンク | |
109 | +Red Hat,レッドハット | |
110 | +ロボット | |
111 | +ワーム | |
112 | +中国 | |
113 | +交通 | |
114 | +AI,人工知能 | |
115 | +仮想化 | |
116 | +任天堂 | |
117 | +入力デバイス | |
118 | +医療 | |
119 | +原子力 | |
120 | +IIS,国際宇宙ステーション | |
121 | +地球 | |
122 | +地震 | |
123 | +娯楽 | |
124 | +宇宙 | |
125 | +広告 | |
126 | +情報漏洩 | |
127 | +携帯通信 | |
128 | +携帯電話 | |
129 | +政府 | |
130 | +政治 | |
131 | +教育 | |
132 | +数学 | |
133 | +日本 | |
134 | +日記 | |
135 | +映画 | |
136 | +暗号 | |
137 | +書籍 | |
138 | +検閲 | |
139 | +法廷 | |
140 | +海賊行為 | |
141 | +火星 | |
142 | +犯罪 | |
143 | +統計 | |
144 | +英国 | |
145 | +著作権 | |
146 | +軍事 | |
147 | +通信 | |
148 | +電力 | |
149 | +音楽 | |
150 | +スマートフォン,スマホ,スマートホン | |
151 | +グラフィックカード,グラフィックスカード,GPU | |
152 | +NTTドコモ,ドコモ |
@@ -0,0 +1,9 @@ | ||
1 | +'logger.py - log output utility' | |
2 | + | |
3 | +from config import config | |
4 | + | |
def log(*args):
    "log helper function"
    # Print only when the config raises log_level above the default 0 (quiet).
    if config['log_level'] > 0:
        # Coerce non-unicode values via str() so mixed args join cleanly
        # (Python 2: unicode vs str).
        print ' '.join([x if isinstance(x, unicode) else str(x) for x in args])
9 | + |
@@ -0,0 +1,21 @@ | ||
1 | +#!/usr/bin/python | |
2 | +# -*- coding: utf-8 | |
3 | +'plugin base class' | |
4 | + | |
class Plugin(object):
    """Base class for gnews plugins.

    Every hook is a no-op; concrete plugins override the stages they
    care about.
    """

    def __init__(self):
        pass

    def pre_fetch(self):
        """Hook called before feeds are fetched."""
        pass

    def pre_tag_aggregate(self, entries):
        """Hook called before tag aggregation."""
        pass

    def pre_render(self, entries):
        """Hook called before pages are rendered."""
        pass

    def pre_quit(self, entries):
        """Hook called just before the program exits."""
        pass
21 | + |
@@ -0,0 +1,1 @@ | ||
1 | +# __init__.py stub |
@@ -0,0 +1,40 @@ | ||
1 | +#!/usr/bin/python | |
2 | +# -*- coding: utf-8 | |
3 | +'plugin for hatena bookmark counter' | |
4 | + | |
5 | +#from __future__ import with_statement | |
6 | + | |
7 | +import xmlrpclib | |
8 | +import datetime | |
9 | +import time | |
10 | +import sys | |
11 | + | |
12 | +# see http://d.hatena.ne.jp/keyword/%a4%cf%a4%c6%a4%ca%a5%d6%a5%c3%a5%af%a5%de%a1%bc%a5%af%b7%ef%bf%f4%bc%e8%c6%c0API?kid=146686 | |
13 | + | |
14 | +urls = [] | |
15 | +counts = {} | |
16 | + | |
def pre_fetch():
    # Hook: nothing to do before fetching.
    pass

def pre_tag_aggregate(entries):
    # Hook: nothing to do before tag aggregation.
    pass
22 | + | |
def pre_render(entries):
    # Query hatena bookmark counts in batches of 50 URLs (the API limit,
    # see _get_count).
    counts = []  # NOTE(review): unused; shadows the module-level `counts`
    for i in range(0, len(entries), 50):
        urls = [x['url'] for x in entries[i:i+50]]
        c = _get_count(urls)
        for j in range(0, len(c)):
            # NOTE(review): this overwrites the entry's URL with the
            # bookmark count — looks like a bug; a dedicated key (e.g.
            # 'hatebu_count') appears intended. Confirm before enabling
            # this plugin (it is commented out in config.py).
            entries[i+j]['url'] = c[j]
30 | + | |
def pre_quit(entries):
    # Hook: nothing to do before shutdown.
    pass

def _get_count(urls):
    # Query hatena bookmark counts for the given URLs via XML-RPC;
    # returns the API's result list.
    # urls can have max 50 items (API limit)
    uri = "http://b.hatena.ne.jp/xmlrpc"
    server = xmlrpclib.ServerProxy(uri)
    t = server.bookmark.getCount(*urls)
    return t
40 | + |
@@ -0,0 +1,33 @@ | ||
1 | +#!/usr/bin/python | |
2 | +# -*- coding: utf-8 | |
3 | +'plugin for indexing' | |
4 | + | |
5 | +import urllib | |
6 | + | |
7 | +from plugin import Plugin | |
8 | + | |
class Indexing(Plugin):
    """Plugin that builds a keyword -> entries index for rendering."""

    def pre_render(self, entries, params):
        """Aggregate entry keywords into params['keywords'] (a dict of
        {'entry', 'quoted_name', 'count'} records) and
        params['sorted_keywords'] (keyword names, most frequent first)."""
        index = {}
        for entry in entries:
            for kw in entry.get("keywords", ()):
                # single-character keywords are too noisy to index
                if len(kw) <= 1:
                    continue
                if kw in index:
                    bucket = index[kw]
                    bucket["entry"].append(entry)
                    bucket["count"] += 1
                else:
                    index[kw] = {
                        "entry": [entry],
                        "quoted_name": urllib.quote(kw.encode('utf-8')),
                        "count": 1,
                    }
        params["keywords"] = index
        params["sorted_keywords"] = sorted(index,
                                           key=lambda k: index[k]['count'],
                                           reverse=True)

export = Indexing()
@@ -0,0 +1,34 @@ | ||
1 | +"propertizer.py - access dictionary's data using '.'(dot)." | |
2 | + | |
def propertize(obj):
    """Recursively wrap dicts in Propertizer so nested values become
    reachable via attribute access (d.key as well as d[key]).

    Dicts and lists are mutated in place; other values pass through.
    """
    if isinstance(obj, dict):
        for key in obj:
            obj[key] = propertize(obj[key])
        return Propertizer(obj)
    if isinstance(obj, list):
        for i, item in enumerate(obj):
            obj[i] = propertize(item)
        return obj
    return obj

class Propertizer(object):
    """Dict wrapper exposing keys as attributes; also supports item
    access, item assignment, and iteration over keys."""

    def __init__(self, d):
        self._dict = d

    def __getattr__(self, name):
        return self._dict[name]

    def __iter__(self):
        return iter(self._dict)

    def next(self):
        # Delegates to the wrapped dict (Python 2 iterator-protocol artifact).
        return self._dict.next()

    def __getitem__(self, key):
        return self._dict[key]

    def __setitem__(self, key, value):
        self._dict[key] = value
@@ -0,0 +1,57 @@ | ||
1 | +'renderer.py - rendering html and some items' | |
2 | + | |
3 | +import datetime | |
4 | + | |
5 | +from mako.template import Template | |
6 | +from mako.lookup import TemplateLookup | |
7 | +from mako.exceptions import RichTraceback | |
8 | +import dateutil.parser | |
9 | + | |
10 | +from propertizer import propertize | |
11 | +from logger import log | |
12 | + | |
def date_format(date):
    """Format a datetime as 'YYYY/MM/DD HH:MM' for template output."""
    return date.strftime('%Y/%m/%d %H:%M')
15 | + | |
class Renderer(object):
    """Renders entry lists to HTML via Mako templates."""

    def __init__(self, sources, config):
        self.template_dir = config['template_directory']
        self._sources = sources
        self._config = config

    def _get_template(self, template_name):
        'read template file from the configured template directory'
        tl = TemplateLookup(directories=[self.template_dir],
                            input_encoding="utf-8",
                            output_encoding="utf-8",
                            default_filters=['decode.utf8'],
                            format_exceptions=True)
        return tl.get_template(template_name)

    def render(self, template, entries, params=None):
        """Render *template* with the given entries and page parameters;
        returns the generated HTML. On a template error, log the Mako
        traceback and exit.

        BUGFIX: the original used a mutable default ``params={}`` — the
        shared dict leaked state across calls because propertize()
        rewrites nested values in place.
        """
        if params is None:
            params = {}
        t = self._get_template(template)

        kwargs = {
            'date_format': date_format,
            'entries': entries,
            'params': params,
            'site': self._config['site_parameter'],
            'sources': self._sources,
        }
        kwargs['site']['last_update'] = datetime.datetime.utcnow()
        # wrap everything so templates can use dotted access
        for key in kwargs:
            kwargs[key] = propertize(kwargs[key])
        try:
            result = t.render(**kwargs)
        except Exception:
            # BUGFIX: local import — the module header never imported sys,
            # so the original error path died with a NameError instead of
            # reporting the template failure.
            import sys
            traceback = RichTraceback()
            for (filename, lineno, function, line) in traceback.traceback:
                log("File %s, line %s, in %s" % (filename, lineno, function))
                log(line, "\n")
            log("%s: %s" % (str(traceback.error.__class__.__name__), traceback.error))
            sys.exit(-1)
        return result
57 | + |