diff options
author | icoz <icoz.vt@gmail.com> | 2016-05-28 16:38:30 +0300 |
---|---|---|
committer | icoz <icoz.vt@gmail.com> | 2016-05-28 16:38:30 +0300 |
commit | b5c9fa7ab09dd1e7013c6ab39de29b233e01f16f (patch) | |
tree | 5401f0e3d3c25214622e3585a5449a4ea7cb2a6a | |
parent | 55d2963b8a4943973daeae14bd147d235a9d48eb (diff) |
В связи с закрытием Мегамозга его код и тесты закомментированы.
В связи с изменениями разметки Хабра изменено получение следующих данных:
- заголовков (у Хабра и Гиктаймса они теперь лежат по-разному)
- комментариев (#7)
Добавлены дополнительные тесты.
С комментариями остались проблемы, поэтому пока их сохранение недоступно (хотя код есть — кто может, помогите).
Поправлен habr/__init__.py
-rw-r--r-- | habr/__init__.py | 6 | ||||
-rw-r--r-- | habr/topic.py | 81 | ||||
-rw-r--r-- | habr/user.py | 49 | ||||
-rwxr-xr-x | habraparse.py | 68 |
4 files changed, 126 insertions, 78 deletions
diff --git a/habr/__init__.py b/habr/__init__.py index 66c4d6d..d1cd45c 100644 --- a/habr/__init__.py +++ b/habr/__init__.py @@ -1,8 +1,8 @@ -from .user import HabraUser -from .topic import HabraTopic +from .user import HabraUser, GeektimesUser +from .topic import HabraTopic, GeektimesTopic __author__ = 'icoz' __name__ = 'habraparse' __version__ = '0.0.1' -__all__ = [HabraTopic, HabraUser] +__all__ = [HabraTopic, HabraUser, GeektimesTopic, GeektimesUser] diff --git a/habr/topic.py b/habr/topic.py index bac1c1a..8bbe750 100644 --- a/habr/topic.py +++ b/habr/topic.py @@ -46,10 +46,10 @@ class TMTopic(object): hubs = doc.xpath("//div[@class='hubs']/a") for h in hubs: self.post['hubs'].append((h.text, h.attrib['href'])) - post_title = doc.xpath('//h1/span[@class="post_title"]') + post_title = doc.xpath('//h1[@class="post__title"]/span') if len(post_title) == 0: raise PostDeleted - self.post['title'] = post_title[0].text + self.post['title'] = post_title tmp = doc.xpath("//div[@class='author-info__username']//a[@class='author-info__nickname']") or \ doc.xpath("//div[@class='author-info__username']//a[@class='author-info__name']") or \ doc.xpath("//div[@class='author-info__username']//span[@class='author-info__name']") @@ -63,10 +63,29 @@ class TMTopic(object): if len(tmp) else '' self.post['comments'] = [] # bug in class 'comments_list ' - space added - comments = doc.xpath("//div[@class='comments_list ']//div[@class='comment_item']") + # comments = doc.xpath("//div[@class='comments_list ']//div[@class='comment_item']") + comments = doc.xpath("//ul[@id='comments-list']//li[@class='comment_item']") self.post['comments_count'] = len(comments) - for c in comments: - self.post['comments'].append(etree.tostring(c, pretty_print=True, method='html').decode('utf-8')) + # record = (author, text) + authors = map(lambda x: x.text, doc.xpath("//a[@class='comment-item__username']")) + cmt_texts = map(lambda x: x.text, doc.xpath("//div[@class='message html_format ']")) + c_id = 
map(lambda x: x.attrib['id'][8:], doc.xpath("//li[@class='comment_item']")) + p_id = map(lambda x: x.attrib['data-parent_id'], doc.xpath("//span[@class='parent_id']")) + self.post['comments'] = tuple(zip(authors, cmt_texts, c_id, p_id)) + + # self.post['comments'] = list() + # for c in comments: + # # self.post['comments'].append(etree.tostring(c, pretty_print=True, method='html').decode('utf-8')) + # # record = (author, text, c_id, parent_c_id) + # author = c.xpath("//a[@class='comment-item__username']") + # if len(author): author = author[0].text + # else: author = '<anonymous>' + # text = c.xpath("//div[@class='message html_format ']") + # if text != '': text = text[0].text + # c_id = c.attrib['id'] + # p_id = c.xpath("//span[@class='parent_id']")[0] + # if p_id != '': p_id = p_id.attrib['data-parent_id'] + # self.post['comments'].append((author, text, c_id, p_id)) def author(self): return deepcopy(self.post['author']) @@ -93,16 +112,19 @@ class TMTopic(object): class HabraTopic(TMTopic): def __init__(self, topic_id): super().__init__(topic_id, domain='habrahabr.ru') + self.post['title'] = self.post['title'][1].text class GeektimesTopic(TMTopic): def __init__(self, topic_id): super().__init__(topic_id, domain='geektimes.ru') + self.post['title'] = self.post['title'][0].text -class MegamozgTopic(TMTopic): - def __init__(self, topic_id): - super().__init__(topic_id, domain='megamozg.ru') +# R.I.P. +# class MegamozgTopic(TMTopic): +# def __init__(self, topic_id): +# super().__init__(topic_id, domain='megamozg.ru') import pprint @@ -115,6 +137,7 @@ class TestHabraTopic(TestCase): pp.pprint(t.author()) self.assertEqual(t.author(), 'Яндекс') pp.pprint(t.title()) + self.assertEqual(t.title(), 'Memory management в ядре Linux. 
Семинар в Яндексе') pp.pprint(t.post['comments_count']) pp.pprint(t.post['rating']) @@ -123,9 +146,12 @@ class TestHabraTopic(TestCase): pp = pprint.PrettyPrinter(indent=4) pp.pprint(t.author()) self.assertEqual(t.author(), '@icoz') + self.assertEqual(t.title(), 'Экспорт Избранного на Хабре в PDF') pp.pprint(t.title()) pp.pprint(t.post['comments_count']) pp.pprint(t.post['rating']) + self.assertEqual(t.comments()[0][0], 'keccak') + self.assertEqual(t.comments()[1][0], 'icoz') class TestGTTopic(TestCase): @@ -135,6 +161,7 @@ class TestGTTopic(TestCase): pp.pprint(t.author()) self.assertEqual(t.author(), 'Soundpal') pp.pprint(t.title()) + self.assertEqual(t.title(), 'На что влияет сопротивление наушников') pp.pprint(t.post['comments_count']) pp.pprint(t.post['rating']) @@ -144,25 +171,25 @@ class TestGTTopic(TestCase): pp.pprint(t.author()) self.assertEqual(t.author(), '@Robotex') pp.pprint(t.title()) + self.assertEqual(t.title(), 'Autodesk и Voxel8 делают 3D-печать электроники реальностью') pp.pprint(t.post['comments_count']) pp.pprint(t.post['rating']) - -class TestMMTopic(TestCase): - def test_topic(self): - t = MegamozgTopic(418) - pp = pprint.PrettyPrinter(indent=4) - pp.pprint(t.author()) - self.assertEqual(t.author(), '@Kirilkin') - pp.pprint(t.title()) - pp.pprint(t.post['comments_count']) - pp.pprint(t.post['rating']) - - def test_topic2(self): - t = MegamozgTopic(8568) - pp = pprint.PrettyPrinter(indent=4) - pp.pprint(t.author()) - self.assertEqual(t.author(), '@jasiejames') - pp.pprint(t.title()) - pp.pprint(t.post['comments_count']) - pp.pprint(t.post['rating']) +# class TestMMTopic(TestCase): +# def test_topic(self): +# t = MegamozgTopic(418) +# pp = pprint.PrettyPrinter(indent=4) +# pp.pprint(t.author()) +# self.assertEqual(t.author(), '@Kirilkin') +# pp.pprint(t.title()) +# pp.pprint(t.post['comments_count']) +# pp.pprint(t.post['rating']) +# +# def test_topic2(self): +# t = MegamozgTopic(8568) +# pp = pprint.PrettyPrinter(indent=4) +# 
pp.pprint(t.author()) +# self.assertEqual(t.author(), '@jasiejames') +# pp.pprint(t.title()) +# pp.pprint(t.post['comments_count']) +# pp.pprint(t.post['rating']) diff --git a/habr/user.py b/habr/user.py index 1721697..f6cda91 100644 --- a/habr/user.py +++ b/habr/user.py @@ -237,9 +237,10 @@ class GeektimesUser(TMUser): super().__init__(username, need_favorites, need_user_posts=need_user_posts, domain='geektimes.ru') -class MegamozgUser(TMUser): - def __init__(self, username, need_favorites=False, need_user_posts=False): - super().__init__(username, need_favorites, need_user_posts=need_user_posts, domain='megamozg.ru') +# R.I.P. +# class MegamozgUser(TMUser): +# def __init__(self, username, need_favorites=False, need_user_posts=False): +# super().__init__(username, need_favorites, need_user_posts=need_user_posts, domain='megamozg.ru') import pprint @@ -297,24 +298,24 @@ class Test_GeektimesUser(TestCase): pp.pprint('userposts=') pp.pprint(hu.user_posts()) - -class Test_MegamozgUser(TestCase): - def setUp(self): - self.hu = MegamozgUser('icoz') - pass - - def test_parseUserpage(self): - pp = pprint.PrettyPrinter(indent=4) - pp.pprint(self.hu.activity()) - pp.pprint(self.hu.profile()) - pp.pprint(self.hu.karma()) - - # def test_favs(self): - # pp = pprint.PrettyPrinter(indent=4) - - def test_user_posts(self): - hu = MegamozgUser('Zelenyikot') - pp = pprint.PrettyPrinter(indent=4) - pp.pprint('userposts=') - pp.pprint(hu.user_posts()) - +# +# class Test_MegamozgUser(TestCase): +# def setUp(self): +# self.hu = MegamozgUser('icoz') +# pass +# +# def test_parseUserpage(self): +# pp = pprint.PrettyPrinter(indent=4) +# pp.pprint(self.hu.activity()) +# pp.pprint(self.hu.profile()) +# pp.pprint(self.hu.karma()) +# +# # def test_favs(self): +# # pp = pprint.PrettyPrinter(indent=4) +# +# def test_user_posts(self): +# hu = MegamozgUser('Zelenyikot') +# pp = pprint.PrettyPrinter(indent=4) +# pp.pprint('userposts=') +# pp.pprint(hu.user_posts()) +# diff --git a/habraparse.py 
b/habraparse.py index 4e948c1..3b69d2d 100755 --- a/habraparse.py +++ b/habraparse.py @@ -5,8 +5,8 @@ import sys from weasyprint import HTML, CSS -from habr.topic import HabraTopic, PostDeleted, MegamozgTopic, GeektimesTopic -from habr.user import HabraUser, GeektimesUser, MegamozgUser +from habr.topic import HabraTopic, PostDeleted, GeektimesTopic +from habr.user import HabraUser, GeektimesUser __author__ = 'icoz' @@ -38,10 +38,24 @@ def prepare_html(topic, with_comments=False): </div> ''' html_cmnts = ''' - <div id="comments" class="comments_list"> - <h2 class="title">Комментарии</h2> - {comments} + <h2>Комментарии</h2> + <ul id="comments-list"> + {comments} + </ul> + ''' + html_cmnt = ''' + <li class="comment_item" id="comment_{c_id}"> + <span class="parent_id" data-parent_id="{p_id}"></span> + <div class="comment_body "> + <div class="info comments-list__item comment-item " rel="{c_id}"> + <span class="comment-item__user-info" rel="user-popover" data-user-login="{user}"> + <a href="https://habrahabr.ru/users/{user}/" class="comment-item__username">{user}</a> + </span> + <div class="message html_format "> + {cmnt} </div> + </div> + </div> ''' html_foot = ''' </div> @@ -53,9 +67,15 @@ def prepare_html(topic, with_comments=False): if with_comments: cmnts = '' html_format = html_head + html_cmnts + html_foot + # print("t.comments()=", len(t.comments())) + l = 0 for c in t.comments(): - cmnts += '{}\n'.format(c) + user, cmnt, c_id, p_id = c + cmnts += html_cmnt.format(user=user, cmnt=cmnt, c_id=c_id, p_id=p_id) + # print('cmnts.len=', len(cmnts)) + # print('l=', l) html = html_format.format(title=t.title(), author=t.author(), text=t.text(), comments=cmnts) + # print('html.len=', len(html)) else: html_format = html_head + html_foot html = html_format.format(title=t.title(), author=t.author(), text=t.text()) @@ -73,10 +93,11 @@ def save_html(topic_id, filename, with_comments=False, project='h'): with open(filename, "wt") as f: if project == 'g': ht = 
GeektimesTopic(topic_id) - elif project == 'm': - ht = MegamozgTopic(topic_id) + # elif project == 'm': + # ht = MegamozgTopic(topic_id) else: ht = HabraTopic(topic_id) + print('comments_cnt=', ht.comments_count()) html = prepare_html(ht, with_comments=with_comments) f.write(html) # TODO: get all images and css @@ -99,8 +120,8 @@ def save_pdf(topic_id, filename, with_comments=False, project='h'): return if project == 'g': ht = GeektimesTopic(topic_id) - elif project == 'm': - ht = MegamozgTopic(topic_id) + # elif project == 'm': + # ht = MegamozgTopic(topic_id) else: ht = HabraTopic(topic_id) @@ -118,8 +139,8 @@ def save_all_favs_for_user(username, out_dir, save_in_html=True, with_comments=F # hu = HabraUser(username, need_favorites=True) if project == 'g': hu = GeektimesUser(username) - elif project == 'm': - hu = MegamozgUser(username) + # elif project == 'm': + # hu = MegamozgUser(username) else: hu = HabraUser(username) # hu = GeektimesUser(username) if project == 'g' else MegamozgUser(username) if project == 'm' else HabraUser(username) @@ -176,9 +197,8 @@ def create_url_list(username, filename, project='h'): :param project: one of 'h', 'g', 'm' :return: ''' - hu = GeektimesUser(username) if project == 'g' else MegamozgUser(username) if project == 'm' else HabraUser( - username) - T = GeektimesTopic if project == 'g' else MegamozgTopic if project == 'm' else HabraTopic + hu = GeektimesUser(username) if project == 'g' else HabraUser(username) + T = GeektimesTopic if project == 'g' else HabraTopic urls = list() favs_id = hu.favorites() for topic_name in favs_id: @@ -197,9 +217,9 @@ import docopt def main(): # {prog} save_posts [--gt|--mm] [-c --save-html --limit=N] <username> <out_dir> params = """Usage: - {prog} save_favs_list [--gt|--mm] <username> <out_file> - {prog} save_favs [--gt|--mm] [-cn --save-html --limit=N] <username> <out_dir> - {prog} save_post [--gt|--mm] [-c --save-html] <topic_id> <out_file> + {prog} save_favs_list [--gt] <username> <out_file> + 
{prog} save_favs [--gt] [-cn --save-html --limit=N] <username> <out_dir> + {prog} save_post [--gt] [-c --save-html] <topic_id> <out_file> {prog} --help Arguments: @@ -209,16 +229,16 @@ def main(): Options: --gt Работать с Geektimes - --mm Работать с Megamozg --save-html Сохранить в HTML (по умолчанию, в PDF) -n, --save-by-name Сохранять с именем, полученным из названия статьи (по умолчанию - по ID статьи) - -c, --with-comments Сохранить вместе с коментариями --limit=N Ограничить количество в N статей """.format(prog=sys.argv[0]) + # -c, --with-comments Сохранить вместе с коментариями + try: args = docopt.docopt(params) # print(args) - project = 'g' if args.get('--gt') else 'm' if args.get('--mm') else 'h' + project = 'g' if args.get('--gt') else 'h' # print (project) # print(args) # debug if args['save_favs_list']: @@ -226,16 +246,16 @@ def main(): return if args['save_favs']: save_all_favs_for_user(args['<username>'], args['<out_dir>'], save_in_html=args['--save-html'], - with_comments=args['--with-comments'], save_by_name=args['--save-by-name'], + with_comments=args.get('--with-comments', False), save_by_name=args['--save-by-name'], limit=args['--limit'], project=project) return if args['save_post']: t_id = args['<topic_id>'] fname = args['<out_file>'] if args['--save-html']: - save_html(t_id, filename=fname, with_comments=args['--with-comments'], project=project) + save_html(t_id, filename=fname, with_comments=args.get('--with-comments', False), project=project) else: - save_pdf(t_id, filename=fname, with_comments=args['--with-comments'], project=project) + save_pdf(t_id, filename=fname, with_comments=args.get('--with-comments', False), project=project) # if args['save_posts']: # print('Not implemented yet') # return |