Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/icoz/habraparse.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author icoz <icoz.vt@gmail.com> 2016-05-28 16:38:30 +0300
committer icoz <icoz.vt@gmail.com> 2016-05-28 16:38:30 +0300
commit b5c9fa7ab09dd1e7013c6ab39de29b233e01f16f (patch)
tree 5401f0e3d3c25214622e3585a5449a4ea7cb2a6a
parent 55d2963b8a4943973daeae14bd147d235a9d48eb (diff)
В связи с закрытием Мегамозга, его код и его тесты закомментированы.
В связи с изменениями разметки Хабра внесены следующие изменения в получение следующих данных: - заголовков (у Хабра и Гиктаймса они теперь лежат по-разному); - комментариев (#7). Добавлены дополнительные тесты. С комментариями остались проблемы, поэтому пока их сохранение недоступно (хотя код есть - кто может, помогите). Поправлен habr/__init__.py.
-rw-r--r-- habr/__init__.py 6
-rw-r--r-- habr/topic.py 81
-rw-r--r-- habr/user.py 49
-rwxr-xr-x habraparse.py 68
4 files changed, 126 insertions, 78 deletions
diff --git a/habr/__init__.py b/habr/__init__.py
index 66c4d6d..d1cd45c 100644
--- a/habr/__init__.py
+++ b/habr/__init__.py
@@ -1,8 +1,8 @@
-from .user import HabraUser
-from .topic import HabraTopic
+from .user import HabraUser, GeektimesUser
+from .topic import HabraTopic, GeektimesTopic
__author__ = 'icoz'
__name__ = 'habraparse'
__version__ = '0.0.1'
-__all__ = [HabraTopic, HabraUser]
+__all__ = [HabraTopic, HabraUser, GeektimesTopic, GeektimesUser]
diff --git a/habr/topic.py b/habr/topic.py
index bac1c1a..8bbe750 100644
--- a/habr/topic.py
+++ b/habr/topic.py
@@ -46,10 +46,10 @@ class TMTopic(object):
hubs = doc.xpath("//div[@class='hubs']/a")
for h in hubs:
self.post['hubs'].append((h.text, h.attrib['href']))
- post_title = doc.xpath('//h1/span[@class="post_title"]')
+ post_title = doc.xpath('//h1[@class="post__title"]/span')
if len(post_title) == 0:
raise PostDeleted
- self.post['title'] = post_title[0].text
+ self.post['title'] = post_title
tmp = doc.xpath("//div[@class='author-info__username']//a[@class='author-info__nickname']") or \
doc.xpath("//div[@class='author-info__username']//a[@class='author-info__name']") or \
doc.xpath("//div[@class='author-info__username']//span[@class='author-info__name']")
@@ -63,10 +63,29 @@ class TMTopic(object):
if len(tmp) else ''
self.post['comments'] = []
# bug in class 'comments_list ' - space added
- comments = doc.xpath("//div[@class='comments_list ']//div[@class='comment_item']")
+ # comments = doc.xpath("//div[@class='comments_list ']//div[@class='comment_item']")
+ comments = doc.xpath("//ul[@id='comments-list']//li[@class='comment_item']")
self.post['comments_count'] = len(comments)
- for c in comments:
- self.post['comments'].append(etree.tostring(c, pretty_print=True, method='html').decode('utf-8'))
+ # record = (author, text)
+ authors = map(lambda x: x.text, doc.xpath("//a[@class='comment-item__username']"))
+ cmt_texts = map(lambda x: x.text, doc.xpath("//div[@class='message html_format ']"))
+ c_id = map(lambda x: x.attrib['id'][8:], doc.xpath("//li[@class='comment_item']"))
+ p_id = map(lambda x: x.attrib['data-parent_id'], doc.xpath("//span[@class='parent_id']"))
+ self.post['comments'] = tuple(zip(authors, cmt_texts, c_id, p_id))
+
+ # self.post['comments'] = list()
+ # for c in comments:
+ # # self.post['comments'].append(etree.tostring(c, pretty_print=True, method='html').decode('utf-8'))
+ # # record = (author, text, c_id, parent_c_id)
+ # author = c.xpath("//a[@class='comment-item__username']")
+ # if len(author): author = author[0].text
+ # else: author = '<anonymous>'
+ # text = c.xpath("//div[@class='message html_format ']")
+ # if text != '': text = text[0].text
+ # c_id = c.attrib['id']
+ # p_id = c.xpath("//span[@class='parent_id']")[0]
+ # if p_id != '': p_id = p_id.attrib['data-parent_id']
+ # self.post['comments'].append((author, text, c_id, p_id))
def author(self):
return deepcopy(self.post['author'])
@@ -93,16 +112,19 @@ class TMTopic(object):
class HabraTopic(TMTopic):
def __init__(self, topic_id):
super().__init__(topic_id, domain='habrahabr.ru')
+ self.post['title'] = self.post['title'][1].text
class GeektimesTopic(TMTopic):
def __init__(self, topic_id):
super().__init__(topic_id, domain='geektimes.ru')
+ self.post['title'] = self.post['title'][0].text
-class MegamozgTopic(TMTopic):
- def __init__(self, topic_id):
- super().__init__(topic_id, domain='megamozg.ru')
+# R.I.P.
+# class MegamozgTopic(TMTopic):
+# def __init__(self, topic_id):
+# super().__init__(topic_id, domain='megamozg.ru')
import pprint
@@ -115,6 +137,7 @@ class TestHabraTopic(TestCase):
pp.pprint(t.author())
self.assertEqual(t.author(), 'Яндекс')
pp.pprint(t.title())
+ self.assertEqual(t.title(), 'Memory management в ядре Linux. Семинар в Яндексе')
pp.pprint(t.post['comments_count'])
pp.pprint(t.post['rating'])
@@ -123,9 +146,12 @@ class TestHabraTopic(TestCase):
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(t.author())
self.assertEqual(t.author(), '@icoz')
+ self.assertEqual(t.title(), 'Экспорт Избранного на Хабре в PDF')
pp.pprint(t.title())
pp.pprint(t.post['comments_count'])
pp.pprint(t.post['rating'])
+ self.assertEqual(t.comments()[0][0], 'keccak')
+ self.assertEqual(t.comments()[1][0], 'icoz')
class TestGTTopic(TestCase):
@@ -135,6 +161,7 @@ class TestGTTopic(TestCase):
pp.pprint(t.author())
self.assertEqual(t.author(), 'Soundpal')
pp.pprint(t.title())
+ self.assertEqual(t.title(), 'На что влияет сопротивление наушников')
pp.pprint(t.post['comments_count'])
pp.pprint(t.post['rating'])
@@ -144,25 +171,25 @@ class TestGTTopic(TestCase):
pp.pprint(t.author())
self.assertEqual(t.author(), '@Robotex')
pp.pprint(t.title())
+ self.assertEqual(t.title(), 'Autodesk и Voxel8 делают 3D-печать электроники реальностью')
pp.pprint(t.post['comments_count'])
pp.pprint(t.post['rating'])
-
-class TestMMTopic(TestCase):
- def test_topic(self):
- t = MegamozgTopic(418)
- pp = pprint.PrettyPrinter(indent=4)
- pp.pprint(t.author())
- self.assertEqual(t.author(), '@Kirilkin')
- pp.pprint(t.title())
- pp.pprint(t.post['comments_count'])
- pp.pprint(t.post['rating'])
-
- def test_topic2(self):
- t = MegamozgTopic(8568)
- pp = pprint.PrettyPrinter(indent=4)
- pp.pprint(t.author())
- self.assertEqual(t.author(), '@jasiejames')
- pp.pprint(t.title())
- pp.pprint(t.post['comments_count'])
- pp.pprint(t.post['rating'])
+# class TestMMTopic(TestCase):
+# def test_topic(self):
+# t = MegamozgTopic(418)
+# pp = pprint.PrettyPrinter(indent=4)
+# pp.pprint(t.author())
+# self.assertEqual(t.author(), '@Kirilkin')
+# pp.pprint(t.title())
+# pp.pprint(t.post['comments_count'])
+# pp.pprint(t.post['rating'])
+#
+# def test_topic2(self):
+# t = MegamozgTopic(8568)
+# pp = pprint.PrettyPrinter(indent=4)
+# pp.pprint(t.author())
+# self.assertEqual(t.author(), '@jasiejames')
+# pp.pprint(t.title())
+# pp.pprint(t.post['comments_count'])
+# pp.pprint(t.post['rating'])
diff --git a/habr/user.py b/habr/user.py
index 1721697..f6cda91 100644
--- a/habr/user.py
+++ b/habr/user.py
@@ -237,9 +237,10 @@ class GeektimesUser(TMUser):
super().__init__(username, need_favorites, need_user_posts=need_user_posts, domain='geektimes.ru')
-class MegamozgUser(TMUser):
- def __init__(self, username, need_favorites=False, need_user_posts=False):
- super().__init__(username, need_favorites, need_user_posts=need_user_posts, domain='megamozg.ru')
+# R.I.P.
+# class MegamozgUser(TMUser):
+# def __init__(self, username, need_favorites=False, need_user_posts=False):
+# super().__init__(username, need_favorites, need_user_posts=need_user_posts, domain='megamozg.ru')
import pprint
@@ -297,24 +298,24 @@ class Test_GeektimesUser(TestCase):
pp.pprint('userposts=')
pp.pprint(hu.user_posts())
-
-class Test_MegamozgUser(TestCase):
- def setUp(self):
- self.hu = MegamozgUser('icoz')
- pass
-
- def test_parseUserpage(self):
- pp = pprint.PrettyPrinter(indent=4)
- pp.pprint(self.hu.activity())
- pp.pprint(self.hu.profile())
- pp.pprint(self.hu.karma())
-
- # def test_favs(self):
- # pp = pprint.PrettyPrinter(indent=4)
-
- def test_user_posts(self):
- hu = MegamozgUser('Zelenyikot')
- pp = pprint.PrettyPrinter(indent=4)
- pp.pprint('userposts=')
- pp.pprint(hu.user_posts())
-
+#
+# class Test_MegamozgUser(TestCase):
+# def setUp(self):
+# self.hu = MegamozgUser('icoz')
+# pass
+#
+# def test_parseUserpage(self):
+# pp = pprint.PrettyPrinter(indent=4)
+# pp.pprint(self.hu.activity())
+# pp.pprint(self.hu.profile())
+# pp.pprint(self.hu.karma())
+#
+# # def test_favs(self):
+# # pp = pprint.PrettyPrinter(indent=4)
+#
+# def test_user_posts(self):
+# hu = MegamozgUser('Zelenyikot')
+# pp = pprint.PrettyPrinter(indent=4)
+# pp.pprint('userposts=')
+# pp.pprint(hu.user_posts())
+#
diff --git a/habraparse.py b/habraparse.py
index 4e948c1..3b69d2d 100755
--- a/habraparse.py
+++ b/habraparse.py
@@ -5,8 +5,8 @@ import sys
from weasyprint import HTML, CSS
-from habr.topic import HabraTopic, PostDeleted, MegamozgTopic, GeektimesTopic
-from habr.user import HabraUser, GeektimesUser, MegamozgUser
+from habr.topic import HabraTopic, PostDeleted, GeektimesTopic
+from habr.user import HabraUser, GeektimesUser
__author__ = 'icoz'
@@ -38,10 +38,24 @@ def prepare_html(topic, with_comments=False):
</div>
'''
html_cmnts = '''
- <div id="comments" class="comments_list">
- <h2 class="title">Комментарии</h2>
- {comments}
+ <h2>Комментарии</h2>
+ <ul id="comments-list">
+ {comments}
+ </ul>
+ '''
+ html_cmnt = '''
+ <li class="comment_item" id="comment_{c_id}">
+ <span class="parent_id" data-parent_id="{p_id}"></span>
+ <div class="comment_body ">
+ <div class="info comments-list__item comment-item " rel="{c_id}">
+ <span class="comment-item__user-info" rel="user-popover" data-user-login="{user}">
+ <a href="https://habrahabr.ru/users/{user}/" class="comment-item__username">{user}</a>
+ </span>
+ <div class="message html_format ">
+ {cmnt}
</div>
+ </div>
+ </div>
'''
html_foot = '''
</div>
@@ -53,9 +67,15 @@ def prepare_html(topic, with_comments=False):
if with_comments:
cmnts = ''
html_format = html_head + html_cmnts + html_foot
+ # print("t.comments()=", len(t.comments()))
+ l = 0
for c in t.comments():
- cmnts += '{}\n'.format(c)
+ user, cmnt, c_id, p_id = c
+ cmnts += html_cmnt.format(user=user, cmnt=cmnt, c_id=c_id, p_id=p_id)
+ # print('cmnts.len=', len(cmnts))
+ # print('l=', l)
html = html_format.format(title=t.title(), author=t.author(), text=t.text(), comments=cmnts)
+ # print('html.len=', len(html))
else:
html_format = html_head + html_foot
html = html_format.format(title=t.title(), author=t.author(), text=t.text())
@@ -73,10 +93,11 @@ def save_html(topic_id, filename, with_comments=False, project='h'):
with open(filename, "wt") as f:
if project == 'g':
ht = GeektimesTopic(topic_id)
- elif project == 'm':
- ht = MegamozgTopic(topic_id)
+ # elif project == 'm':
+ # ht = MegamozgTopic(topic_id)
else:
ht = HabraTopic(topic_id)
+ print('comments_cnt=', ht.comments_count())
html = prepare_html(ht, with_comments=with_comments)
f.write(html)
# TODO: get all images and css
@@ -99,8 +120,8 @@ def save_pdf(topic_id, filename, with_comments=False, project='h'):
return
if project == 'g':
ht = GeektimesTopic(topic_id)
- elif project == 'm':
- ht = MegamozgTopic(topic_id)
+ # elif project == 'm':
+ # ht = MegamozgTopic(topic_id)
else:
ht = HabraTopic(topic_id)
@@ -118,8 +139,8 @@ def save_all_favs_for_user(username, out_dir, save_in_html=True, with_comments=F
# hu = HabraUser(username, need_favorites=True)
if project == 'g':
hu = GeektimesUser(username)
- elif project == 'm':
- hu = MegamozgUser(username)
+ # elif project == 'm':
+ # hu = MegamozgUser(username)
else:
hu = HabraUser(username)
# hu = GeektimesUser(username) if project == 'g' else MegamozgUser(username) if project == 'm' else HabraUser(username)
@@ -176,9 +197,8 @@ def create_url_list(username, filename, project='h'):
:param project: one of 'h', 'g', 'm'
:return:
'''
- hu = GeektimesUser(username) if project == 'g' else MegamozgUser(username) if project == 'm' else HabraUser(
- username)
- T = GeektimesTopic if project == 'g' else MegamozgTopic if project == 'm' else HabraTopic
+ hu = GeektimesUser(username) if project == 'g' else HabraUser(username)
+ T = GeektimesTopic if project == 'g' else HabraTopic
urls = list()
favs_id = hu.favorites()
for topic_name in favs_id:
@@ -197,9 +217,9 @@ import docopt
def main():
# {prog} save_posts [--gt|--mm] [-c --save-html --limit=N] <username> <out_dir>
params = """Usage:
- {prog} save_favs_list [--gt|--mm] <username> <out_file>
- {prog} save_favs [--gt|--mm] [-cn --save-html --limit=N] <username> <out_dir>
- {prog} save_post [--gt|--mm] [-c --save-html] <topic_id> <out_file>
+ {prog} save_favs_list [--gt] <username> <out_file>
+ {prog} save_favs [--gt] [-cn --save-html --limit=N] <username> <out_dir>
+ {prog} save_post [--gt] [-c --save-html] <topic_id> <out_file>
{prog} --help
Arguments:
@@ -209,16 +229,16 @@ def main():
Options:
--gt Работать с Geektimes
- --mm Работать с Megamozg
--save-html Сохранить в HTML (по умолчанию, в PDF)
-n, --save-by-name Сохранять с именем, полученным из названия статьи (по умолчанию - по ID статьи)
- -c, --with-comments Сохранить вместе с коментариями
--limit=N Ограничить количество в N статей
""".format(prog=sys.argv[0])
+ # -c, --with-comments Сохранить вместе с коментариями
+
try:
args = docopt.docopt(params)
# print(args)
- project = 'g' if args.get('--gt') else 'm' if args.get('--mm') else 'h'
+ project = 'g' if args.get('--gt') else 'h'
# print (project)
# print(args) # debug
if args['save_favs_list']:
@@ -226,16 +246,16 @@ def main():
return
if args['save_favs']:
save_all_favs_for_user(args['<username>'], args['<out_dir>'], save_in_html=args['--save-html'],
- with_comments=args['--with-comments'], save_by_name=args['--save-by-name'],
+ with_comments=args.get('--with-comments', False), save_by_name=args['--save-by-name'],
limit=args['--limit'], project=project)
return
if args['save_post']:
t_id = args['<topic_id>']
fname = args['<out_file>']
if args['--save-html']:
- save_html(t_id, filename=fname, with_comments=args['--with-comments'], project=project)
+ save_html(t_id, filename=fname, with_comments=args.get('--with-comments', False), project=project)
else:
- save_pdf(t_id, filename=fname, with_comments=args['--with-comments'], project=project)
+ save_pdf(t_id, filename=fname, with_comments=args.get('--with-comments', False), project=project)
# if args['save_posts']:
# print('Not implemented yet')
# return