Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/icoz/habraparse.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author icoz <icoz.vt@gmail.com> 2016-06-01 23:56:55 +0300
committer icoz <icoz.vt@gmail.com> 2016-06-01 23:56:55 +0300
commit a12ea54bc620845f4b97c8bc4d99c438211d1621 (patch)
tree 1686245a1307ac80e76b1f2fb85e216e0903c0af
parent d96a9a7c3e5828adaded35f131241a5c3a1cf853 (diff)
Поправлен разбор комментариев (#8). Добавлена выборка времени комментария.
Изменено внутреннее представление комментариев. Теперь это dict(). При генерации разобранной статьи добавлена новая функция generate_comments(), которая рекурсивно строит дерево комментариев.
-rw-r--r-- habr/topic.py 19
-rwxr-xr-x habraparse.py 70
2 files changed, 59 insertions, 30 deletions
diff --git a/habr/topic.py b/habr/topic.py
index dff7fb7..6446a99 100644
--- a/habr/topic.py
+++ b/habr/topic.py
@@ -64,14 +64,27 @@ class TMTopic(object):
self.post['comments'] = []
# bug in class 'comments_list ' - space added
# comments = doc.xpath("//div[@class='comments_list ']//div[@class='comment_item']")
- comments = doc.xpath("//ul[@id='comments-list']//li[@class='comment_item']")
- self.post['comments_count'] = len(comments)
+ # comments = doc.xpath("//ul[@id='comments-list']//li[@class='comment_item']")
# record = (author, text)
authors = list(map(lambda x: x.text, doc.xpath("//ul[@id='comments-list']//a[@class='comment-item__username']")))
cmt_texts = list(map(lambda x: x.text.strip(), doc.xpath("//ul[@id='comments-list']//div[starts-with(@class,'message html_format ')]")))
c_id = list(map(lambda x: int(x.attrib['id'][8:]), doc.xpath("//ul[@id='comments-list']//li[@class='comment_item']")))
p_id = list(map(lambda x: int(x.attrib['data-parent_id']), doc.xpath("//ul[@id='comments-list']//span[@class='parent_id']")))
- self.post['comments'] = tuple(zip(authors, cmt_texts, c_id, p_id))
+ time = list(map(lambda x: x.text.strip(), doc.xpath("//ul[@id='comments-list']//time")))
+ tpl = tuple(zip(authors, cmt_texts, c_id, p_id, time))
+ self.post['comments'] = tuple(
+ map(
+ lambda x:
+ {
+ 'author':x[0],
+ 'text': x[1],
+ 'c_id': x[2],
+ 'p_id': x[3],
+ 'time': x[4],
+ },
+ tpl)
+ )
+ self.post['comments_count'] = len(self.post['comments'])
# self.post['comments'] = list()
# for c in comments:
diff --git a/habraparse.py b/habraparse.py
index ede96c6..8dcdb8d 100755
--- a/habraparse.py
+++ b/habraparse.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
+from pprint import pprint
import sys
from weasyprint import HTML, CSS
@@ -10,11 +11,44 @@ from habr.user import HabraUser, GeektimesUser
__author__ = 'icoz'
+def generate_comments(cmnts, id=0):
+ html_subcmnt = '''
+ <ul class="reply_comments" id="reply_comments_{c_id}">
+ {list_cmnts}
+ </ul>
+ '''
+ html_cmnt = '''
+ <li class="comment_item" id="comment_{c_id}">
+ <span class="parent_id" data-parent_id="{p_id}"></span>
+ <div class="comment_body ">
+ <div class="info comments-list__item comment-item " rel="{c_id}">
+ <span class="comment-item__user-info" data-user-login="{user}">
+ <a href="https://habrahabr.ru/users/{user}/" class="comment-item__username">{user}</a>
+ <time class="comment-item__time_published">{time}</time>
+ </span>
+ <div class="message html_format ">
+ {cmnt_text}
+ </div>
+ </div>
+ </div>
+ '''
+ cmnts2 = tuple(filter(lambda x: x['p_id'] == id, cmnts))
+ if len(cmnts2) == 0: return ''
+ out = ''
+ for c in cmnts2:
+ out += html_cmnt.format(c_id=c['c_id'], p_id=id, user=c['author'], time=c['time'], cmnt_text=c['text'])
+ out += html_subcmnt.format(c_id = c['c_id'], list_cmnts=generate_comments(cmnts, c['c_id']))
+ return out
def prepare_html(topic, with_comments=False):
t = topic
# <link href="http://habrahabr.ru/styles/1412005750/printer.css" rel="stylesheet" media="print" />
# <link href="http://habrahabr.ru/styles/1412005750/assets/global_main.css" rel="stylesheet" media="all" />
+ # worked. 01/06/2016 <link href="http://habrahabr.ru/styles/1412005750/assets/post_common_css.css" rel="stylesheet" media="all" />
+ # <link href="https://habracdn.net/habr/styles/1464788371/_build/global_main.css" rel="stylesheet" media="all" />
+ # <link href="https://habracdn.net/habr/styles/1464788371/_build/company_post_show_common.css" rel="stylesheet" media="all" />
+ # <link href="https://habracdn.net/habr/styles/1464788371/_build/post_common_css.css" rel="stylesheet" media="all" />
+
html_head = '''
<html>
<head>
@@ -38,23 +72,13 @@ def prepare_html(topic, with_comments=False):
</div>
'''
html_cmnts = '''
- <h2>Комментарии</h2>
- <ul id="comments-list">
- {comments}
- </ul>
- '''
- html_cmnt = '''
- <li class="comment_item" id="comment_{c_id}">
- <span class="parent_id" data-parent_id="{p_id}"></span>
- <div class="comment_body ">
- <div class="info comments-list__item comment-item " rel="{c_id}">
- <span class="comment-item__user-info" rel="user-popover" data-user-login="{user}">
- <a href="https://habrahabr.ru/users/{user}/" class="comment-item__username">{user}</a>
- </span>
- <div class="message html_format ">
- {cmnt}
- </div>
- </div>
+ <div class="comments_list " id="comments">
+ <h2 class="title ">
+ Комментарии (<span id="comments_count">{cmnts_count}</span>)
+ </h2>
+ <ul id="comments-list">
+ {comments}
+ </ul>
</div>
'''
html_foot = '''
@@ -65,17 +89,9 @@ def prepare_html(topic, with_comments=False):
</html>
'''
if with_comments:
- cmnts = ''
html_format = html_head + html_cmnts + html_foot
- # print("t.comments()=", len(t.comments()))
- l = 0
- for c in t.comments():
- user, cmnt, c_id, p_id = c
- cmnts += html_cmnt.format(user=user, cmnt=cmnt, c_id=c_id, p_id=p_id)
- # print('cmnts.len=', len(cmnts))
- # print('l=', l)
- html = html_format.format(title=t.title(), author=t.author(), text=t.text(), comments=cmnts)
- # print('html.len=', len(html))
+ html = html_format.format(title=t.title(), author=t.author(), text=t.text(),
+ comments=generate_comments(t.comments(), 0), cmnts_count=t.comments_count())
else:
html_format = html_head + html_foot
html = html_format.format(title=t.title(), author=t.author(), text=t.text())