From f42f21e910b3d08a612c815c71bf0727a2738553 Mon Sep 17 00:00:00 2001 From: kvakanet Date: Sun, 12 Feb 2017 13:50:35 +0600 Subject: Add files via upload --- habraparse.py | 22 ++++++++++++++++------ requirements.txt | 6 +++--- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/habraparse.py b/habraparse.py index 2f3f8e6..c455089 100755 --- a/habraparse.py +++ b/habraparse.py @@ -55,16 +55,23 @@ def prepare_html(topic, with_comments=False): {title} + + + +
-
-
+ +
+ +

{title}

{text}
@@ -89,11 +96,13 @@ def prepare_html(topic, with_comments=False): ''' if with_comments: html_format = html_head + html_cmnts + html_foot - html = html_format.format(title=t.title(), author=t.author(), text=t.text(), + html = html_format.format(title=t.title(), author=t.author(), author_url=t.author_url(), desc=t.desc(), text=t.text(), + addstyle=t.styles(), keywords=t.keywords(), comments=generate_comments(t.comments(), 0), cmnts_count=t.comments_count()) else: html_format = html_head + html_foot - html = html_format.format(title=t.title(), author=t.author(), text=t.text()) + html = html_format.format(title=t.title(), author=t.author(), author_url=t.author_url(), desc=t.desc(), text=t.text(), + addstyle=t.styles(), keywords=t.keywords() ) html = str(html).replace('"//habrastorage.org', '"https://habrastorage.org') return html @@ -137,7 +146,8 @@ def save_pdf(topic_id: int, filename: str, with_comments: bool = False, project: ht = HabraTopic(topic_id) html = prepare_html(ht, with_comments=with_comments) - css = CSS(string='@page { size: A4; margin: 1cm !important}') + css = CSS(string='@page { size: A4; margin: 1cm; !important;} img { width: 100%; height: auto; !important; }') + #css = CSS(string='@page { size: A4 landscape; margin: 1cm !important}') HTML(string=html).write_pdf(filename, stylesheets=[css]) diff --git a/requirements.txt b/requirements.txt index ddad416..83aed1c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -cairocffi==0.7.2 -CairoSVG==2.0.0 +cairocffi>=0.7.2 +CairoSVG>=2.0.0 cffi==1.9.1 cssselect==1.0.0 docopt==0.6.2 @@ -11,5 +11,5 @@ Pyphen==0.9.4 requests==2.12.3 six==1.10.0 tinycss==0.4 -WeasyPrint==0.33 +WeasyPrint>=0.34 webencodings==0.5 -- cgit v1.2.3 From 0ff2e01bdea653df17bd4dd118da3c14c2d34177 Mon Sep 17 00:00:00 2001 From: kvakanet Date: Sun, 12 Feb 2017 13:54:35 +0600 Subject: =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB=D0=B5=D0=BD=D1=8B?= =?UTF-8?q?=20=D0=BC=D0=B5=D1=82=D0=BE=D0=B4=D1=8B=20=D0=B4=D0=BB=D1=8F=20?= 
=?UTF-8?q?=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D1=8F=20?= =?UTF-8?q?=D0=B2=20PDF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- habr/topic.py | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/habr/topic.py b/habr/topic.py index 5436d1f..fdf56f9 100644 --- a/habr/topic.py +++ b/habr/topic.py @@ -31,7 +31,7 @@ class TMTopic(object): return self.url def _getTopicUrl(self, topic_id): - return str('http://{domain}/post/{tid}/').format(domain=self.domain, tid=topic_id) + return str('https://{domain}/post/{tid}/').format(domain=self.domain, tid=topic_id) def _parseTopic(self): ''' @@ -53,7 +53,27 @@ class TMTopic(object): tmp = doc.xpath("//div[@class='author-info__username']//a[@class='author-info__nickname']") or \ doc.xpath("//div[@class='author-info__username']//a[@class='author-info__name']") or \ doc.xpath("//div[@class='author-info__username']//span[@class='author-info__name']") - self.post['author'] = tmp[0].text if len(tmp) else '' + if len(tmp): + self.post['author_url'] = ('https://' + self.domain + tmp[0].attrib['href'] ) + self.post['author'] = tmp[0].text + else: + self.post['author_url']= '' + self.post['author'] = '' + ### + post_desc = doc.xpath("//meta[@name='description']/@content") + self.post['desc'] = post_desc[0].strip("\r\n") + # + tmp = doc.xpath("//link[@rel='stylesheet'][@media='all']") + style = '\n'.join([ str(etree.tostring(st,pretty_print=True, method='html').decode('utf-8')).strip("\n\r") for st in tmp ]) + self.post['styles'] = style if len(style)>2 else '' + # + #tmp = doc.xpath("//style[@type='text/css']") + #style =''.join([ str(etree.tostring(st,pretty_print=True, method='html').decode('utf-8')) for st in tmp ]) + #self.post['styles'] += style if (len(style)>2) else '' + # + post_keywords = doc.xpath("//meta[@name='keywords']/@content") + self.post['keywords'] = post_keywords[0].strip("\r\n") + ### # bug in class 
'infopanel ' - space added tmp = doc.xpath( "//ul[@class='postinfo-panel postinfo-panel_post']//span[@class='oting-wjt__counter-score js-score']") @@ -109,7 +129,19 @@ class TMTopic(object): def author(self): return deepcopy(self.post['author']) +### + def author_url(self): + return deepcopy(self.post['author_url']) + + def desc(self): + return deepcopy(self.post['desc']) + + def styles(self): + return deepcopy(self.post['styles']) + def keywords(self): + return deepcopy(self.post['keywords']) +### def text(self): return deepcopy(self.post['text']) -- cgit v1.2.3 From 37f4f3c2bce7bcd3049c3f814fe226cfde7ccde6 Mon Sep 17 00:00:00 2001 From: kvakanet Date: Sun, 12 Feb 2017 14:05:53 +0600 Subject: Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index afbc220..18ee00c 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,11 @@ Usage: ``` Changelog: +*12.02.2017* +- Добавлены мета теги для лучшего поиска PDF +- Исправлена вставка автора (вставляется ссылка на автора рабочая) +- Решена проблема с масштабированием изображений + *01.02.2015* - исправлены ошибки - добавлена поддержка Geektimes.ru и Megamozg.ru -- cgit v1.2.3