Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/icoz/habraparse.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: icoz <icoz.vt@gmail.com>, 2017-02-12 16:32:22 +0300
committer: GitHub <noreply@github.com>, 2017-02-12 16:32:22 +0300
commit: c66b4859fa5244a166cefd96761fc8360a5fe3b8 (patch)
tree: 08b8c0be285cdc3e76588ce1a1a98d7e1e30bb1f
parent: 8cd076fe9f58bdab7e45c54a6d5a035eb4fb2264 (diff)
parent: 37f4f3c2bce7bcd3049c3f814fe226cfde7ccde6 (diff)
Merge pull request #13 from kvakanet/master
Исправления от @kvakanet. Изменено составление url с http на https. Исправлено формирование url на автора (и добавлен новый метод author_url). Добавлены методы для получения некоторых метаданных и стилей и добавления их в PDF. Исправлено масштабирование изображений. Чуть-чуть обновил библиотеки для проекта.
-rw-r--r-- README.md 5
-rw-r--r-- habr/topic.py 36
-rwxr-xr-x habraparse.py 22
-rw-r--r-- requirements.txt 6
4 files changed, 58 insertions, 11 deletions
diff --git a/README.md b/README.md
index afbc220..18ee00c 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,11 @@ Usage:
```
Changelog:
+*12.02.2017*
+- Добавлены мета теги для лучшего поиска PDF
+- Исправлена вставка автора(вставляется ссылка на автора рабочая)
+- Решана проблема с маштабированием изображений
+
*01.02.2015*
- исправлены ошибки
- добавлена поддержка Geektimes.ru и Megamozg.ru
diff --git a/habr/topic.py b/habr/topic.py
index 5436d1f..fdf56f9 100644
--- a/habr/topic.py
+++ b/habr/topic.py
@@ -31,7 +31,7 @@ class TMTopic(object):
return self.url
def _getTopicUrl(self, topic_id):
- return str('http://{domain}/post/{tid}/').format(domain=self.domain, tid=topic_id)
+ return str('https://{domain}/post/{tid}/').format(domain=self.domain, tid=topic_id)
def _parseTopic(self):
'''
@@ -53,7 +53,27 @@ class TMTopic(object):
tmp = doc.xpath("//div[@class='author-info__username']//a[@class='author-info__nickname']") or \
doc.xpath("//div[@class='author-info__username']//a[@class='author-info__name']") or \
doc.xpath("//div[@class='author-info__username']//span[@class='author-info__name']")
- self.post['author'] = tmp[0].text if len(tmp) else ''
+ if len(tmp):
+ self.post['author_url'] = ('https://' + self.domain + tmp[0].attrib['href'] )
+ self.post['author'] = tmp[0].text
+ else:
+ self.post['author_url']= ''
+ self.post['author'] = ''
+ ###
+ post_desc = doc.xpath("//meta[@name='description']/@content")
+ self.post['desc'] = post_desc[0].strip("\r\n")
+ #
+ tmp = doc.xpath("//link[@rel='stylesheet'][@media='all']")
+ style = '\n'.join([ str(etree.tostring(st,pretty_print=True, method='html').decode('utf-8')).strip("\n\r") for st in tmp ])
+ self.post['styles'] = style if len(style)>2 else ''
+ #
+ #tmp = doc.xpath("//style[@type='text/css']")
+ #style =''.join([ str(etree.tostring(st,pretty_print=True, method='html').decode('utf-8')) for st in tmp ])
+ #self.post['styles'] += style if (len(style)>2) else ''
+ #
+ post_keywords = doc.xpath("//meta[@name='keywords']/@content")
+ self.post['keywords'] = post_keywords[0].strip("\r\n")
+ ###
# bug in class 'infopanel ' - space added
tmp = doc.xpath(
"//ul[@class='postinfo-panel postinfo-panel_post']//span[@class='oting-wjt__counter-score js-score']")
@@ -109,7 +129,19 @@ class TMTopic(object):
def author(self):
return deepcopy(self.post['author'])
+###
+ def author_url(self):
+ return deepcopy(self.post['author_url'])
+
+ def desc(self):
+ return deepcopy(self.post['desc'])
+
+ def styles(self):
+ return deepcopy(self.post['styles'])
+ def keywords(self):
+ return deepcopy(self.post['keywords'])
+###
def text(self):
return deepcopy(self.post['text'])
diff --git a/habraparse.py b/habraparse.py
index 2f3f8e6..c455089 100755
--- a/habraparse.py
+++ b/habraparse.py
@@ -55,16 +55,23 @@ def prepare_html(topic, with_comments=False):
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<meta charset="UTF-8">
<title>{title}</title>
+ <meta name="author" content="{author}">
+ <meta name="generator" content="habraparse">
+ <meta name="description" content="{desc}">
+ <meta name="keywords" content="{keywords}">
</head>
<body>
<div id="layout">
<div class="inner">
<div class="content_left">
- <div class="post_show">
- <div class="post shortcuts_item">
+ <!-- <div class="post_show"> -->
+ <div class="post__body post__body_full">
+ <!-- <div class="post shortcuts_item"> -->
+ <div class="content html_format">
<h1 class="title"><span class="post_title">{title}</span></h1>
<div class="author">
- <a title="Автор текста" href="http://habrahabr.ru/users/{author}/" >{author}</a>
+ <!-- <a title="Автор текста" href="http://habrahabr.ru/users/{author}/" >{author}</a> -->
+ <a title="Автор текста" href="{author_url}" >{author}</a>
</div>
{text}
</div>
@@ -89,11 +96,13 @@ def prepare_html(topic, with_comments=False):
'''
if with_comments:
html_format = html_head + html_cmnts + html_foot
- html = html_format.format(title=t.title(), author=t.author(), text=t.text(),
+ html = html_format.format(title=t.title(), author=t.author(), author_url=t.author_url(), desc=t.desc(), text=t.text(),
+ addstyle=t.styles(), keywords=t.keywords(),
comments=generate_comments(t.comments(), 0), cmnts_count=t.comments_count())
else:
html_format = html_head + html_foot
- html = html_format.format(title=t.title(), author=t.author(), text=t.text())
+ html = html_format.format(title=t.title(), author=t.author(), author_url=t.author_url(), desc=t.desc(), text=t.text(),
+ addstyle=t.styles(), keywords=t.keywords() )
html = str(html).replace('"//habrastorage.org', '"https://habrastorage.org')
return html
@@ -137,7 +146,8 @@ def save_pdf(topic_id: int, filename: str, with_comments: bool = False, project:
ht = HabraTopic(topic_id)
html = prepare_html(ht, with_comments=with_comments)
- css = CSS(string='@page { size: A4; margin: 1cm !important}')
+ css = CSS(string='@page { size: A4; margin: 1cm; !important;} img { width: 100%; height: auto; !important; }')
+ #css = CSS(string='@page { size: A4 landscape; margin: 1cm !important}')
HTML(string=html).write_pdf(filename, stylesheets=[css])
diff --git a/requirements.txt b/requirements.txt
index ddad416..83aed1c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-cairocffi==0.7.2
-CairoSVG==2.0.0
+cairocffi>=0.7.2
+CairoSVG>=2.0.0
cffi==1.9.1
cssselect==1.0.0
docopt==0.6.2
@@ -11,5 +11,5 @@ Pyphen==0.9.4
requests==2.12.3
six==1.10.0
tinycss==0.4
-WeasyPrint==0.33
+WeasyPrint>=0.34
webencodings==0.5