diff options
author | kvakanet <kvakanet@users.noreply.github.com> | 2017-02-12 10:54:35 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-02-12 10:54:35 +0300 |
commit | 0ff2e01bdea653df17bd4dd118da3c14c2d34177 (patch) | |
tree | caeb15de1181ffef6a6c47f0a7cd81c0b50a3a77 | |
parent | f42f21e910b3d08a612c815c71bf0727a2738553 (diff) |
Добавлены методы для добавления в PDF
-rw-r--r-- | habr/topic.py | 36 |
1 file changed, 34 insertions, 2 deletions
diff --git a/habr/topic.py b/habr/topic.py index 5436d1f..fdf56f9 100644 --- a/habr/topic.py +++ b/habr/topic.py @@ -31,7 +31,7 @@ class TMTopic(object): return self.url def _getTopicUrl(self, topic_id): - return str('http://{domain}/post/{tid}/').format(domain=self.domain, tid=topic_id) + return str('https://{domain}/post/{tid}/').format(domain=self.domain, tid=topic_id) def _parseTopic(self): ''' @@ -53,7 +53,27 @@ class TMTopic(object): tmp = doc.xpath("//div[@class='author-info__username']//a[@class='author-info__nickname']") or \ doc.xpath("//div[@class='author-info__username']//a[@class='author-info__name']") or \ doc.xpath("//div[@class='author-info__username']//span[@class='author-info__name']") - self.post['author'] = tmp[0].text if len(tmp) else '' + if len(tmp): + self.post['author_url'] = ('https://' + self.domain + tmp[0].attrib['href'] ) + self.post['author'] = tmp[0].text + else: + self.post['author_url']= '' + self.post['author'] = '' + ### + post_desc = doc.xpath("//meta[@name='description']/@content") + self.post['desc'] = post_desc[0].strip("\r\n") + # + tmp = doc.xpath("//link[@rel='stylesheet'][@media='all']") + style = '\n'.join([ str(etree.tostring(st,pretty_print=True, method='html').decode('utf-8')).strip("\n\r") for st in tmp ]) + self.post['styles'] = style if len(style)>2 else '' + # + #tmp = doc.xpath("//style[@type='text/css']") + #style =''.join([ str(etree.tostring(st,pretty_print=True, method='html').decode('utf-8')) for st in tmp ]) + #self.post['styles'] += style if (len(style)>2) else '' + # + post_keywords = doc.xpath("//meta[@name='keywords']/@content") + self.post['keywords'] = post_keywords[0].strip("\r\n") + ### # bug in class 'infopanel ' - space added tmp = doc.xpath( "//ul[@class='postinfo-panel postinfo-panel_post']//span[@class='oting-wjt__counter-score js-score']") @@ -109,7 +129,19 @@ class TMTopic(object): def author(self): return deepcopy(self.post['author']) +### + def author_url(self): + return 
deepcopy(self.post['author_url']) + + def desc(self): + return deepcopy(self.post['desc']) + + def styles(self): + return deepcopy(self.post['styles']) + def keywords(self): + return deepcopy(self.post['keywords']) +### def text(self): return deepcopy(self.post['text']) |