Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/icoz/habraparse.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoricoz <icoz.vt@gmail.com>2014-05-25 00:43:55 +0400
committericoz <icoz.vt@gmail.com>2014-05-25 00:43:55 +0400
commit44eb76db23eac8c1f49cd5c61d5096c3b14f545f (patch)
tree2aedad1f31a1c4f989e132966398a5943bd9ff4d /habraparse.py
init
Some stuff in habraparse.py written
Diffstat (limited to 'habraparse.py')
-rw-r--r--habraparse.py145
1 files changed, 145 insertions, 0 deletions
diff --git a/habraparse.py b/habraparse.py
new file mode 100644
index 0000000..a94bf56
--- /dev/null
+++ b/habraparse.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+from unittest import TestCase
+
+from lxml import html
+import requests
+
+
+__author__ = 'icoz'
+
+'''
+topic has:
+post_title .text
+post shortcuts_item .id = post_<id>
+hubs[] -> .text
+'''
+
+
+def getUserName(doc):
+ username = ''
+
+
+def getUserCompanyList(doc):
+ return list()
+
+
+def getUserHubList(doc):
+ return list()
+
+
+def parseUserpage(url):
+ user = dict()
+ user_karma = dict()
+ user_profile = dict()
+ user_activity = dict()
+
+ topic = requests.get(url).text
+ doc = html.document_fromstring(topic)
+
+ p_tags = doc.xpath("//div[@class='user_profile']//ul[@id='people-tags']//a/span")
+ rdate = doc.xpath("//div[@class='user_profile']//dd[@class='grey']")[0].text
+
+ user['username'] = doc.xpath("//div[@class='user_header']/h2/a").pop().text
+ user_karma['karma'] = float(doc.xpath("//div[@class='karma']//div[@class='num']").pop().text.replace(',', '.'))
+ user_karma['karma_vote'] = int(doc.xpath("//div[@class='karma']/div[@class='votes']").pop().text.split(' ')[0])
+ user_karma['rating'] = float(doc.xpath("//div[@class='rating']/div[@class='num']").pop().text.replace(',', '.'))
+ user_profile['fullname'] = doc.xpath("//div[@class='user_profile']/div[@class='fullname']").pop().text.strip()
+ user_karma['rating_place'] = int(
+ doc.xpath("//div[@class='user_profile']/div[@class='rating-place']").pop().text.split('-')[0])
+ user_profile['birthday'] = doc.xpath("//div[@class='user_profile']//dd[@class='bday']")[0].text
+ user_profile['country'] = doc.xpath("//div[@class='user_profile']//dd/a[@class='country-name']")[0].text
+ user_profile['region'] = doc.xpath("//div[@class='user_profile']//dd/a[@class='region']")[0].text
+ user_profile['city'] = doc.xpath("//div[@class='user_profile']//dd/a[@class='city']")[0].text
+ user_profile['people_tags'] = [i for i in map(lambda x: x.text, p_tags)]
+ user_profile['registraion_date'] = rdate[:rdate.index('\r\n')]
+
+ user_activity['followers_count'] = int(
+ doc.xpath("//div[@class='stats']/div[@id='followers_count']/a").pop().text.split(' ')[0])
+ user_activity['posts_count'] = int(
+ doc.xpath("//div[@class='stats']/div[@class='item posts_count']/a").pop().text.split(' ')[0])
+ user_activity['comments_count'] = int(
+ doc.xpath("//div[@class='stats']/div[@class='item comments_count']/a").pop().text.split(' ')[0])
+
+ user['company_list'] = getUserCompanyList(doc)
+ user['hubs_list'] = getUserHubList(doc)
+ user['profile'] = user_profile
+ user['activity'] = user_activity
+ user['karma'] = user_karma
+ return user
+
+
+def parseTopic(url):
+ '''
+ returns info
+ '''
+ post = dict()
+ topic = requests.get(url).text
+ doc = html.document_fromstring(topic)
+ post['author'] = getTopicAuthor(doc)
+ post['hubs'] = getTopicHubs(doc)
+ post['title'] = getTopicTitle(doc)
+ post['text'] = getTopicText(doc)
+ post['comments'] = getTopicComments(doc)
+ return post
+
+
+def getTopicHubs(topic):
+ try:
+ hubs = []
+ div = topic.find_class('hubs')[0]
+ for link in div.iterlinks():
+ hubs.append(link[0].text)
+ return hubs
+ except:
+ return None
+
+
+def getTopicAuthor(topic):
+ try:
+ div = topic.find_class('author')[0]
+ links = [l for l in div.iterlinks()]
+ author = links[0][0].text
+ return author
+ except:
+ return None
+
+
+def getTopicText(topic):
+ pass
+
+
+def getTopicTitle(topic):
+ return topic.find_class('post_title')[0].text
+
+
+def getTopicComments(topic):
+ pass
+
+
+def getUserInfo(user):
+ pass
+
+
+def getUrlForUsername(username):
+ return 'http://habrahabr.ru/users/{}/'.format(username)
+
+
+class TestParse(TestCase):
+ def setUp(self):
+ pass
+
+ def test_parseTopic(self):
+ url = 'http://habrahabr.ru/post/208802/'
+ d = parseTopic(url)
+ self.assertEqual(d['author'], 'icoz')
+ d['hubs'].sort()
+ self.assertSequenceEqual(d['hubs'], ['PDF', 'Python'])
+ self.assertEqual(d['title'], 'Экспорт Избранного на Хабре в PDF')
+ # d['comments']
+
+ def test_parseUserpage(self):
+ username = 'icoz'
+ url = getUrlForUsername(username)
+ user = parseUserpage(url)
+ print(user)