Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/icoz/habraparse.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/habr
diff options
context:
space:
mode:
author: icoz <icoz.vt@gmail.com> 2015-02-01 21:33:30 +0300
committer: icoz <icoz.vt@gmail.com> 2015-02-01 21:33:30 +0300
commit: b263fe35ced1247452ea63ca06ea6d3bcde7948f (patch)
tree: cabe279b24105168c6abed56e437b89c12285d7e /habr
parent: 03547173c23632acbd5950760dd068c1a4b30ae3 (diff)
Adding support for geektimes, megamozg
HabraTopic -> TMTopic.
Created classes: HabraTopic(TMTopic), GeektimesTopic(TMTopic), MegamozgTopic(TMTopic).
HabraUser -> TMUser.
Created classes: HabraUser(TMUser), GeektimesUser(TMUser), MegamozgUser(TMUser).
Added tests in TMUser._parseUserpage().
Diffstat (limited to 'habr')
-rw-r--r-- habr/topic.py 66
-rw-r--r-- habr/user.py 131
2 files changed, 159 insertions, 38 deletions
diff --git a/habr/topic.py b/habr/topic.py
index ef1dd9d..91e7820 100644
--- a/habr/topic.py
+++ b/habr/topic.py
@@ -12,25 +12,27 @@ class PostDeleted(Exception):
pass
-class HabraTopic(object):
- def __init__(self, topic_id):
+class TMTopic(object):
+ def __init__(self, topic_id, domain='habrahabr.ru'):
'''
init
:param topic_id: str or int with topic id
:return:
'''
+ self.domain = domain
if isinstance(topic_id, (str, int)):
- self.url = self.getTopicUrl(topic_id)
self.post = dict()
self._topic_id = topic_id
+ self.url = self._getTopicUrl(topic_id)
self._parseTopic()
else:
raise TypeError('topic_id must be str or int!')
+ def getTopicUrl(self):
+ return self.url
- @staticmethod
- def getTopicUrl(topic_id):
- return str('http://habrahabr.ru/post/{}/').format(topic_id)
+ def _getTopicUrl(self, topic_id):
+ return str('http://{domain}/post/{tid}/').format(domain=self.domain, tid=topic_id)
def _parseTopic(self):
'''
@@ -84,6 +86,19 @@ class HabraTopic(object):
return self._topic_id
+class HabraTopic(TMTopic):
+ def __init__(self, topic_id):
+ super().__init__(topic_id, domain='habrahabr.ru')
+
+class GeektimesTopic(TMTopic):
+ def __init__(self, topic_id):
+ super().__init__(topic_id, domain='geektimes.ru')
+
+class MegamozgTopic(TMTopic):
+ def __init__(self, topic_id):
+ super().__init__(topic_id, domain='megamozg.ru')
+
+
import pprint
@@ -105,3 +120,42 @@ class TestHabraTopic(TestCase):
pp.pprint(t.title())
pp.pprint(t.post['comments_count'])
pp.pprint(t.post['rating'])
+
+
+class TestGTTopic(TestCase):
+ def test_topic(self):
+ t = GeektimesTopic(243447)
+ pp = pprint.PrettyPrinter(indent=4)
+ pp.pprint(t.author())
+ self.assertEqual(t.author(), 'SOUNDPAL')
+ pp.pprint(t.title())
+ pp.pprint(t.post['comments_count'])
+ pp.pprint(t.post['rating'])
+
+ def test_topic2(self):
+ t = GeektimesTopic(245130)
+ pp = pprint.PrettyPrinter(indent=4)
+ pp.pprint(t.author())
+ self.assertEqual(t.author(), 'Robotex')
+ pp.pprint(t.title())
+ pp.pprint(t.post['comments_count'])
+ pp.pprint(t.post['rating'])
+
+class TestMMTopic(TestCase):
+ def test_topic(self):
+ t = MegamozgTopic(418)
+ pp = pprint.PrettyPrinter(indent=4)
+ pp.pprint(t.author())
+ self.assertEqual(t.author(), 'Kirilkin')
+ pp.pprint(t.title())
+ pp.pprint(t.post['comments_count'])
+ pp.pprint(t.post['rating'])
+
+ def test_topic2(self):
+ t = MegamozgTopic(8568)
+ pp = pprint.PrettyPrinter(indent=4)
+ pp.pprint(t.author())
+ self.assertEqual(t.author(), 'jasiejames')
+ pp.pprint(t.title())
+ pp.pprint(t.post['comments_count'])
+ pp.pprint(t.post['rating'])
diff --git a/habr/user.py b/habr/user.py
index 42d1d37..bae5d34 100644
--- a/habr/user.py
+++ b/habr/user.py
@@ -20,14 +20,16 @@ def get_pages(doc):
return pages
-class HabraUser(object):
- def __init__(self, username, need_favorites=False, need_user_posts=False):
+class TMUser(object):
+ def __init__(self, username, need_favorites=False, need_user_posts=False, domain='habrahabr.ru'):
+ self._domain = domain
self._username = username
self._user = dict()
self._user_karma = dict()
self._user_profile = dict()
self._user_activity = dict()
+ print(self._genUrlForUsername(username))
req_data = requests.get(self._genUrlForUsername(username)).text
self._doc = html.document_fromstring(req_data)
self._parseUserpage()
@@ -79,7 +81,6 @@ class HabraUser(object):
string with URL
'''
return self._genUrlForUsername(username) + 'favorites/'
- # 'http://habrahabr.ru/users/{}/favorites'.format(username)
def _genUrlForUsername(self, username):
'''
@@ -90,7 +91,7 @@ class HabraUser(object):
:return:
string with URL
'''
- return 'http://habrahabr.ru/users/{}/'.format(username)
+ return 'http://{domain}/users/{username}/'.format(domain=self._domain, username=username)
def _getUserCompanyList(self):
@@ -110,36 +111,48 @@ class HabraUser(object):
def _parseUserpage(self):
-
+ # print(self._doc)
p_tags = self._doc.xpath("//div[@class='user_profile']//ul[@id='people-tags']//a/span")
registration_date = self._doc.xpath("//div[@class='user_profile']//dd[@class='grey']")[0].text
- self._user['username'] = self._doc.xpath("//div[@class='user_header']/h2/a").pop().text
- self._user_karma['karma'] = float(
- self._doc.xpath("//div[@class='karma']//div[@class='num']").pop().text.replace(',', '.'))
- self._user_karma['karma_vote'] = int(
- self._doc.xpath("//div[@class='karma']/div[@class='votes']").pop().text.split(' ')[0])
- self._user_karma['rating'] = float(
- self._doc.xpath("//div[@class='rating']/div[@class='num']").pop().text.replace(',', '.'))
- self._user_profile['fullname'] = self._doc.xpath(
- "//div[@class='user_profile']/div[@class='fullname']").pop().text.strip()
- self._user_karma['rating_place'] = int(
- self._doc.xpath("//div[@class='user_profile']/div[@class='rating-place']").pop().text.split('-')[0])
- if len(self._doc.xpath("//div[@class='user_profile']//dd[@class='bday']")):
- self._user_profile['birthday'] = self._doc.xpath("//div[@class='user_profile']//dd[@class='bday']")[0].text
- self._user_profile['country'] = self._doc.xpath("//div[@class='user_profile']//dd/a[@class='country-name']")[
- 0].text
- self._user_profile['region'] = self._doc.xpath("//div[@class='user_profile']//dd/a[@class='region']")[0].text
- self._user_profile['city'] = self._doc.xpath("//div[@class='user_profile']//dd/a[@class='city']")[0].text
+ tmp = self._doc.xpath("//div[@class='user_header']/h2/a")
+ self._user['username'] = tmp.pop().text if len(tmp) else ''
+
+ tmp = self._doc.xpath("//div[@class='karma']//div[@class='num']")
+ self._user_karma['karma'] = float(tmp.pop().text.replace(',', '.')) if len(tmp) else 0.0
+
+ tmp = self._doc.xpath("//div[@class='karma']/div[@class='votes']")
+ self._user_karma['karma_vote'] = int(tmp.pop().text.split(' ')[0]) if len(tmp) else 0
+
+ tmp = self._doc.xpath("//div[@class='rating']/div[@class='num']")
+ self._user_karma['rating'] = float(tmp.pop().text.replace(',', '.')) if len(tmp) else 0.0
+
+ tmp = self._doc.xpath("//div[@class='user_profile']/div[@class='fullname']")
+ self._user_profile['fullname'] = tmp.pop().text.strip() if len(tmp) else ''
+
+ tmp = self._doc.xpath("//div[@class='user_profile']/div[@class='rating-place']")
+ self._user_karma['rating_place'] = int(tmp.pop().text.split('-')[0]) if len(tmp) else 0
+
+ tmp = self._doc.xpath("//div[@class='user_profile']//dd[@class='bday']")
+ self._user_profile['birthday'] = tmp[0].text if len(tmp) else ''
+
+ tmp = self._doc.xpath("//div[@class='user_profile']//dd/a[@class='country-name']")
+ self._user_profile['country'] = tmp[0].text if len(tmp) else ''
+ tmp = self._doc.xpath("//div[@class='user_profile']//dd/a[@class='region']")
+ self._user_profile['region'] = tmp[0].text if len(tmp) else ''
+ tmp = self._doc.xpath("//div[@class='user_profile']//dd/a[@class='city']")
+ self._user_profile['city'] = tmp[0].text if len(tmp) else ''
self._user_profile['people_tags'] = [i for i in map(lambda x: x.text, p_tags)]
self._user_profile['registraion_date'] = registration_date[:registration_date.index('\r\n')]
- self._user_activity['followers_count'] = int(
- self._doc.xpath("//div[@class='stats']/div[@id='followers_count']/a").pop().text.split(' ')[0])
- self._user_activity['posts_count'] = int(
- self._doc.xpath("//div[@class='stats']/div[@class='item posts_count']/a").pop().text.split(' ')[0])
- self._user_activity['comments_count'] = int(
- self._doc.xpath("//div[@class='stats']/div[@class='item comments_count']/a").pop().text.split(' ')[0])
+ tmp = self._doc.xpath("//div[@class='stats']/div[@id='followers_count']/a")
+ self._user_activity['followers_count'] = int(tmp.pop().text.split(' ')[0]) if len(tmp) else 0
+
+ tmp = self._doc.xpath("//div[@class='stats']/div[@class='item posts_count']/a")
+ self._user_activity['posts_count'] = int(tmp.pop().text.split(' ')[0]) if len(tmp) else 0
+
+ tmp = self._doc.xpath("//div[@class='stats']/div[@class='item comments_count']/a")
+ self._user_activity['comments_count'] = int(tmp.pop().text.split(' ')[0]) if len(tmp) else 0
self._user['company_list'] = self._getUserCompanyList()
self._user['hubs_list'] = self._getUserHubList()
@@ -167,7 +180,7 @@ class HabraUser(object):
# topic_id =
out[f.text] = str(f.attrib['href']).split('/')[-2]
for p in range(2, pages):
- url = 'http://habrahabr.ru/users/{0}/favorites/page{1}/'.format(self._username, p)
+ url = 'http://{0}/users/{1}/favorites/page{2}/'.format(self._domain, self._username, p)
# if show_progress:
# print('parsing page{0}... url={1}'.format(p, url))
doc = html.document_fromstring(requests.get(url).text)
@@ -202,6 +215,21 @@ class HabraUser(object):
return out
+class HabraUser(TMUser):
+ def __init__(self, username, need_favorites=False, need_user_posts=False):
+ super().__init__(username, need_favorites, need_user_posts=need_user_posts, domain='habrahabr.ru')
+
+
+class GeektimesUser(TMUser):
+ def __init__(self, username, need_favorites=False, need_user_posts=False):
+ super().__init__(username, need_favorites, need_user_posts=need_user_posts, domain='geektimes.ru')
+
+
+class MegamozgUser(TMUser):
+ def __init__(self, username, need_favorites=False, need_user_posts=False):
+ super().__init__(username, need_favorites, need_user_posts=need_user_posts, domain='megamozg.ru')
+
+
import pprint
@@ -225,7 +253,46 @@ class Test_HabraUser(TestCase):
pp = pprint.PrettyPrinter(indent=4)
pp.pprint('userposts=')
pp.pprint(hu.user_posts())
- # out = getFavForUsername('icoz')
- # pp = pprint.PrettyPrinter(indent=4)
- # pp.pprint(out)
+
+
+class Test_GeektimesUser(TestCase):
+ def setUp(self):
+ self.hu = GeektimesUser('icoz')
+ pass
+
+ def test_parseUserpage(self):
+ pp = pprint.PrettyPrinter(indent=4)
+ pp.pprint(self.hu.activity())
+ pp.pprint(self.hu.profile())
+ pp.pprint(self.hu.karma())
+
+ # def test_favs(self):
+ # pp = pprint.PrettyPrinter(indent=4)
+
+ def test_user_posts(self):
+ hu = GeektimesUser('Zelenyikot')
+ pp = pprint.PrettyPrinter(indent=4)
+ pp.pprint('userposts=')
+ pp.pprint(hu.user_posts())
+
+
+class Test_MegamozgUser(TestCase):
+ def setUp(self):
+ self.hu = MegamozgUser('icoz')
+ pass
+
+ def test_parseUserpage(self):
+ pp = pprint.PrettyPrinter(indent=4)
+ pp.pprint(self.hu.activity())
+ pp.pprint(self.hu.profile())
+ pp.pprint(self.hu.karma())
+
+ # def test_favs(self):
+ # pp = pprint.PrettyPrinter(indent=4)
+
+ def test_user_posts(self):
+ hu = MegamozgUser('Zelenyikot')
+ pp = pprint.PrettyPrinter(indent=4)
+ pp.pprint('userposts=')
+ pp.pprint(hu.user_posts())