diff options
author | icoz <icoz.vt@gmail.com> | 2014-08-27 23:07:21 +0400 |
---|---|---|
committer | icoz <icoz.vt@gmail.com> | 2014-08-27 23:07:21 +0400 |
commit | 795368dfdafcdbd2297a0b5cc97674f02f9e7e2e (patch) | |
tree | 1d2fccec199df452e5d1a6e4315659a2683091dd | |
parent | d6bd7888fa5857719bec4ef0040388092974e1cc (diff) |
Refactor:
Made package.
Added class HabraUser
-rw-r--r-- | habr/__init__.py | 1 | ||||
-rw-r--r-- | habr/user.py | 139 | ||||
-rw-r--r-- | habraparse.py | 117 |
3 files changed, 140 insertions, 117 deletions
diff --git a/habr/__init__.py b/habr/__init__.py new file mode 100644 index 0000000..5fbbe4a --- /dev/null +++ b/habr/__init__.py @@ -0,0 +1 @@ +__author__ = 'vlad' diff --git a/habr/user.py b/habr/user.py new file mode 100644 index 0000000..af19270 --- /dev/null +++ b/habr/user.py @@ -0,0 +1,139 @@ +from unittest import TestCase +from lxml import html +import requests + +__author__ = 'vlad' + +class HabraUser(object): + def __init__(self, username): + self.username = username + self.user = dict() + self.user_karma = dict() + self.user_profile = dict() + self.user_activity = dict() + + req_data = requests.get(self._genUrlForUsername(username)).text + self.doc = html.document_fromstring(req_data) + + self.user_favorites = self._getFavorites() + + def favorites(self): + pass + + def _genFavoritesUrlByUser(self, username): + ''' + Generates favirites URL using username + + :param id: + string with username + :return: + string with URL + ''' + return self._genUrlForUsername(username)+'favorites/' + # 'http://habrahabr.ru/users/{}/favorites'.format(username) + + def _genUrlForUsername(self, username): + ''' + Generates user-page URL using username + + :param id: + string with username + :return: + string with URL + ''' + return 'http://habrahabr.ru/users/{}/'.format(username) + + + def _getUserCompanyList(self): + out = [] + cmpns = self.doc.xpath("//div[@class='user_profile']/dl[@id='favorite_companies_list']//a") + for company in cmpns: + out.append((company.text, company.attrib['href'])) + return out + + + def _getUserHubList(self): + out = [] + hubs = self.doc.xpath("//div[@class='user_profile']/dl[@class='hubs_list']//a[@class='cross']") + for hub in hubs: + out.append((hub.text, hub.attrib['href'])) + return out + + + def _parseUserpage(self, url): + + p_tags = self.doc.xpath("//div[@class='user_profile']//ul[@id='people-tags']//a/span") + registration_date = self.doc.xpath("//div[@class='user_profile']//dd[@class='grey']")[0].text + + self.user['username'] = self.doc.xpath("//div[@class='user_header']/h2/a").pop().text + self.user_karma['karma'] = float(self.doc.xpath("//div[@class='karma']//div[@class='num']").pop().text.replace(',', '.')) + self.user_karma['karma_vote'] = int(self.doc.xpath("//div[@class='karma']/div[@class='votes']").pop().text.split(' ')[0]) + self.user_karma['rating'] = float(self.doc.xpath("//div[@class='rating']/div[@class='num']").pop().text.replace(',', '.')) + self.user_profile['fullname'] = self.doc.xpath("//div[@class='user_profile']/div[@class='fullname']").pop().text.strip() + self.user_karma['rating_place'] = int( + self.doc.xpath("//div[@class='user_profile']/div[@class='rating-place']").pop().text.split('-')[0]) + self.user_profile['birthday'] = self.doc.xpath("//div[@class='user_profile']//dd[@class='bday']")[0].text + self.user_profile['country'] = self.doc.xpath("//div[@class='user_profile']//dd/a[@class='country-name']")[0].text + self.user_profile['region'] = self.doc.xpath("//div[@class='user_profile']//dd/a[@class='region']")[0].text + self.user_profile['city'] = self.doc.xpath("//div[@class='user_profile']//dd/a[@class='city']")[0].text + self.user_profile['people_tags'] = [i for i in map(lambda x: x.text, p_tags)] + self.user_profile['registraion_date'] = registration_date[:registration_date.index('\r\n')] + + self.user_activity['followers_count'] = int( + self.doc.xpath("//div[@class='stats']/div[@id='followers_count']/a").pop().text.split(' ')[0]) + self.user_activity['posts_count'] = int( + self.doc.xpath("//div[@class='stats']/div[@class='item posts_count']/a").pop().text.split(' ')[0]) + self.user_activity['comments_count'] = int( + self.doc.xpath("//div[@class='stats']/div[@class='item comments_count']/a").pop().text.split(' ')[0]) + + self.user['company_list'] = self._getUserCompanyList() + self.user['hubs_list'] = self._getUserHubList() + self.user['profile'] = self.user_profile + self.user['activity'] = self.user_activity + self.user['karma'] = self.user_karma + + + def _getFavorites(self, show_progress = False): + """ + Returns list of ('topic_name', 'topic_url') + + :param username: + string of username, ex. 'some_user' + :return: + list of ('topic_name', 'topic_id') + """ + url = self._genFavoritesUrlByUser(self.username) + doc = html.document_fromstring(requests.get(url).text) + out = dict() + pages = int(doc.xpath("//ul[@id='nav-pages']//noindex/a")[-1].attrib['href'][-3:-1]) + favs = doc.xpath("//div[@class='user_favorites']//a[@class='post_title']") + for f in favs: + out[f.text] = f.attrib['href'][-7:-1] + for p in range(2, pages): + url = 'http://habrahabr.ru/users/{0}/favorites/page{1}/'.format(self.username, p) + if show_progress: + print('parsing page{0}... url={1}'.format(p,url)) + doc = html.document_fromstring(requests.get(url).text) + favs = doc.xpath("//div[@class='user_favorites']//a[@class='post_title']") + for f in favs: + out[f.text] = f.attrib['href'][-7:-1] + return out + +import pprint + +class Test_HabraUser(TestCase): + def setUp(self): + pass + + def test_parseUserpage(self): + hu = HabraUser('icoz') + pp = pprint.PrettyPrinter(indent=4) + pp.pprint(hu.user_favorites) + + def test_favs(self): + pass + # out = getFavForUsername('icoz') + # pp = pprint.PrettyPrinter(indent=4) + # pp.pprint(out) + + diff --git a/habraparse.py b/habraparse.py index dbbc8fa..c94c78c 100644 --- a/habraparse.py +++ b/habraparse.py @@ -16,62 +16,8 @@ hubs[] -> .text ''' -def getUserCompanyList(doc): - out = [] - cmpns = doc.xpath("//div[@class='user_profile']/dl[@id='favorite_companies_list']//a") - for company in cmpns: - out.append((company.text, company.attrib['href'])) - return out -def getUserHubList(doc): - out = [] - hubs = doc.xpath("//div[@class='user_profile']/dl[@class='hubs_list']//a[@class='cross']") - for hub in hubs: - out.append((hub.text, hub.attrib['href'])) - return out - - -def parseUserpage(url): - user = dict() - user_karma = dict() - user_profile = dict() - user_activity = dict() - - topic = requests.get(url).text - doc = html.document_fromstring(topic) - - p_tags = doc.xpath("//div[@class='user_profile']//ul[@id='people-tags']//a/span") - rdate = doc.xpath("//div[@class='user_profile']//dd[@class='grey']")[0].text - - user['username'] = doc.xpath("//div[@class='user_header']/h2/a").pop().text - user_karma['karma'] = float(doc.xpath("//div[@class='karma']//div[@class='num']").pop().text.replace(',', '.')) - user_karma['karma_vote'] = int(doc.xpath("//div[@class='karma']/div[@class='votes']").pop().text.split(' ')[0]) - user_karma['rating'] = float(doc.xpath("//div[@class='rating']/div[@class='num']").pop().text.replace(',', '.')) - user_profile['fullname'] = doc.xpath("//div[@class='user_profile']/div[@class='fullname']").pop().text.strip() - user_karma['rating_place'] = int( - doc.xpath("//div[@class='user_profile']/div[@class='rating-place']").pop().text.split('-')[0]) - user_profile['birthday'] = doc.xpath("//div[@class='user_profile']//dd[@class='bday']")[0].text - user_profile['country'] = doc.xpath("//div[@class='user_profile']//dd/a[@class='country-name']")[0].text - user_profile['region'] = doc.xpath("//div[@class='user_profile']//dd/a[@class='region']")[0].text - user_profile['city'] = doc.xpath("//div[@class='user_profile']//dd/a[@class='city']")[0].text - user_profile['people_tags'] = [i for i in map(lambda x: x.text, p_tags)] - user_profile['registraion_date'] = rdate[:rdate.index('\r\n')] - - user_activity['followers_count'] = int( - doc.xpath("//div[@class='stats']/div[@id='followers_count']/a").pop().text.split(' ')[0]) - user_activity['posts_count'] = int( - doc.xpath("//div[@class='stats']/div[@class='item posts_count']/a").pop().text.split(' ')[0]) - user_activity['comments_count'] = int( - doc.xpath("//div[@class='stats']/div[@class='item comments_count']/a").pop().text.split(' ')[0]) - - user['company_list'] = getUserCompanyList(doc) - user['hubs_list'] = getUserHubList(doc) - user['profile'] = user_profile - user['activity'] = user_activity - user['karma'] = user_karma - return user - def parseTopic(url): ''' @@ -128,16 +74,6 @@ def getUserInfo(user): pass -def genUrlForUsername(username): - ''' - Generates user-page URL using username - - :param id: - string with username - :return: - string with URL - ''' - return 'http://habrahabr.ru/users/{}/'.format(username) def genTopicUrlByID(id): ''' @@ -151,47 +87,8 @@ def genTopicUrlByID(id): return 'http://habrahabr.ru/post/{}/'.format(id) -def genFavoritesUrlByUser(username): - ''' - Generates favirites URL using username - - :param id: - string with username - :return: - string with URL - ''' - return genUrlForUsername(username)+'favorites/' - # 'http://habrahabr.ru/users/{}/favorites'.format(username) -def getFavForUsername(username): - """ - Returns list of ('topic_name', 'topic_url') - - :param username: - string of username, ex. 'some_user' - :return: - list of ('topic_name', 'topic_id') - """ - url = genFavoritesUrlByUser(username) - doc = html.document_fromstring(requests.get(url).text) - out = [] - pages = int(doc.xpath("//ul[@id='nav-pages']//noindex/a")[-1].attrib['href'][-3:-1]) - favs = doc.xpath("//div[@class='user_favorites']//a[@class='post_title']") - for f in favs: - # print(f.text, f.attrib['href'][-7:-1]) - out.append((f.text, f.attrib['href'][-7:-1])) - for p in range(2, pages): - url = 'http://habrahabr.ru/users/{0}/favorites/page{1}/'.format(username, p) - # print('parsing page{0}... url={1}'.format(p,url)) - doc = html.document_fromstring(requests.get(url).text) - favs = doc.xpath("//div[@class='user_favorites']//a[@class='post_title']") - for f in favs: - # print(f.text, f.attrib['href'][-7:-1]) - out.append((f.text, f.attrib['href'][-7:-1])) - out.sort() - return out - class TestParse(TestCase): def setUp(self): @@ -208,17 +105,3 @@ class TestParse(TestCase): # d['comments'] print(d['text']) - def test_parseUserpage(self): - username = 'icoz' - url = genUrlForUsername(username) - user = parseUserpage(url) - print(user) - - def test_favs(self): - import pprint - - out = getFavForUsername('icoz') - pp = pprint.PrettyPrinter(indent=4) - pp.pprint(out) - - |