Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/icoz/habraparse.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: icoz <icoz.vt@gmail.com> 2014-08-27 23:07:21 +0400
committer: icoz <icoz.vt@gmail.com> 2014-08-27 23:07:21 +0400
commit: 795368dfdafcdbd2297a0b5cc97674f02f9e7e2e (patch)
tree: 1d2fccec199df452e5d1a6e4315659a2683091dd
parent: d6bd7888fa5857719bec4ef0040388092974e1cc (diff)
Refactor:
Made package. Added class HabraUser
-rw-r--r-- habr/__init__.py 1
-rw-r--r-- habr/user.py 139
-rw-r--r-- habraparse.py 117
3 files changed, 140 insertions, 117 deletions
diff --git a/habr/__init__.py b/habr/__init__.py
new file mode 100644
index 0000000..5fbbe4a
--- /dev/null
+++ b/habr/__init__.py
@@ -0,0 +1 @@
+__author__ = 'vlad'
diff --git a/habr/user.py b/habr/user.py
new file mode 100644
index 0000000..af19270
--- /dev/null
+++ b/habr/user.py
@@ -0,0 +1,139 @@
+from unittest import TestCase
+from lxml import html
+import requests
+
+__author__ = 'vlad'
+
+class HabraUser(object):
+ def __init__(self, username):
+ self.username = username
+ self.user = dict()
+ self.user_karma = dict()
+ self.user_profile = dict()
+ self.user_activity = dict()
+
+ req_data = requests.get(self._genUrlForUsername(username)).text
+ self.doc = html.document_fromstring(req_data)
+
+ self.user_favorites = self._getFavorites()
+
+ def favorites(self):
+ pass
+
+ def _genFavoritesUrlByUser(self, username):
+ '''
+ Generates favirites URL using username
+
+ :param id:
+ string with username
+ :return:
+ string with URL
+ '''
+ return self._genUrlForUsername(username)+'favorites/'
+ # 'http://habrahabr.ru/users/{}/favorites'.format(username)
+
+ def _genUrlForUsername(self, username):
+ '''
+ Generates user-page URL using username
+
+ :param id:
+ string with username
+ :return:
+ string with URL
+ '''
+ return 'http://habrahabr.ru/users/{}/'.format(username)
+
+
+ def _getUserCompanyList(self):
+ out = []
+ cmpns = self.doc.xpath("//div[@class='user_profile']/dl[@id='favorite_companies_list']//a")
+ for company in cmpns:
+ out.append((company.text, company.attrib['href']))
+ return out
+
+
+ def _getUserHubList(self):
+ out = []
+ hubs = self.doc.xpath("//div[@class='user_profile']/dl[@class='hubs_list']//a[@class='cross']")
+ for hub in hubs:
+ out.append((hub.text, hub.attrib['href']))
+ return out
+
+
+ def _parseUserpage(self, url):
+
+ p_tags = self.doc.xpath("//div[@class='user_profile']//ul[@id='people-tags']//a/span")
+ registration_date = self.doc.xpath("//div[@class='user_profile']//dd[@class='grey']")[0].text
+
+ self.user['username'] = self.doc.xpath("//div[@class='user_header']/h2/a").pop().text
+ self.user_karma['karma'] = float(self.doc.xpath("//div[@class='karma']//div[@class='num']").pop().text.replace(',', '.'))
+ self.user_karma['karma_vote'] = int(self.doc.xpath("//div[@class='karma']/div[@class='votes']").pop().text.split(' ')[0])
+ self.user_karma['rating'] = float(self.doc.xpath("//div[@class='rating']/div[@class='num']").pop().text.replace(',', '.'))
+ self.user_profile['fullname'] = self.doc.xpath("//div[@class='user_profile']/div[@class='fullname']").pop().text.strip()
+ self.user_karma['rating_place'] = int(
+ self.doc.xpath("//div[@class='user_profile']/div[@class='rating-place']").pop().text.split('-')[0])
+ self.user_profile['birthday'] = self.doc.xpath("//div[@class='user_profile']//dd[@class='bday']")[0].text
+ self.user_profile['country'] = self.doc.xpath("//div[@class='user_profile']//dd/a[@class='country-name']")[0].text
+ self.user_profile['region'] = self.doc.xpath("//div[@class='user_profile']//dd/a[@class='region']")[0].text
+ self.user_profile['city'] = self.doc.xpath("//div[@class='user_profile']//dd/a[@class='city']")[0].text
+ self.user_profile['people_tags'] = [i for i in map(lambda x: x.text, p_tags)]
+ self.user_profile['registraion_date'] = registration_date[:registration_date.index('\r\n')]
+
+ self.user_activity['followers_count'] = int(
+ self.doc.xpath("//div[@class='stats']/div[@id='followers_count']/a").pop().text.split(' ')[0])
+ self.user_activity['posts_count'] = int(
+ self.doc.xpath("//div[@class='stats']/div[@class='item posts_count']/a").pop().text.split(' ')[0])
+ self.user_activity['comments_count'] = int(
+ self.doc.xpath("//div[@class='stats']/div[@class='item comments_count']/a").pop().text.split(' ')[0])
+
+ self.user['company_list'] = self._getUserCompanyList()
+ self.user['hubs_list'] = self._getUserHubList()
+ self.user['profile'] = self.user_profile
+ self.user['activity'] = self.user_activity
+ self.user['karma'] = self.user_karma
+
+
+ def _getFavorites(self, show_progress = False):
+ """
+ Returns list of ('topic_name', 'topic_url')
+
+ :param username:
+ string of username, ex. 'some_user'
+ :return:
+ list of ('topic_name', 'topic_id')
+ """
+ url = self._genFavoritesUrlByUser(self.username)
+ doc = html.document_fromstring(requests.get(url).text)
+ out = dict()
+ pages = int(doc.xpath("//ul[@id='nav-pages']//noindex/a")[-1].attrib['href'][-3:-1])
+ favs = doc.xpath("//div[@class='user_favorites']//a[@class='post_title']")
+ for f in favs:
+ out[f.text] = f.attrib['href'][-7:-1]
+ for p in range(2, pages):
+ url = 'http://habrahabr.ru/users/{0}/favorites/page{1}/'.format(self.username, p)
+ if show_progress:
+ print('parsing page{0}... url={1}'.format(p,url))
+ doc = html.document_fromstring(requests.get(url).text)
+ favs = doc.xpath("//div[@class='user_favorites']//a[@class='post_title']")
+ for f in favs:
+ out[f.text] = f.attrib['href'][-7:-1]
+ return out
+
+import pprint
+
+class Test_HabraUser(TestCase):
+ def setUp(self):
+ pass
+
+ def test_parseUserpage(self):
+ hu = HabraUser('icoz')
+ pp = pprint.PrettyPrinter(indent=4)
+ pp.pprint(hu.user_favorites)
+
+ def test_favs(self):
+ pass
+ # out = getFavForUsername('icoz')
+ # pp = pprint.PrettyPrinter(indent=4)
+ # pp.pprint(out)
+
+
diff --git a/habraparse.py b/habraparse.py
index dbbc8fa..c94c78c 100644
--- a/habraparse.py
+++ b/habraparse.py
@@ -16,62 +16,8 @@ hubs[] -> .text
'''
-def getUserCompanyList(doc):
- out = []
- cmpns = doc.xpath("//div[@class='user_profile']/dl[@id='favorite_companies_list']//a")
- for company in cmpns:
- out.append((company.text, company.attrib['href']))
- return out
-def getUserHubList(doc):
- out = []
- hubs = doc.xpath("//div[@class='user_profile']/dl[@class='hubs_list']//a[@class='cross']")
- for hub in hubs:
- out.append((hub.text, hub.attrib['href']))
- return out
-
-
-def parseUserpage(url):
- user = dict()
- user_karma = dict()
- user_profile = dict()
- user_activity = dict()
-
- topic = requests.get(url).text
- doc = html.document_fromstring(topic)
-
- p_tags = doc.xpath("//div[@class='user_profile']//ul[@id='people-tags']//a/span")
- rdate = doc.xpath("//div[@class='user_profile']//dd[@class='grey']")[0].text
-
- user['username'] = doc.xpath("//div[@class='user_header']/h2/a").pop().text
- user_karma['karma'] = float(doc.xpath("//div[@class='karma']//div[@class='num']").pop().text.replace(',', '.'))
- user_karma['karma_vote'] = int(doc.xpath("//div[@class='karma']/div[@class='votes']").pop().text.split(' ')[0])
- user_karma['rating'] = float(doc.xpath("//div[@class='rating']/div[@class='num']").pop().text.replace(',', '.'))
- user_profile['fullname'] = doc.xpath("//div[@class='user_profile']/div[@class='fullname']").pop().text.strip()
- user_karma['rating_place'] = int(
- doc.xpath("//div[@class='user_profile']/div[@class='rating-place']").pop().text.split('-')[0])
- user_profile['birthday'] = doc.xpath("//div[@class='user_profile']//dd[@class='bday']")[0].text
- user_profile['country'] = doc.xpath("//div[@class='user_profile']//dd/a[@class='country-name']")[0].text
- user_profile['region'] = doc.xpath("//div[@class='user_profile']//dd/a[@class='region']")[0].text
- user_profile['city'] = doc.xpath("//div[@class='user_profile']//dd/a[@class='city']")[0].text
- user_profile['people_tags'] = [i for i in map(lambda x: x.text, p_tags)]
- user_profile['registraion_date'] = rdate[:rdate.index('\r\n')]
-
- user_activity['followers_count'] = int(
- doc.xpath("//div[@class='stats']/div[@id='followers_count']/a").pop().text.split(' ')[0])
- user_activity['posts_count'] = int(
- doc.xpath("//div[@class='stats']/div[@class='item posts_count']/a").pop().text.split(' ')[0])
- user_activity['comments_count'] = int(
- doc.xpath("//div[@class='stats']/div[@class='item comments_count']/a").pop().text.split(' ')[0])
-
- user['company_list'] = getUserCompanyList(doc)
- user['hubs_list'] = getUserHubList(doc)
- user['profile'] = user_profile
- user['activity'] = user_activity
- user['karma'] = user_karma
- return user
-
def parseTopic(url):
'''
@@ -128,16 +74,6 @@ def getUserInfo(user):
pass
-def genUrlForUsername(username):
- '''
- Generates user-page URL using username
-
- :param id:
- string with username
- :return:
- string with URL
- '''
- return 'http://habrahabr.ru/users/{}/'.format(username)
def genTopicUrlByID(id):
'''
@@ -151,47 +87,8 @@ def genTopicUrlByID(id):
return 'http://habrahabr.ru/post/{}/'.format(id)
-def genFavoritesUrlByUser(username):
- '''
- Generates favirites URL using username
-
- :param id:
- string with username
- :return:
- string with URL
- '''
- return genUrlForUsername(username)+'favorites/'
- # 'http://habrahabr.ru/users/{}/favorites'.format(username)
-def getFavForUsername(username):
- """
- Returns list of ('topic_name', 'topic_url')
-
- :param username:
- string of username, ex. 'some_user'
- :return:
- list of ('topic_name', 'topic_id')
- """
- url = genFavoritesUrlByUser(username)
- doc = html.document_fromstring(requests.get(url).text)
- out = []
- pages = int(doc.xpath("//ul[@id='nav-pages']//noindex/a")[-1].attrib['href'][-3:-1])
- favs = doc.xpath("//div[@class='user_favorites']//a[@class='post_title']")
- for f in favs:
- # print(f.text, f.attrib['href'][-7:-1])
- out.append((f.text, f.attrib['href'][-7:-1]))
- for p in range(2, pages):
- url = 'http://habrahabr.ru/users/{0}/favorites/page{1}/'.format(username, p)
- # print('parsing page{0}... url={1}'.format(p,url))
- doc = html.document_fromstring(requests.get(url).text)
- favs = doc.xpath("//div[@class='user_favorites']//a[@class='post_title']")
- for f in favs:
- # print(f.text, f.attrib['href'][-7:-1])
- out.append((f.text, f.attrib['href'][-7:-1]))
- out.sort()
- return out
-
class TestParse(TestCase):
def setUp(self):
@@ -208,17 +105,3 @@ class TestParse(TestCase):
# d['comments']
print(d['text'])
- def test_parseUserpage(self):
- username = 'icoz'
- url = genUrlForUsername(username)
- user = parseUserpage(url)
- print(user)
-
- def test_favs(self):
- import pprint
-
- out = getFavForUsername('icoz')
- pp = pprint.PrettyPrinter(indent=4)
- pp.pprint(out)
-
-