1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
# coding: utf-8
# BlackSmith-bot module.
# © simpleApps, 21.05.2012.
# This module contains the main web
# functions for site parsing.
import urllib, urllib2, re
## HTML Unescape and <br> tag replace.
import htmlentitydefs
# Lookup table: HTML entity name (e.g. "amp") -> the unicode character it denotes.
edefs = dict(
    (entity, unichr(codepoint))
    for entity, codepoint in htmlentitydefs.name2codepoint.iteritems()
)
# The module was only needed to build the table above.
del htmlentitydefs
# Matches "&name;", "&#123;" and "&#x7B;" style references;
# group 1 is the text between "&" and ";".
compile_ehtmls = re.compile("&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def uHTML(data):
    """Decode HTML character references in *data* and replace <br> tags.

    Named references are looked up in the module-level ``edefs`` table.
    Decimal (``&#NNN;``) and hexadecimal (``&#xHH;``) references are decoded
    when their code point lies in the range 0..65534.  A reference that
    cannot be decoded is put back into the text as it was.

    Every ``<br>``, ``<br/>``, ``<br />`` and ``</br>`` (lower case only) is
    then replaced with a newline.

    :param data: text to process.
    :return: the processed text.
    """
    if "&" in data:

        def e_sb(co):
            # co is the match object; group 1 is the text between "&" and ";".
            co = co.group(1)
            if not co.startswith("#"):
                return edefs.get(co, "&%s;" % co)
            # "#x.."/"#X.." is hexadecimal, anything else after "#" is decimal.
            if co[1].lower() == "x":
                Char, c06 = co[2:], 16
            else:
                Char, c06 = co[1:], 10
            try:
                Numb = int(Char, c06)
            except ValueError:
                # Not a number in the chosen base; use the fallback below.
                Numb = -1
            # Explicit range check instead of ``assert``: asserts are removed
            # when Python runs with -O, which would skip the validation.
            if -1 < Numb < 65535:
                return unichr(Numb)
            # Fallback kept from the original: try the text after "#" as an
            # entity name, otherwise restore the reference unchanged.
            return edefs.get(Char, "&%s;" % co)

        data = compile_ehtmls.sub(e_sb, data)
    data = re.sub("</?br */?>", "\n", data)
    return data
# TODO: remove this function
def regexp(reg, string, findall = 1):
    """Apply the regular expression *reg* to *string*.

    The pattern is compiled case-insensitively and with "." matching
    newlines.

    :param reg: regular expression source.
    :param string: text to search.
    :param findall: when true, return the result of ``findall``; otherwise
        return the first match object, or None if there is no match.
    """
    compiled = re.compile(reg, re.DOTALL | re.IGNORECASE)
    return compiled.findall(string) if findall else compiled.search(string)
## Get HTML tag.
def getTagData(tag, data, close_tag = 0):
    """Return the text enclosed by the first matching tag pair in *data*.

    The search is case-insensitive and spans newlines.  *tag* and
    *close_tag* are inserted into the regular expression as-is (not escaped).

    :param tag: opening tag name, matched as ``<tag...>``.
    :param data: text to search.
    :param close_tag: closing tag name; defaults to *tag* when falsy.
    :return: the enclosed text, or a single space when nothing is found or
        the enclosed text is empty.
    """
    names = {"tag": tag, "close_tag": close_tag or tag}
    expression = "<%(tag)s.*?>(.*?)</%(close_tag)s>" % names
    found = re.search(expression, data, re.IGNORECASE | re.S)
    if found is None:
        return " "
    return found.group(1) or " "
def getTagArg(tag, argv, data, close_tag = 0):
    """Return the value of attribute *argv* of the first matching tag in *data*.

    The search is case-insensitive and spans newlines.  *tag*, *argv* and
    *close_tag* are inserted into the regular expression as-is (not escaped).

    NOTE(review): the pattern demands a double quote immediately before the
    ">" that closes the opening tag, so single-quoted or unquoted values only
    match if a '">' sequence occurs later in the text -- confirm whether
    this is intended.

    :param tag: opening tag name.
    :param argv: attribute name.
    :param data: text to search.
    :param close_tag: closing tag name; defaults to *tag* when falsy.
    :return: the attribute value, or a single space when nothing is found or
        the value is empty.
    """
    names = {"tag": tag, "argv": argv, "close_tag": close_tag or tag}
    expression = "<%(tag)s.? %(argv)s=[\"']?(.*?)[\"']?\">(.*?)</%(close_tag)s>" % names
    found = re.search(expression, data, re.IGNORECASE | re.DOTALL)
    if found is None:
        return " "
    return found.group(1) or " "
def stripTags(data, subBy = str(), pattern = "<[^<>]+>"):
    """Replace every match of *pattern* in *data* with *subBy*.

    With the defaults this removes anything that looks like a tag, i.e.
    "<", one or more characters other than "<" and ">", then ">".

    :param data: text to process.
    :param subBy: replacement for each match (empty string by default).
    :param pattern: regular expression describing what to replace.
    :return: the text with all matches replaced.
    """
    return re.sub(pattern, subBy, data)
## Format size.
def byteFormat(size):
    """Render a byte count as a short human-readable string.

    Values below 1024 are shown as a whole number followed by "b".  Larger
    values are divided by 1024 repeatedly and shown with two decimals and
    the unit "kB", "MB" or "GB"; "GB" is the largest unit used.

    :param size: number of bytes.
    :return: the formatted string, e.g. "512b" or "1.50kB".
    """
    if size < 1024:
        return '%sb' % int(size)
    units = ('kB', 'MB', 'GB')
    last = len(units) - 1
    scaled = size / 1024.0
    step = 0
    # Keep dividing until the value drops below 1024 or the largest unit is reached.
    while step < last and not scaled < 1024:
        scaled = scaled / 1024.0
        step += 1
    return '%.2f%s' % (scaled, units[step])
|