1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
# -*- coding:utf-8 -*-
# Python 2 script: scrape a blog page for its advertised RSS/Atom feeds
# and its blogroll (friend links), writing the latter to link.txt.
import libxml2dom, sys, requests
# Py2-only hack: re-expose setdefaultencoding (deleted by site.py) so
# implicit str<->unicode conversions use UTF-8 instead of ascii.
reload(sys)
sys.setdefaultencoding('utf-8')
def affect(points, keep_ratio, ratio, power):
    """Dampen a score: protect `keep_ratio` of it, scale the remainder by ratio**power.

    A ratio at or above 1.0 leaves the score untouched.
    """
    if ratio >= 1.0:
        return points
    protected = points * keep_ratio
    return protected + (points - protected) * ratio ** power
def get_root_domain(host):
    """Return the registrable root domain of `host`.

    Keeps the last two labels, or the last three when the second-level
    label is a common shared SLD (e.g. 'example.co.uk' -> 'example.co.uk',
    'www.example.com' -> 'example.com').

    Hosts with fewer than two labels (e.g. 'localhost') are returned
    unchanged; the original raised IndexError on them.
    """
    parts = host.split('.')
    if len(parts) < 2:
        # Nothing to trim — bare hostname such as 'localhost'.
        return host
    if parts[-2] in ('com', 'edu', 'net', 'gov', 'org', 'co'):
        return '.'.join(parts[-3:])
    return '.'.join(parts[-2:])
def get_point(div, root_domain):
    '''Score `div` as a candidate blogroll container.

    Higher scores mean the element is dense with external links and light
    on plain text — typical of a friend-link list. Returns 0 when the
    element does not look like a blogroll.
    '''
    a_elements = div.getElementsByTagName("a")
    # Counters start at 0.001 to avoid division by zero in the ratios below.
    link_density = linktext_count = totaltext_count = 0.001
    container_count = innerlink_count = 0.001
    for a in a_elements:
        href = a.getAttribute('href')
        # Missing/non-http hrefs and links back into the same site count as internal.
        if not href or not href.lower().startswith('http') or root_domain in href:
            innerlink_count += 1
            continue
        # Query strings suggest navigation/tracking URLs, not blogroll entries.
        if '?' in href:
            continue
        link_density += 1
        if a.childNodes:
            # Only the first text child's length is counted — presumably the
            # anchor label; TODO confirm against libxml2dom node model.
            if a.childNodes.length>0 and a.childNodes[0].value: linktext_count += len(a.childNodes[0].value)
        # target=_blank is a weak extra hint of an outbound link.
        if '_blank' == a.getAttribute('target'):
            link_density += 1
    for t in div.childNodes:
        if t.nodeValue:
            totaltext_count += len(t.nodeValue)
        else:
            # Child with no text value is treated as a nested container element.
            container_count += 1
    # Base requirement: more external links than internal ones.
    points = (link_density - innerlink_count) * 1000
    if points < 0: return 0
    # Dampen the score by text density (squared) and by links-per-container.
    points = affect(points, 0.1, linktext_count / totaltext_count, 2.)
    points = affect(points, 0.1, link_density / container_count, 1.)
    # Threshold: anything under 1000 is not considered a blogroll.
    if points < 1000: points = 0
    return points
def get_rss_url(doc, url):
    '''Collect feed (RSS/Atom) URLs advertised via <link> tags.

    A <link> qualifies when it has an href, rel="alternate", and the href
    contains none of 'comment', 'rpc', 'rsd'; an RSS/ATOM title is a bonus
    point. Relative hrefs are resolved by prefixing `url`.
    '''
    candidates = []
    for node in doc.getElementsByTagName('link'):
        rel = node.getAttribute("rel")
        href = node.getAttribute("href").lower()
        title = node.getAttribute('title')
        score = 10
        if not href:
            score -= 10
        looks_bad = "comment" in href or "rpc" in href or "rsd" in href
        if not rel or rel != "alternate" or looks_bad:
            score -= 5
        if title and ("RSS" in title or "ATOM" in title):
            score += 1
        if not href.startswith('http'):
            href = '%s%s' % (url, href)
        if score >= 10:
            candidates.append(href)
    return candidates
def get_all_links(lists, url):
    '''Extract [href, link-text] pairs for external anchors in the given elements.

    Only absolute http(s) links that do not point back to `url` are kept.
    Fixes two crashes in the original: anchors with no child nodes raised
    IndexError, and a missing href attribute (None) raised AttributeError.
    Anchors without a text child yield None as their text.
    '''
    result = []
    for d in lists:
        for a in d.getElementsByTagName("a"):
            href = a.getAttribute('href')
            # Skip missing, relative, and same-site links.
            if not href or not href.startswith('http') or url in href:
                continue
            # Guard: image-only anchors have no text child.
            children = a.childNodes
            text = children[0].value if children else None
            result.append([href, text])
    return result
def get_blogroll(doc, url, tag="ul"):
    '''Find friend-link ('blogroll') anchors inside elements of `tag`.

    Scores every element of the requested tag with get_point and extracts
    external links from those that qualify. Returns a list of [href, text]
    pairs, or None when the document has no elements of that tag.
    '''
    containers = doc.getElementsByTagName(tag)
    if not containers:
        return None
    root = get_root_domain(url)
    qualified = [c for c in containers if get_point(c, root) > 0]
    return get_all_links(qualified, url)
# --- script entry: fetch the target page and print its advertised feed URLs ---
url='http://simple-is-better.com/'
content = requests.get(url).content  # raw page bytes
doc = libxml2dom.parseString(content, html=1)  # lenient HTML parse
print(get_rss_url(doc, url))
def write_to_file(lists):
    '''Append [link, text] pairs to link.txt as "link||text" CRLF lines.

    Entries with a falsy link are skipped; blank or missing text is written
    as the literal string "None". A falsy `lists` is a no-op.
    '''
    if not lists:
        return
    # Context manager guarantees the handle is closed even if a write
    # raises (the original leaked the handle on error).
    with open('link.txt', 'ab+') as f:
        for item in lists:
            link = item[0]
            if not link:
                continue
            text = item[1] and item[1].strip() or "None"
            # Explicit UTF-8 encode: the original relied on the
            # setdefaultencoding hack for implicit conversion.
            f.write(('%s||%s\r\n' % (link, text)).encode('utf-8'))
# Harvest blogroll links from both list-based and table-based layouts.
write_to_file(get_blogroll(doc, url))
write_to_file(get_blogroll(doc, url, 'table'))
|