Extracting a Website's Blogroll Links Automatically with Java
Code
package com.au92.api;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Link extractor.
 * <p>
 * Extracts links from a web page and writes them to files.
 * <p>
 * Depends on the jsoup library.
 * <p>
 * https://jsoup.org/
 */
public class LinkExtractor {

    public static void main(String[] args) throws IOException {
        String url = "https://www.au92.com/";
        Document doc = Jsoup.connect(url).get();
        // Extract RSS feed links and write them to a file
        List<String> rssUrls = getRssUrls(doc, url);
        writeToFile(rssUrls, "rss.txt");
        // Extract blogroll links from the sidebar and write them to a file
        List<String> blogrollUrls = getBlogrollUrls(doc, url);
        writeToFile(blogrollUrls, "blogroll.txt");
    }

    /**
     * Extracts RSS feed links.
     *
     * @param doc the HTML document
     * @param url the page URL, used to resolve relative feed links
     * @return list of RSS feed URLs
     */
    private static List<String> getRssUrls(Document doc, String url) {
        List<String> rssUrls = new ArrayList<>();
        Elements links = doc.select("link[rel=alternate][type=application/rss+xml], link[rel=alternate][type=application/atom+xml]");
        for (Element link : links) {
            String href = link.attr("href");
            if (href.startsWith("http")) {
                rssUrls.add(href);
            } else {
                // Resolve a relative href against the page URL
                rssUrls.add(url + href);
            }
        }
        return rssUrls;
    }

    /**
     * Extracts the blogroll links from the page's sidebar areas.
     *
     * @param doc the HTML document
     * @param url the page URL, used to exclude the site's own links
     * @return list of blogroll URLs
     */
    private static List<String> getBlogrollUrls(Document doc, String url) {
        List<String> blogrollUrls = new ArrayList<>();
        // Candidate containers that commonly hold a blogroll
        Elements divs = doc.select("div, ul, ol, dl, table");
        for (Element div : divs) {
            if (getPoint(div, getRootDomain(url)) > 0) {
                // Keep external links only, excluding links back to this site
                Elements links = div.select("a[href^=http]:not([href*=" + url + "])");
                for (Element link : links) {
                    blogrollUrls.add(link.attr("href"));
                }
            }
        }
        return blogrollUrls;
    }

    /**
     * Scores a container element by how much it looks like a blogroll:
     * many external links, link-heavy text, few internal links.
     *
     * @param div        the container element
     * @param rootDomain the site's root domain
     * @return the element's score; 0 means "not a blogroll"
     */
    private static int getPoint(Element div, String rootDomain) {
        Elements links = div.select("a[href^=http]");
        // Seed the counters with a tiny epsilon to avoid division by zero
        double linkDensity = 0.001;
        double linkTextCount = 0.001;
        double totalTextCount = 0.001;
        double containerCount = 0.001;
        double innerLinkCount = 0.001;
        for (Element link : links) {
            String href = link.attr("href");
            // Empty, scripted, parameterized, or same-site links count as internal
            if (href.isEmpty() || href.toLowerCase().startsWith("javascript") || href.contains("?") || href.contains(rootDomain)) {
                innerLinkCount++;
            } else {
                linkDensity++;
                if (!link.text().isEmpty()) {
                    linkTextCount += link.text().length();
                }
                // Blogrolls usually open friends' sites in a new tab
                if ("_blank".equals(link.attr("target"))) {
                    linkDensity++;
                }
            }
        }
        for (Element child : div.children()) {
            if (child.hasText()) {
                totalTextCount += child.text().length();
            } else {
                containerCount++;
            }
        }
        int points = (int) ((linkDensity - innerLinkCount) * 1000);
        if (points < 0) {
            return 0;
        }
        points = affect(points, 0.1, linkTextCount / totalTextCount, 2);
        points = affect(points, 0.1, linkDensity / containerCount, 1);
        if (points < 1000) {
            points = 0;
        }
        return points;
    }

    /**
     * Extracts the root domain from a URL, e.g. "https://www.au92.com/" -> "au92.com".
     *
     * @param url the URL
     * @return the root domain
     */
    private static String getRootDomain(String url) {
        // Strip the scheme and any path before splitting into labels
        String host = url.replaceAll("^https?://", "").split("/")[0];
        String[] parts = host.split("\\.");
        if (parts[parts.length - 2].matches("com|edu|net|gov|org|co")) {
            // Handle second-level public suffixes such as "com.cn" or "co.uk"
            return parts[parts.length - 3] + "." + parts[parts.length - 2] + "." + parts[parts.length - 1];
        } else {
            return parts[parts.length - 2] + "." + parts[parts.length - 1];
        }
    }

    /**
     * Damps a score by a ratio: keeps keepRatio of the points and scales
     * the remainder by ratio^power.
     *
     * @param points    the current score
     * @param keepRatio the fraction of the score that is always kept
     * @param ratio     the damping ratio (e.g. link text to total text)
     * @param power     the exponent applied to the ratio
     * @return the damped score
     */
    private static int affect(int points, double keepRatio, double ratio, double power) {
        int keep = (int) (points * keepRatio);
        if (ratio >= 1) {
            return points;
        }
        return keep + (int) ((points - keep) * Math.pow(ratio, power));
    }

    /**
     * Writes a list of URLs to a file, one per line.
     *
     * @param urls     the URL list
     * @param filename the target file name
     * @throws IOException if the file cannot be written
     */
    private static void writeToFile(List<String> urls, String filename) throws IOException {
        if (urls.isEmpty()) {
            return;
        }
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) {
            for (String url : urls) {
                writer.write(url + "\n");
            }
        }
    }
}
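As a quick sanity check on the scoring heuristic, the sketch below feeds getPoint a synthetic sidebar list shaped like a typical blogroll. It is illustrative only: LinkExtractorDemo is a hypothetical class, and it assumes getPoint is widened from private to package-private so it can be called from outside LinkExtractor.

package com.au92.api;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class LinkExtractorDemo {
    public static void main(String[] args) {
        // A synthetic sidebar: external links with visible text that open in a new tab
        String html = "<ul>"
                + "<li><a href=\"https://friend-a.example.com/\" target=\"_blank\">Friend A</a></li>"
                + "<li><a href=\"https://friend-b.example.com/\" target=\"_blank\">Friend B</a></li>"
                + "<li><a href=\"https://friend-c.example.com/\" target=\"_blank\">Friend C</a></li>"
                + "</ul>";
        Document doc = Jsoup.parse(html);
        Element ul = doc.selectFirst("ul");
        // Assumes getPoint is made package-visible for this demo.
        System.out.println(LinkExtractor.getPoint(ul, "au92.com"));
    }
}

Three external target="_blank" links with visible text and no same-site links clear the 1000-point cutoff comfortably, while a navigation block full of internal links would score 0.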
Why this code exists
While digging through some code I had written over the years, I found an old Python script. Back when personal websites were springing up everywhere, automatically collecting a list of high-quality blogroll links was worth doing, so at the time I wrote a Python script to automate it (and to learn Python along the way). My primary language has since switched to Java, so here it is rewritten in Java.
Python version
# -*- coding:utf-8 -*-
import libxml2dom, sys, requests

reload(sys)
sys.setdefaultencoding('utf-8')

def affect(points, keep_ratio, ratio, power):
    keep = points * keep_ratio
    if ratio >= 1.: return points
    return keep + (points - keep) * pow(ratio, power)

def get_root_domain(host):
    '''get host's root domain'''
    parts = host.split('.')
    if parts[-2] in ('com', 'edu', 'net', 'gov', 'org', 'co'):
        host = '.'.join(host.split('.')[-3:])
    else:
        host = '.'.join(host.split('.')[-2:])
    return host

def get_point(div, root_domain):
    '''get element point'''
    a_elements = div.getElementsByTagName("a")
    link_density = linktext_count = totaltext_count = 0.001
    container_count = innerlink_count = 0.001
    for a in a_elements:
        href = a.getAttribute('href')
        if not href or not href.lower().startswith('http') or root_domain in href:
            innerlink_count += 1
            continue
        if '?' in href:
            continue
        link_density += 1
        if a.childNodes:
            if a.childNodes.length > 0 and a.childNodes[0].value: linktext_count += len(a.childNodes[0].value)
        if '_blank' == a.getAttribute('target'):
            link_density += 1
    for t in div.childNodes:
        if t.nodeValue:
            totaltext_count += len(t.nodeValue)
        else:
            container_count += 1
    points = (link_density - innerlink_count) * 1000
    if points < 0: return 0
    points = affect(points, 0.1, linktext_count / totaltext_count, 2.)
    points = affect(points, 0.1, link_density / container_count, 1.)
    if points < 1000: points = 0
    return points

def get_rss_url(doc, url):
    '''
    get rss url
    when have rel attribute and the attribute is alternate, get score
    when have href attribute and the attribute does not have comment or rpc or rsd, get score
    when have title attribute and the attribute has RSS or ATOM, get another one score
    '''
    link_elements = doc.getElementsByTagName('link')
    _list = []
    for link in link_elements:
        _link_score = 10
        _rel = link.getAttribute("rel")
        _href = link.getAttribute("href").lower()
        _title = link.getAttribute('title')
        if not _href:
            _link_score = _link_score - 10
        if not _rel or "alternate" != _rel or "comment" in _href or "rpc" in _href or "rsd" in _href:
            _link_score = _link_score - 5
        if _title and ("RSS" in _title or "ATOM" in _title):
            _link_score = _link_score + 1
        if not _href.startswith('http'):
            _href = '%s%s' % (url, _href)
        if _link_score >= 10: _list.append(_href)
    return _list

def get_all_links(lists, url):
    '''all may be blogroll area's links'''
    result = []
    for d in lists:
        _result = None
        links = d.getElementsByTagName("a")
        _result = [[a.getAttribute('href'), a.childNodes[0].value] for a in links if a.getAttribute('href').startswith('http') and url not in a.getAttribute('href')]
        if _result:
            result.extend(_result)
    return result

def get_blogroll(doc, url, tag="ul"):
    '''
    get friend links
    '''
    div_elements = doc.getElementsByTagName(tag)
    _root_domain = get_root_domain(url)
    if div_elements:
        d = [div for div in div_elements if get_point(div, _root_domain) > 0]
        return get_all_links(d, url)
    return None

url = 'http://simple-is-better.com/'
content = requests.get(url).content
doc = libxml2dom.parseString(content, html=1)
print(get_rss_url(doc, url))

def write_to_file(lists):
    '''write to file'''
    if not lists: return
    f = open('link.txt', 'ab+')
    for l in lists:
        _link = l[0]
        _text = l[1] and l[1].strip() or "None"
        if _link:
            f.write(_link + '||' + _text + "\r\n")
    f.close()

write_to_file(get_blogroll(doc, url))
write_to_file(get_blogroll(doc, url, 'table'))
Author: pengxiaochao
Last updated: 2023-06-16
License: reproduction in any form is not permitted.