Code

package com.au92.api;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Link extractor.
 * <p>
 * Extracts links from a web page and writes them to files.
 * <p>
 * Depends on the jsoup library: https://jsoup.org/
 */
public class LinkExtractor {
    public static void main(String[] args) throws IOException {
        String url = "https://www.au92.com/";
        Document doc = Jsoup.connect(url).get();

        // Extract RSS feed links and write them to a file
        List<String> rssUrls = getRssUrls(doc, url);
        writeToFile(rssUrls, "rss.txt");

        // Extract the blogroll (sidebar) links and write them to a file
        List<String> blogrollUrls = getBlogrollUrls(doc, url);
        writeToFile(blogrollUrls, "blogroll.txt");
    }

    /**
     * Get the RSS feed links.
     *
     * @param doc HTML document
     * @param url page URL
     * @return list of RSS feed links
     */
    private static List<String> getRssUrls(Document doc, String url) {
        List<String> rssUrls = new ArrayList<>();
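        // Feeds are usually advertised in the page head via <link rel="alternate"> tags with an RSS or Atom MIME type.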
        Elements links = doc.select("link[rel=alternate][type=application/rss+xml], link[rel=alternate][type=application/atom+xml]");
        for (Element link : links) {
            String href = link.attr("href");
            if (href.startsWith("http")) {
                rssUrls.add(href);
            } else {
                rssUrls.add(url + href);
            }
        }
        return rssUrls;
    }

    /**
     * Get the links from the blog's sidebar (blogroll).
     *
     * @param doc HTML document
     * @param url page URL
     * @return list of blogroll links
     */
    private static List<String> getBlogrollUrls(Document doc, String url) {
        List<String> blogrollUrls = new ArrayList<>();
        Elements divs = doc.select("div, ul, ol, dl, table");
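        // Any block-level container may hold a blogroll; keep only containers whose score is above zero.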
        for (Element div : divs) {
            if (getPoint(div, getRootDomain(url)) > 0) {
                Elements links = div.select("a[href^=http]:not([href*=" + url + "])");
                for (Element link : links) {
                    blogrollUrls.add(link.attr("href"));
                }
            }
        }
        return blogrollUrls;
    }

    /**
     * Get the element's score (a measure of how likely it is to be a blogroll block).
     *
     * @param div        HTML element
     * @param rootDomain root domain
     * @return the element's score
     */
    private static int getPoint(Element div, String rootDomain) {
        Elements links = div.select("a[href^=http]");
        double linkDensity = 0.001;
        double linkTextCount = 0.001;
        double totalTextCount = 0.001;
        double containerCount = 0.001;
        double innerLinkCount = 0.001;
        for (Element link : links) {
            String href = link.attr("href");
            // Empty, javascript:, query-string, or same-root-domain links count as "inner" links.
            if (href.isEmpty() || href.toLowerCase().startsWith("javascript") || href.contains("?") || href.contains(rootDomain)) {
                innerLinkCount++;
            } else {
                linkDensity++;
                if (!link.text().isEmpty()) {
                    linkTextCount += link.text().length();
                }
                if ("_blank".equals(link.attr("target"))) {
                    linkDensity++;
                }
            }
        }
        for (Element child : div.children()) {
            if (child.hasText()) {
                totalTextCount += child.text().length();
            } else {
                containerCount++;
            }
        }
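        // Base score: (outbound links - inner links) * 1000, then damped by the link-text/total-text
        // ratio and the link/container ratio; anything below 1000 is rejected as not a blogroll.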
        int points = (int) ((linkDensity - innerLinkCount) * 1000);
        if (points < 0) {
            return 0;
        }
        points = affect(points, 0.1, linkTextCount / totalTextCount, 2);
        points = affect(points, 0.1, linkDensity / containerCount, 1);
        if (points < 1000) {
            points = 0;
        }
        return points;
    }

    /**
     * Get the root domain of a URL.
     *
     * @param url URL
     * @return root domain
     */
    private static String getRootDomain(String url) {
        // Strip the protocol and any path so that only the host name is split on dots.
        String host = url.replaceFirst("^[a-zA-Z]+://", "");
        int slash = host.indexOf('/');
        if (slash >= 0) {
            host = host.substring(0, slash);
        }
        String[] parts = host.split("\\.");
        if (parts[parts.length - 2].matches("com|edu|net|gov|org|co")) {
            return parts[parts.length - 3] + "." + parts[parts.length - 2] + "." + parts[parts.length - 1];
        } else {
            return parts[parts.length - 2] + "." + parts[parts.length - 1];
        }
    }

    /**
     * Damp a score by a ratio, always keeping a fixed fraction of it.
     *
     * @param points    score
     * @param keepRatio fraction of the score to keep unconditionally
     * @param ratio     link text ratio
     * @param power     exponent applied to the ratio
     * @return the adjusted score
     */
    private static int affect(int points, double keepRatio, double ratio, double power) {
        int keep = (int) (points * keepRatio);
        if (ratio >= 1) {
            return points;
        }
        return keep + (int) ((points - keep) * Math.pow(ratio, power));
    }

    /**
     * Write a list of links to a file.
     *
     * @param urls     list of links
     * @param filename file name
     * @throws IOException if the file cannot be written
     */
    private static void writeToFile(List<String> urls, String filename) throws IOException {
        if (urls.isEmpty()) {
            return;
        }
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) {
            for (String url : urls) {
                writer.write(url + "\n");
            }
        }
    }
}
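
The class only needs jsoup on the classpath; its Maven coordinates are org.jsoup:jsoup (see https://jsoup.org/ for the current release). A minimal dependency sketch, assuming a Maven build; the version below is only an example, use whatever is current:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.17.2</version>
</dependency>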

Why this code exists

While digging through some code I had written over the years, I found an old Python script. Back when personal websites were popping up everywhere, it made sense to automatically collect lists of high-quality blogroll links, so at the time I wrote a Python script to automate the job (and to learn Python). Now that my main language has switched to Java, I've rewritten it in Java.

Python version

# -*- coding:utf-8 -*-
import libxml2dom, sys, requests

reload(sys)
sys.setdefaultencoding('utf-8')

def affect(points, keep_ratio, ratio, power):
    keep = points * keep_ratio
    if ratio >= 1.: return points
    return keep + (points - keep) * pow(ratio, power)

def get_root_domain(host):
    '''get host's root domain'''
    parts = host.split('.')
    if parts[-2] in ('com', 'edu', 'net', 'gov', 'org', 'co'):
        host = '.'.join(host.split('.')[-3:])
    else:
        host = '.'.join(host.split('.')[-2:])
    return host

def get_point(div, root_domain):
    '''get element point'''
    a_elements = div.getElementsByTagName("a")
    link_density = linktext_count = totaltext_count = 0.001
    container_count = innerlink_count = 0.001
    for a in a_elements:
        href = a.getAttribute('href')
        if not href or not href.lower().startswith('http') or root_domain in href:
            innerlink_count += 1
            continue
        if '?' in href:
            continue
        link_density += 1
        if a.childNodes:
            if a.childNodes.length>0 and a.childNodes[0].value: linktext_count += len(a.childNodes[0].value)
        if '_blank' == a.getAttribute('target'):
            link_density += 1

    for t in div.childNodes:
        if t.nodeValue:
            totaltext_count += len(t.nodeValue)
        else:
            container_count += 1
    points = (link_density - innerlink_count) * 1000
    if points < 0: return 0

    points = affect(points, 0.1, linktext_count / totaltext_count, 2.)
    points = affect(points, 0.1, link_density / container_count, 1.)

    if points < 1000: points = 0
    return points

def get_rss_url(doc, url):
    '''
    get rss url

    when have rel attribute and the attribute is alternate, get score
    when have href attribute and the attribute does not have comment or rpc or rsd , get score
    when have title attribute and the attribute has RSS or ATOM , get another one score
    '''
    link_elements = doc.getElementsByTagName('link')
    _list=[]
    for link in link_elements:
        _link_score = 10;
        _rel = link.getAttribute("rel")
        _href = link.getAttribute("href").lower()
        _title = link.getAttribute('title')
        if not _href:
            _link_score=_link_score-10
        if not _rel or "alternate"!=_rel or "comment" in _href or "rpc" in _href or "rsd" in _href:
            _link_score=_link_score-5
        if _title and ("RSS" in _title or "ATOM" in _title):
            _link_score=_link_score+1
        if not _href.startswith('http'):
            _href='%s%s'%(url, _href)
        if _link_score>=10:_list.append(_href)
    return _list

def get_all_links(lists, url):
    '''all may be blogroll area's links'''
    result=[]
    for d in lists:
        _result = None
        links = d.getElementsByTagName("a")
        _result = [[a.getAttribute('href'), a.childNodes[0].value] for a in links if a.getAttribute('href').startswith('http') and url not in a.getAttribute('href')]
        if _result:
            result.extend(_result)
    return result

def get_blogroll(doc, url, tag="ul"):
    '''
    get friend links
    '''
    div_elements = doc.getElementsByTagName(tag)
    _root_domain = get_root_domain(url)
    if div_elements:
        d = [div for div in div_elements if get_point(div, _root_domain) > 0]
        return get_all_links(d, url)
    return None

url='http://simple-is-better.com/'
content = requests.get(url).content
doc = libxml2dom.parseString(content, html=1)

print(get_rss_url(doc, url))

def write_to_file(lists):
    '''write to file'''
    if not lists: return
    f = open('link.txt','ab+')
    for l in lists:
        _link = l[0]
        _text = l[1] and l[1].strip() or "None"
        if _link:
            f.write(_link+'||'+_text+"\r\n")
    f.close()

write_to_file(get_blogroll(doc, url))
write_to_file(get_blogroll(doc, url, 'table'))