使用 Jsoup 入门 Java 爬虫(案例)

1.导入依赖

    <!-- Maven coordinates of the jsoup library used by the Java code in this article. -->
    <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.14.3</version> <!-- or latest version -->
        </dependency>

2. 获取代理 IP 的静态工具类



/**
 * Static helper that obtains a proxy address from a proxy provider's extraction link.
 *
 * <p>The class only exposes static members and is not meant to be instantiated.
 */
public class ProxyIP {

    /**
     * Extraction link of the proxy provider. The value is a placeholder and must be
     * replaced with a real link that hands out one proxy per request.
     */
    private static final String PROXY_URL =
            "填入提取链接 一次获取一个 ";

    /** Utility class: no instances. */
    private ProxyIP() {
    }

    /**
     * Requests the extraction link and returns the text content of the response body.
     *
     * @return the body text of the response; the caller in this article splits it on
     *         {@code ":"} and treats the parts as host and port
     *         (NOTE(review): the actual format depends on the proxy provider — confirm)
     * @throws IOException if the HTTP request fails
     */
    public static String getOne() throws IOException {
        return Jsoup.connect(PROXY_URL).get().body().text();
    }

}

3. 爬取页面信息

image-20220311175245433

4.使用 Jsoup 爬取内容

package com.sgg.main;

import com.sgg.main.proxy.ProxyIP;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.Proxy;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Random;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;


/**
 * Collects the article links from an index page and then keeps visiting every article
 * through a freshly fetched HTTP proxy, using one worker thread per article.
 */
public class Index {

    /** Number of leading characters of an article name shown in log lines. */
    private static final int NAME_PREVIEW_LENGTH = 5;

    /** Seconds to wait before retrying after a failed proxy fetch or a failed visit. */
    private static final int RETRY_PAUSE_SECONDS = 5;

    /**
     * Entry point: scrapes the index page, then starts one endless visiting worker per link.
     *
     * @param args unused
     * @throws IOException if the index page cannot be fetched
     */
    public static void main(String[] args) throws IOException {

        ArrayList<String> urls = new ArrayList<>();
        ArrayList<String> names = new ArrayList<>();

        // Index page that lists the articles (placeholder address).
        String indexUrl = "https://xxxxxxxxxx";

        Document document = Jsoup.connect(indexUrl).get();
        Elements article = document.getElementsByTag("article");

        for (Element element : article) {
            // First child of <article>; presumably the <a> element carrying the link.
            Node node = descend(element, 1);
            // The article name sits three levels below that node.
            Node nameNode = descend(node, 3);
            if (nameNode == null) {
                continue; // unexpected structure: skip instead of failing the whole run
            }

            Attributes attributes = node.attributes();
            String href = attributes.get("href"); // returns "" when the attribute is absent
            if (href.isEmpty()) {
                continue;
            }

            // Added together so that names.get(i) always belongs to urls.get(i).
            names.add(nameNode.toString());
            urls.add(href);
        }

        if (urls.isEmpty()) {
            // Executors.newFixedThreadPool(0) would throw IllegalArgumentException.
            System.out.println("没有找到可访问的文章链接");
            return;
        }

        // One thread per article link.
        ExecutorService executorService = Executors.newFixedThreadPool(urls.size());

        for (int i = 0; i < urls.size(); i++) {
            String name = names.get(i);
            String url = urls.get(i);
            executorService.execute(() -> visitForever(name, url));
        }
    }

    /**
     * Follows the first-child chain of {@code start} for {@code depth} levels.
     *
     * @param start node to start from, may be {@code null}
     * @param depth number of levels to descend
     * @return the node reached, or {@code null} if the chain ends earlier
     */
    private static Node descend(Node start, int depth) {
        Node current = start;
        for (int level = 0; level < depth && current != null; level++) {
            current = current.childNodeSize() > 0 ? current.childNode(0) : null;
        }
        return current;
    }

    /**
     * Worker loop for a single article: fetch a proxy, issue the search request and the
     * article request through it, log both status codes, wait, repeat. The loop ends only
     * when the thread is interrupted.
     *
     * @param name article name as scraped from the index page
     * @param url  article address
     */
    private static void visitForever(String name, String url) {
        // Guard against names shorter than the preview length.
        String shortName = name.substring(0, Math.min(NAME_PREVIEW_LENGTH, name.length()));
        // Part after "details/" is used as a short id; fall back to the full URL if absent.
        String[] urlParts = url.split("details/");
        String articleId = urlParts.length > 1 ? urlParts[1] : url;
        Random random = new Random();

        while (!Thread.currentThread().isInterrupted()) {
            String[] split = null;
            try {
                split = ProxyIP.getOne().split(":");
            } catch (IOException e) {
                e.printStackTrace();
            }

            int port = parsePort(split);
            if (port < 0) {
                // No usable proxy this round: wait a little and ask for a new one.
                if (!pause(RETRY_PAUSE_SECONDS)) {
                    return;
                }
                continue;
            }
            String host = split[0];

            int pauseSeconds;
            try {
                // Query the search engine for the post to improve its search ranking.
                // NOTE(review): the encoded name is not used by the redacted placeholder
                // below; presumably it belongs in the real search URL — confirm.
                String encodedName = URLEncoder.encode(name, "utf-8");

                // Search URL (placeholder address).
                String searchUrl =
                        "https:xxxxxxxxxxxx";

                Connection.Response search = Jsoup.connect(searchUrl)
                        .proxy(host, port)
                        .ignoreContentType(true)
                        .execute();

                Connection.Response response = Jsoup.connect(url)
                        .proxy(host, port)
                        .ignoreContentType(true)
                        .execute();

                System.out.println(shortName + " 搜索状态: " + search.statusCode() + "  --->  " + articleId + "  访问状态 " + response.statusCode());

                pauseSeconds = 30 + random.nextInt(10);
            } catch (IOException e) {
                System.out.println(shortName + "     " + articleId + "   访问出错了");
                System.out.println(e.getMessage());
                // Short back-off so a dead proxy or network does not cause a tight retry loop.
                pauseSeconds = RETRY_PAUSE_SECONDS;
            }

            if (!pause(pauseSeconds)) {
                return;
            }
        }
    }

    /**
     * Extracts the port from a {@code host:port} pair that was split on {@code ":"}.
     *
     * @param hostAndPort result of the split, may be {@code null}
     * @return the port, or {@code -1} if the input is not a two-element host/port pair
     */
    private static int parsePort(String[] hostAndPort) {
        if (hostAndPort == null || hostAndPort.length != 2 || hostAndPort[0].isEmpty()) {
            return -1;
        }
        try {
            int port = Integer.parseInt(hostAndPort[1].trim());
            return port >= 0 ? port : -1;
        } catch (NumberFormatException e) {
            return -1;
        }
    }

    /**
     * Sleeps for the given number of seconds.
     *
     * @param seconds how long to sleep
     * @return {@code true} if the sleep completed, {@code false} if the thread was
     *         interrupted (the interrupt flag is restored in that case)
     */
    private static boolean pause(int seconds) {
        try {
            TimeUnit.SECONDS.sleep(seconds);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

}

评论 14
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

秋日的晚霞

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值