(1)获取网站图片并下载指定目录 (Fetch images from a website and download them to a specified directory)
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.UUID;
public class Main {
    /**
     * Scrapes the Douban movies front page, extracts every {@code https://…​.webp}
     * image URL found inside {@code <img>} tags, and downloads each image into a
     * fixed local directory under a random 5-character file name.
     *
     * @param args unused
     * @throws IOException if the initial page connection cannot be opened
     */
    public static void main(String[] args) throws IOException {
        // NOTE(review): hard-coded output directory — assumes it already exists.
        final String outputDir = "/Users/liyangyang/javaABC/douban/";
        try {
            URL url = new URL("https://movie.douban.com/");
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            // Browser-like request headers so the site serves the normal HTML page.
            conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
            conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.1.4031 SLBChan/33");
            // Read the whole page line by line; try-with-resources closes the reader
            // (the original leaked both this reader and each image InputStream).
            try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream()))) {
                String line;
                while ((line = br.readLine()) != null) {
                    line = line.trim();
                    // Only <img> tags that reference a .webp image.
                    if (line.startsWith("<img") && line.contains(".webp")) {
                        System.out.println(line);
                        // Guard both indexOf results: the original assumed the URL
                        // was always well-formed and could throw on a -1 index.
                        int beginIndex = line.indexOf("https://");
                        if (beginIndex < 0) {
                            continue;
                        }
                        int suffixIndex = line.indexOf(".webp", beginIndex);
                        if (suffixIndex < 0) {
                            continue;
                        }
                        // Extract the image URL including the ".webp" suffix.
                        String webp = line.substring(beginIndex, suffixIndex + ".webp".length());
                        System.out.println(webp);
                        URL webpUrl = new URL(webp);
                        // Open input and output inside one try-with-resources so
                        // both streams are always closed.
                        try (InputStream in = webpUrl.openConnection().getInputStream();
                             BufferedOutputStream bos = new BufferedOutputStream(
                                     new FileOutputStream(outputDir
                                             + UUID.randomUUID().toString().substring(0, 5) + ".webp"))) {
                            byte[] buff = new byte[1024];
                            int len;
                            while ((len = in.read(buff)) != -1) {
                                bos.write(buff, 0, len);
                            }
                        } catch (IOException e) {
                            // One failed image should not abort the whole crawl,
                            // but do not swallow the failure silently either.
                            System.err.println("Failed to download " + webp + ": " + e.getMessage());
                        }
                    }
                }
            } finally {
                conn.disconnect(); // release the HTTP connection
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }
}
(2)获取网站指定数据并输出 (Fetch specific data from a website and print it)
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import javax.net.ssl.HttpsURLConnection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Main {
    /**
     * Fetches the NetEase tech page over HTTPS, parses the HTML with jsoup,
     * and prints the text of the second child element of every
     * {@code <div class="ac_title">} (the news title).
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // URL object representing the page to scrape.
            URL url = new URL("https://tech.163.com");
            // Open the HTTPS connection to the server.
            HttpsURLConnection connection = (HttpsURLConnection) url.openConnection();
            connection.setRequestMethod("GET"); // request method
            connection.setConnectTimeout(3000); // connect timeout in ms
            // Browser-like headers so the site serves the normal HTML page.
            connection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36");
            connection.connect(); // issue the request
            try {
                // Read the full response body as UTF-8 text.
                StringBuilder response = new StringBuilder(8192);
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        response.append(line);
                    }
                }
                // jsoup: parse the raw HTML into a traversable Document.
                Document htmlDocument = Jsoup.parse(response.toString());
                // Select every <div class="ac_title"> element.
                // (The original comment said "newsDigest", which did not match the code.)
                Elements elements = htmlDocument.select("div[class=ac_title]");
                for (Element div : elements) {
                    // Guard against markup changes: child(1) throws if the div
                    // has fewer than two child elements.
                    if (div.childrenSize() < 2) {
                        continue;
                    }
                    // Second child element holds the news title.
                    Element p = div.child(1);
                    // Print the text between the element's tags.
                    String title = p.text();
                    System.out.println(title);
                }
            } finally {
                connection.disconnect(); // release the HTTP connection
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException subclass; one catch suffices.
            e.printStackTrace();
        }
    }
}