(1)获取网站图片并下载指定目录 (Fetch images from a website and download them to a specified directory)
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.UUID;
public class Main {
    /**
     * Scrapes the Douban movies front page, extracts every {@code https://…​.webp}
     * image URL found inside {@code <img>} tags, and downloads each image into a
     * fixed local directory under a random 5-character file name.
     *
     * @param args unused
     * @throws IOException if the initial page connection cannot be opened
     */
    public static void main(String[] args) throws IOException {
        // NOTE(review): hard-coded output directory — assumes it already exists.
        final String outputDir = "/Users/liyangyang/javaABC/douban/";
        try {
            URL url = new URL("https://movie.douban.com/");
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            // Browser-like request headers so the site serves the normal HTML page.
            conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
            conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.1.4031 SLBChan/33");
            // Read the whole page line by line; try-with-resources closes the reader
            // (the original leaked both this reader and each image InputStream).
            try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream()))) {
                String line;
                while ((line = br.readLine()) != null) {
                    line = line.trim();
                    // Only <img> tags that reference a .webp image.
                    if (line.startsWith("<img") && line.contains(".webp")) {
                        System.out.println(line);
                        // Guard both indexOf results: the original assumed the URL
                        // was always well-formed and could throw on a -1 index.
                        int beginIndex = line.indexOf("https://");
                        if (beginIndex < 0) {
                            continue;
                        }
                        int suffixIndex = line.indexOf(".webp", beginIndex);
                        if (suffixIndex < 0) {
                            continue;
                        }
                        // Extract the image URL including the ".webp" suffix.
                        String webp = line.substring(beginIndex, suffixIndex + ".webp".length());
                        System.out.println(webp);
                        URL webpUrl = new URL(webp);
                        // Open input and output inside one try-with-resources so
                        // both streams are always closed.
                        try (InputStream in = webpUrl.openConnection().getInputStream();
                             BufferedOutputStream bos = new BufferedOutputStream(
                                     new FileOutputStream(outputDir
                                             + UUID.randomUUID().toString().substring(0, 5) + ".webp"))) {
                            byte[] buff = new byte[1024];
                            int len;
                            while ((len = in.read(buff)) != -1) {
                                bos.write(buff, 0, len);
                            }
                        } catch (IOException e) {
                            // One failed image should not abort the whole crawl,
                            // but do not swallow the failure silently either.
                            System.err.println("Failed to download " + webp + ": " + e.getMessage());
                        }
                    }
                }
            } finally {
                conn.disconnect(); // release the HTTP connection
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }
}
(2)获取网站指定数据并输出 (Fetch specific data from a website and print it)
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import javax.net.ssl.HttpsURLConnection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Main {
    /**
     * Fetches the NetEase tech page over HTTPS, parses the HTML with jsoup,
     * and prints the text of the second child element of every
     * {@code <div class="ac_title">} (the news title).
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // URL object representing the page to scrape.
            URL url = new URL("https://tech.163.com");
            // Open the HTTPS connection to the server.
            HttpsURLConnection connection = (HttpsURLConnection) url.openConnection();
            connection.setRequestMethod("GET"); // request method
            connection.setConnectTimeout(3000); // connect timeout in ms
            // Browser-like headers so the site serves the normal HTML page.
            connection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36");
            connection.connect(); // issue the request
            try {
                // Read the full response body as UTF-8 text.
                StringBuilder response = new StringBuilder(8192);
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        response.append(line);
                    }
                }
                // jsoup: parse the raw HTML into a traversable Document.
                Document htmlDocument = Jsoup.parse(response.toString());
                // Select every <div class="ac_title"> element.
                // (The original comment said "newsDigest", which did not match the code.)
                Elements elements = htmlDocument.select("div[class=ac_title]");
                for (Element div : elements) {
                    // Guard against markup changes: child(1) throws if the div
                    // has fewer than two child elements.
                    if (div.childrenSize() < 2) {
                        continue;
                    }
                    // Second child element holds the news title.
                    Element p = div.child(1);
                    // Print the text between the element's tags.
                    String title = p.text();
                    System.out.println(title);
                }
            } finally {
                connection.disconnect(); // release the HTTP connection
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException subclass; one catch suffices.
            e.printStackTrace();
        }
    }
}