2 Star 12 Fork 4

祁雪 / 抖音网页爬虫

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
CrawlerDouYin.java 7.67 KB
一键复制 编辑 原始数据 按行查看 历史
祁雪 提交于 2022-01-18 10:13 . 更新
package com.code.crawler;
import com.code.stringutil.StringUtils;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.getApiData.GetUrlFile;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import java.io.File;
import java.net.URLEncoder;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
/**
 * Crawls the Douyin web site and downloads the videos it finds.
 *
 * <p>{@link #getUserIssue(String, String)} scans a user's publish page and
 * {@link #search(String, int, String)} scans a search-result page. Both collect links to
 * video detail pages and hand them to {@link #getVideoInfo(List, String, WebDriver)},
 * which opens each detail page in Chrome and downloads the video it references.
 *
 * @author 祁雪
 */
public class CrawlerDouYin {

    /** Prefix of an absolute link to a video detail page. */
    private static final String VIDEO_URL_PREFIX = "https://www.douyin.com/video/";

    /** Prefix of a protocol-relative link to a video detail page. */
    private static final String SCHEMELESS_VIDEO_URL_PREFIX = "//www.douyin.com/video/";

    /** System property Selenium reads to locate the chromedriver executable. */
    private static final String CHROME_DRIVER_PROPERTY = "webdriver.chrome.driver";

    /** chromedriver location used when {@link #CHROME_DRIVER_PROPERTY} is not already set. */
    private static final String DEFAULT_CHROME_DRIVER_PATH =
            "C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe";

    /** Search URL template; {0} = keyword, {1} = sort type, {2} = random aid. */
    private static final String SEARCH_URL_TEMPLATE =
            "https://www.douyin.com/search/{0}?publish_time=0&sort_type={1}&source=search_history&type=video&aid={2}";

    /** Fixed pause after each navigation so the page can finish rendering. */
    private static final long PAGE_SETTLE_MILLIS = 10000L;

    /** Regex matching every character Windows forbids in a file name: \ / : * ? " < > | */
    private static final String ILLEGAL_FILE_NAME_CHARS = "[\\\\/:*?\"<>|]";

    /**
     * Crawls a user's publish page and downloads every video linked from it.
     *
     * <p>The page is loaded with HtmlUnit, the first {@code h1} is used as the name of the
     * sub-directory under {@code savePath}, and the collected detail-page links are passed
     * to {@link #getVideoInfo(List, String, WebDriver)} with a {@code null} driver.
     *
     * @param url      address of the user's publish page
     * @param savePath root directory under which a per-user directory is used
     * @throws Exception declared for source compatibility; failures raised while crawling
     *                   are printed rather than propagated
     */
    public static void getUserIssue(String url,
                                    String savePath) throws Exception {
        // Headless HtmlUnit client emulating Chrome.
        WebClient webClient = new WebClient(BrowserVersion.CHROME);
        try {
            // Do not abort on script errors or non-200 responses.
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setActiveXNative(false);
            // CSS is disabled; only the anchors and the heading are read.
            webClient.getOptions().setCssEnabled(false);
            // JavaScript stays enabled, with AJAX calls resynchronized.
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());

            System.out.println("进入用户页: " + url);
            HtmlPage page = webClient.getPage(url);
            System.out.println("检索视频详情页链接");
            Document document = Jsoup.parse(page.asXml());

            Element heading = document.selectFirst("h1");
            if (heading == null) {
                // Without a heading there is no directory name to save under.
                System.out.println("未找到用户标题(h1), 跳过: " + url);
                return;
            }
            // The title becomes a directory name, so strip characters a file system rejects.
            String path = savePath + File.separator + sanitizeFileName(heading.text());
            getVideoInfo(collectVideoUrls(document), path, null);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            webClient.close();
        }
    }

    /**
     * Opens each video detail page in Chrome and downloads the video it references.
     *
     * <p>A page without a {@code video > source} element is skipped, as is a video whose
     * target file already exists in {@code savePath}. The driver is always shut down when
     * this method returns, whether it was supplied by the caller or created here.
     *
     * @param urls      detail-page addresses; {@code null} is treated as empty
     * @param savePath  directory the videos are saved into
     * @param webDriver driver to reuse, or {@code null} to start a new Chrome instance
     * @throws Exception if navigation, the wait, or a download fails
     */
    public static void getVideoInfo(List<String> urls,
                                    String savePath, WebDriver webDriver) throws Exception {
        List<String> targets = urls == null ? Collections.<String>emptyList() : urls;
        WebDriver driver = webDriver;
        if (driver == null) {
            if (targets.isEmpty()) {
                // Nothing to download: do not start a browser just to close it again.
                return;
            }
            driver = createChromeDriver();
        }
        try {
            for (String url : targets) {
                System.out.println("进入详情: " + url);
                driver.get(url);
                Thread.sleep(PAGE_SETTLE_MILLIS);
                Document document = Jsoup.parse(driver.getPageSource());

                Element videoTag = document.selectFirst("video");
                Element source = videoTag == null ? null : videoTag.selectFirst("source");
                if (source == null || source.attr("src").isEmpty()) {
                    System.out.println("未找到视频地址, 跳过: " + url);
                    continue;
                }

                String src = toAbsoluteUrl(source.attr("src"));
                System.out.println("下载视频: " + src);
                String fileName = buildFileName(document.selectFirst("h1"), url);
                System.out.println("文件名称: " + fileName);
                File file = new File(savePath, fileName);
                if (file.exists()) {
                    System.out.println(fileName + " => 已下载");
                    continue;
                }
                GetUrlFile.downLoadHttpUrl(src, savePath, fileName);
            }
        } finally {
            // quit() ends the whole session; close() would only close the current window.
            driver.quit();
        }
    }

    /**
     * Searches Douyin for a keyword and downloads the videos on the result page.
     *
     * <p>The result page is opened twice with a pause in between, leaving time to pass the
     * site's verification by hand in the browser window. Videos are saved under
     * {@code savePath/search}. Failures are printed rather than propagated.
     *
     * @param key      search keyword
     * @param sort     ordering: 0 = comprehensive, 1 = most liked, 2 = newest
     * @param savePath root directory under which the {@code search} directory is used
     */
    public static void search(String key,
                              int sort,
                              String savePath) {
        WebDriver webDriver = createChromeDriver();
        // getVideoInfo shuts the driver down itself once it has been handed over.
        boolean driverHandedOff = false;
        try {
            // The keyword is a path segment: percent-encode it, and use %20 rather than
            // the form-style '+' that URLEncoder produces for spaces.
            String encodedKey = URLEncoder.encode(key, "UTF-8").replace("+", "%20");
            String url = StringUtils.format(SEARCH_URL_TEMPLATE, encodedKey,
                    String.valueOf(sort), UUID.randomUUID().toString());
            System.out.println("访问: " + url);
            webDriver.get(url);
            System.out.println("等待手动通过验证");
            Thread.sleep(PAGE_SETTLE_MILLIS);
            System.out.println("刷新页面");
            webDriver.get(url);
            Thread.sleep(PAGE_SETTLE_MILLIS);
            System.out.println("检索dom");
            Document document = Jsoup.parse(webDriver.getPageSource());
            List<String> urls = collectVideoUrls(document);
            System.out.println("已筛选出" + urls.size() + "条视频");
            String path = savePath + File.separator + "search";
            driverHandedOff = true;
            getVideoInfo(urls, path, webDriver);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (!driverHandedOff) {
                webDriver.quit();
            }
        }
    }

    /**
     * Manual entry point: runs a sample keyword search and downloads the results.
     *
     * @param args unused
     * @throws Exception declared for the optional {@code getUserIssue} call below
     */
    public static void main(String[] args) throws Exception {
        // Sample user page, used only by the commented-out getUserIssue call below.
        String url = "https://www.douyin.com/user/MS4wLjABAAAAQq8_8RtbEcjUAdeLAKETmbrvt6jdPfRNoI60SkH6J1I?enter_method=search_result&extra_params=%7B%22search_id%22%3A%22202109241115100101310570861F1E9FD3%22%2C%22search_result_id%22%3A%2216622344680%22%2C%22search_keyword%22%3A%22%E7%BE%8E%E5%A5%B3%22%2C%22search_type%22%3A%22video%22%7D&enter_from=search_result";
        String videoSavePath = "D:\\爬虫文件\\抖音";
        // getUserIssue(url, videoSavePath);
        // sort: 0 = comprehensive, 1 = most liked, 2 = newest
        search("JK", 0, videoSavePath);
    }

    /** Starts Chrome with the implicit wait this crawler uses for every session. */
    private static WebDriver createChromeDriver() {
        // Respect a chromedriver location supplied via -Dwebdriver.chrome.driver=...
        if (System.getProperty(CHROME_DRIVER_PROPERTY) == null) {
            System.setProperty(CHROME_DRIVER_PROPERTY, DEFAULT_CHROME_DRIVER_PATH);
        }
        ChromeOptions chromeOptions = new ChromeOptions();
        WebDriver driver = new ChromeDriver(chromeOptions);
        driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
        return driver;
    }

    /** Returns the distinct video detail-page links of a document, in page order. */
    private static List<String> collectVideoUrls(Document document) {
        Set<String> found = new LinkedHashSet<>();
        Elements anchors = document.select("a");
        for (Element anchor : anchors) {
            if (!anchor.hasAttr("href")) {
                continue;
            }
            String href = anchor.attr("href");
            if (href.startsWith(VIDEO_URL_PREFIX)) {
                found.add(href);
            } else if (href.startsWith(SCHEMELESS_VIDEO_URL_PREFIX)) {
                found.add("https:" + href);
            }
        }
        return new ArrayList<>(found);
    }

    /** Adds the https scheme to a source address unless it already carries a scheme. */
    private static String toAbsoluteUrl(String src) {
        if (src.startsWith("http://") || src.startsWith("https://")) {
            return src;
        }
        return "https:" + src;
    }

    /** Builds the .mp4 file name from the page heading, falling back to the URL's last segment. */
    private static String buildFileName(Element heading, String url) {
        String base = heading == null ? "" : sanitizeFileName(heading.text());
        if (base.isEmpty()) {
            int queryStart = url.indexOf('?');
            String withoutQuery = queryStart < 0 ? url : url.substring(0, queryStart);
            base = sanitizeFileName(withoutQuery.substring(withoutQuery.lastIndexOf('/') + 1));
        }
        return base + ".mp4";
    }

    /** Removes the characters that are not allowed in a Windows file name. */
    private static String sanitizeFileName(String name) {
        return name.replaceAll(ILLEGAL_FILE_NAME_CHARS, "");
    }
}
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/LSE_QX/tiktok-crawler.git
git@gitee.com:LSE_QX/tiktok-crawler.git
LSE_QX
tiktok-crawler
抖音网页爬虫
master

搜索帮助

344bd9b3 5694891 D2dac590 5694891