1 Star 0 Fork 0

wang/MaterialSearch

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
process_pexels.py 3.45 KB
一键复制 编辑 原始数据 按行查看 历史
"""
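This script is currently run standalone.
Sitemap download: https://www.pexels.com/sitemaps/en-US/video-sitemap10.xml.gz (the number ranges from 1 to 10)
After downloading, decompress the .gz archives and put the resulting .xml files into the sitemaps/pexels_video/ directory (the script only reads *.xml), then run this script.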
"""
import glob
import xml.etree.ElementTree as ET
from database import add_pexels_video, is_pexels_video_exist
from models import DatabaseSessionPexelsVideo, create_tables
from process_assets import process_web_image
# logger = logging.getLogger(__name__)
# XML namespace prefix of the <video:*> elements inside the sitemap.
_VIDEO_NS = "{http://www.google.com/schemas/sitemap-video/1.1}"
# Boilerplate text removed from titles/descriptions before they are stored.
_TITLE_PREFIX = "Video Of "
_TITLE_SUFFIX = " · Free Stock Video"
_DESCRIPTION_PREFIX = "One of many great free stock videos from Pexels. This video is about "


def _child_text(video_element, tag):
    """Return the text of child *tag*, or "" when the child is missing or empty."""
    return video_element.findtext(_VIDEO_NS + tag) or ""


def _to_int(text):
    """Return ``int(text)``, or None when *text* is empty or not an integer."""
    try:
        return int(text)
    except ValueError:
        return None


def _parse_video(video_element):
    """Extract the metadata of one video element.

    Returns a dict with the keys ``content_loc``, ``duration``, ``view_count``,
    ``thumbnail_loc``, ``title`` and ``description``, or None when the entry
    lacks a content URL, a thumbnail URL, an integer duration or an integer
    view count.
    """
    content_loc = _child_text(video_element, "content_loc")
    thumbnail_loc = _child_text(video_element, "thumbnail_loc")
    duration = _to_int(_child_text(video_element, "duration"))
    view_count = _to_int(_child_text(video_element, "view_count"))
    if not content_loc or not thumbnail_loc or duration is None or view_count is None:
        return None

    title = _child_text(video_element, "title").strip()
    if title.startswith(_TITLE_PREFIX):
        title = title[len(_TITLE_PREFIX):]
    if title.endswith(_TITLE_SUFFIX):
        title = title[:-len(_TITLE_SUFFIX)]

    description = _child_text(video_element, "description").strip()
    if description.startswith(_DESCRIPTION_PREFIX):
        description = description[len(_DESCRIPTION_PREFIX):]

    return {
        "content_loc": content_loc,
        "duration": duration,
        "view_count": view_count,
        "thumbnail_loc": thumbnail_loc,
        "title": title,
        "description": description,
    }


def handel_xml(video_sitemap_xml):
    """Import every video listed in one sitemap XML file.

    For each video element the metadata is extracted; entries that are
    incomplete (see ``_parse_video``) are skipped instead of aborting the whole
    file. Videos for which ``is_pexels_video_exist`` reports the thumbnail URL
    as already present are skipped as well. For the remaining ones the
    thumbnail is passed to ``process_web_image`` and, unless that returns None,
    the record is handed to ``add_pexels_video``.

    Args:
        video_sitemap_xml: Path (or file object) accepted by ``ET.parse``.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    root = ET.parse(video_sitemap_xml).getroot()
    # Find all video elements.
    video_elements = root.findall(".//" + _VIDEO_NS + "video")
    total = len(video_elements)
    print("Total videos:", total)

    # Walk over every video element and extract its metadata.
    for index, video_element in enumerate(video_elements, start=1):
        video = _parse_video(video_element)
        if video is None:
            continue
        thumbnail_loc = video["thumbnail_loc"]

        with DatabaseSessionPexelsVideo() as session:
            if is_pexels_video_exist(session, thumbnail_loc):
                continue
            # NOTE(review): "fit=corp" looks like a typo for "fit=crop"; it is
            # kept unchanged because altering it would change the downloaded
            # thumbnails -- confirm against the image CDN parameters first.
            thumbnail_feature = process_web_image(thumbnail_loc + "?fm=webp&fit=corp&min-w=640&h=480")
            if thumbnail_feature is None:
                print("获取视频缩略图特征失败,跳过该视频")
                continue
            # "\r" keeps the progress output on a single, overwritten line.
            print(f"[{index}/{total}]新增视频:{thumbnail_loc}", end=" \r")
            add_pexels_video(
                session,
                thumbnail_feature=thumbnail_feature.tobytes(),
                **video,
            )
if __name__ == '__main__':
    # Script entry point: call create_tables() once, then feed every sitemap
    # file found under sitemaps/pexels_video/ to handel_xml().
    create_tables()
    sitemap_paths = glob.glob("sitemaps/pexels_video/*.xml")
    for sitemap_path in sitemap_paths:
        # Echo the file name so the output shows which sitemap is in progress.
        print(sitemap_path)
        handel_xml(sitemap_path)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/wangziyuyu/MaterialSearch.git
git@gitee.com:wangziyuyu/MaterialSearch.git
wangziyuyu
MaterialSearch
MaterialSearch
main

搜索帮助