1 Star 0 Fork 0

hehuolong / AI-Shorts-Creator

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
autocropper.py 21.28 KB
一键复制 编辑 原始数据 按行查看 历史
# -*- coding: utf-8 -*-
"""Automatically generated by Colaboratory.

Original file is located at a Colab URL that was not preserved in this copy.
"""
"""Cell 1: Installing necessary libraries"""
# !pip install pytube
# !pip install opencv-python
# !pip install openai
# !pip install youtube-transcript-api
"""Cell 2: Importing libraries and setting OpenAI API key"""
import json
import math
import os
import pdb
import subprocess
import sys

import cv2
import numpy as np
import openai
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
# Read the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret does not have to be written into the source file. The fallback is
# the same empty placeholder the script used before.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')
"""Cell 3: Download YouTube Video function"""
def download_video(url, filename):
    """Download the first MP4 stream of a YouTube video.

    Args:
        url: Full YouTube watch URL.
        filename: Name the downloaded file is saved under.

    Raises:
        ValueError: If the video exposes no stream with an ``mp4`` extension.
    """
    yt = YouTube(url)
    video = yt.streams.filter(file_extension='mp4').first()
    # ``first()`` yields None when the filter matches nothing; fail with a
    # clear message instead of an AttributeError on the next line.
    if video is None:
        raise ValueError(f"No MP4 stream available for {url}")
    # Download the video
    video.download(filename=filename)
#Segment Video function
def segment_video(response, input_file="input_video.mp4"):
    """Cut one clip per segment out of ``input_file`` using ffmpeg.

    Clip ``i`` is written to ``output{i:03d}.mp4`` in the current directory
    with stream copy (no re-encoding).

    Args:
        response: Iterable of dicts carrying ``start_time`` and ``end_time``
            (seconds). A missing key is treated as 0.
        input_file: Video to cut from. Defaults to the file name the script
            has always used.
    """
    for i, segment in enumerate(response):
        # The start is rounded down; the end is rounded up and padded by 2 s.
        start_time = math.floor(float(segment.get("start_time", 0)))
        end_time = math.ceil(float(segment.get("end_time", 0))) + 2
        output_file = f"output{str(i).zfill(3)}.mp4"
        # An argument list (no shell) keeps file names with spaces or shell
        # metacharacters from being re-interpreted.
        command = [
            "ffmpeg",
            "-i", input_file,
            "-ss", str(start_time),
            "-to", str(end_time),
            "-c", "copy",
            output_file,
        ]
        subprocess.call(command)
#Face Detection function
def detect_faces(video_file, max_faces=5):
    """Collect face bounding boxes from the first frames of a video.

    Frames are read until at least ``max_faces`` distinct boxes have been
    gathered or the video runs out of frames.

    Args:
        video_file: Path of the video to scan.
        max_faces: Stop reading once this many boxes were collected. The
            result may hold more, because every new box of the final frame
            is kept.

    Returns:
        A list of ``(x, y, w, h)`` arrays, or ``None`` when no face was found.
    """
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    # Load the video
    cap = cv2.VideoCapture(video_file)
    faces = []

    try:
        # Detect and store unique faces
        while len(faces) < max_faces:
            ret, frame = cap.read()
            if not ret:
                # End of stream (or unreadable file): stop instead of
                # spinning forever on failed reads.
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            detected_faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

            # "Unique" means the exact same box was not stored before.
            for face in detected_faces:
                if not any(np.array_equal(face, f) for f in faces):
                    faces.append(face)

            # Print the number of unique faces detected so far
            print(f"Number of unique faces detected: {len(faces)}")
    finally:
        # Release the video capture object even if detection raised
        cap.release()

    # If faces detected, return the list of faces
    if len(faces) > 0:
        return faces

    # If no faces detected, return None
    return None
#Crop Video function
import cv2
import cv2
def crop_video(faces, input_file, output_file):
    """Write a vertical (9:16) crop of ``input_file`` centred on each face.

    For every input frame one cropped frame per entry in ``faces`` is written
    to ``output_file``. Errors are reported on stdout rather than raised.

    Args:
        faces: Sequence of ``(x, y, w, h)`` boxes, or ``None`` / empty when
            nothing was detected.
        input_file: Path of the video to read.
        output_file: Path of the MP4 file to create.
    """
    # Constants for cropping
    CROP_RATIO = 0.9  # Fraction of the frame height kept in the crop
    VERTICAL_RATIO = 9 / 16  # Aspect ratio for the vertical video
    DEFAULT_FPS = 30.0  # Used when the container does not report a frame rate

    # A face detector may hand back None instead of an empty list.
    if faces is None or len(faces) == 0:
        print("No faces detected in the video.")
        return

    cap = None
    output_video = None
    try:
        # Read the input video
        cap = cv2.VideoCapture(input_file)
        if not cap.isOpened():
            raise IOError(f"Cannot open input video: {input_file}")

        # Get the frame dimensions
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Calculate the target width and height for cropping (vertical format)
        target_height = int(frame_height * CROP_RATIO)
        target_width = int(target_height * VERTICAL_RATIO)

        # Keep the source frame rate so playback speed is preserved.
        fps = cap.get(cv2.CAP_PROP_FPS) or DEFAULT_FPS

        # Create a VideoWriter object to save the output video
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        output_video = cv2.VideoWriter(output_file, fourcc, fps, (target_width, target_height))

        # Largest top-left corner that still keeps the window inside the frame.
        max_x = max(0, frame_width - target_width)
        max_y = max(0, frame_height - target_height)

        # Loop through each frame of the input video
        while True:
            ret, frame = cap.read()
            # If no more frames, break out of the loop
            if not ret:
                break

            for face in faces:
                x, y, w, h = (int(v) for v in face)

                # Centre the window on the face, then clamp it into the frame
                # so the crop is not squeezed by the resize below.
                crop_x = min(max(0, x + (w - target_width) // 2), max_x)
                crop_y = min(max(0, y + (h - target_height) // 2), max_y)
                crop_x2 = min(crop_x + target_width, frame_width)
                crop_y2 = min(crop_y + target_height, frame_height)

                cropped_frame = frame[crop_y:crop_y2, crop_x:crop_x2]
                resized_frame = cv2.resize(cropped_frame, (target_width, target_height))
                output_video.write(resized_frame)

        print("Video cropped successfully.")
    except Exception as e:
        # Best-effort contract: report the failure, never raise to the caller.
        print(f"Error during video cropping: {str(e)}")
    finally:
        # Release the input and output video objects on every path
        if cap is not None:
            cap.release()
        if output_video is not None:
            output_video.release()
def crop_video2(faces, input_file, output_file):
    """Vertical (9:16) face crop with a per-batch speaking check.

    Frames are cropped around each face and buffered in batches of
    ``BATCH_DURATION`` seconds. Each batch is analysed with
    ``is_talking_in_batch`` and every buffered frame is passed through
    ``adjust_focus`` before it is written. Errors are reported on stdout
    rather than raised.

    Args:
        faces: Sequence of ``(x, y, w, h)`` boxes, or ``None`` / empty when
            nothing was detected.
        input_file: Path of the video to read.
        output_file: Path of the MP4 file to create.
    """
    # Constants for cropping
    CROP_RATIO = 0.9  # Fraction of the frame height kept in the crop
    VERTICAL_RATIO = 9 / 16  # Aspect ratio for the vertical video
    BATCH_DURATION = 5  # Duration of each batch in seconds
    DEFAULT_FPS = 30.0  # Used when the container does not report a frame rate

    # A face detector may hand back None instead of an empty list.
    if faces is None or len(faces) == 0:
        print("No faces detected in the video.")
        return

    def flush_batch(writer, crops, motion_frames):
        """Analyse one batch for speaking activity, then write its frames."""
        # Analyse before adjust_focus() gets a chance to draw on the frames.
        is_talking = is_talking_in_batch(motion_frames)
        for crop in crops:
            writer.write(adjust_focus(crop, is_talking))

    cap = None
    output_video = None
    try:
        # Read the input video
        cap = cv2.VideoCapture(input_file)
        if not cap.isOpened():
            raise IOError(f"Cannot open input video: {input_file}")

        # Get the frame dimensions
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Calculate the target width and height for cropping (vertical format)
        target_height = int(frame_height * CROP_RATIO)
        target_width = int(target_height * VERTICAL_RATIO)

        # Calculate the number of frames per batch; never 0, so a missing
        # frame rate cannot break the batching.
        fps = cap.get(cv2.CAP_PROP_FPS) or DEFAULT_FPS
        frames_per_batch = max(1, int(fps * BATCH_DURATION))

        # Create a VideoWriter object to save the output video
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        output_video = cv2.VideoWriter(output_file, fourcc, fps, (target_width, target_height))

        # Largest top-left corner that still keeps the window inside the frame.
        max_x = max(0, frame_width - target_width)
        max_y = max(0, frame_height - target_height)

        batch_crops = []     # every crop of the batch, in write order
        motion_frames = []   # crops of the first face only, for motion analysis
        frames_in_batch = 0

        while True:
            # VideoCapture.read() already yields BGR frames, so no colour
            # conversion is applied before writing.
            ret, frame = cap.read()
            # If no more frames, break out of the loop
            if not ret:
                break

            for index, face in enumerate(faces):
                x, y, w, h = (int(v) for v in face)

                # Centre the window on the face, then clamp it into the frame
                # so the crop is not squeezed by the resize below.
                crop_x = min(max(0, x + (w - target_width) // 2), max_x)
                crop_y = min(max(0, y + (h - target_height) // 2), max_y)
                crop_x2 = min(crop_x + target_width, frame_width)
                crop_y2 = min(crop_y + target_height, frame_height)

                cropped_frame = frame[crop_y:crop_y2, crop_x:crop_x2]
                resized_frame = cv2.resize(cropped_frame, (target_width, target_height))

                batch_crops.append(resized_frame)
                # Comparing crops of different faces would always look like
                # motion, so only one face feeds the analysis.
                if index == 0:
                    motion_frames.append(resized_frame)

            frames_in_batch += 1
            if frames_in_batch >= frames_per_batch:
                flush_batch(output_video, batch_crops, motion_frames)
                batch_crops, motion_frames, frames_in_batch = [], [], 0

        # Write whatever is left of the last, shorter batch.
        if batch_crops:
            flush_batch(output_video, batch_crops, motion_frames)

        print("Video cropped successfully.")
    except Exception as e:
        # Best-effort contract: report the failure, never raise to the caller.
        print(f"Error during video cropping: {str(e)}")
    finally:
        # Release the input and output video objects on every path
        if cap is not None:
            cap.release()
        if output_video is not None:
            output_video.release()
def is_talking_in_batch(frames, threshold=0.5):
    """Tell whether any pair of consecutive frames moves more than ``threshold``.

    Args:
        frames: Sequence of frames in playback order.
        threshold: Motion score above which a frame pair counts as talking.

    Returns:
        ``True`` if at least one consecutive pair scores above ``threshold``,
        otherwise ``False`` (including for fewer than two frames).
    """
    # Fewer than two frames means there is no pair to compare.
    if frames is None or len(frames) < 2:
        return False

    # Calculate the motion between consecutive frames
    motion_scores = [
        calculate_motion_score(frame1, frame2)
        for frame1, frame2 in zip(frames, frames[1:])
    ]

    # Determine if talking behavior is present based on motion scores
    return any(score > threshold for score in motion_scores)
def calculate_motion_score(frame1, frame2):
    """Return the mean dense-optical-flow magnitude between two BGR frames."""
    # Optical flow works on single-channel images.
    previous_gray = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    next_gray = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)

    # Positional arguments after the two images: flow, pyr_scale, levels,
    # winsize, iterations, poly_n, poly_sigma, flags.
    flow_field = cv2.calcOpticalFlowFarneback(
        previous_gray, next_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0
    )

    # Per-pixel length of the flow vector.
    horizontal = flow_field[..., 0]
    vertical = flow_field[..., 1]
    vector_length = np.sqrt(horizontal ** 2 + vertical ** 2)

    # The score is the average vector length over the whole frame.
    return np.mean(vector_length)
def adjust_focus(frame, talking):
    """Mark the speaker on ``frame`` when ``talking`` is truthy.

    A green rectangle is drawn in place around the face found by
    ``get_face_coordinates``. The frame is returned unchanged when nobody is
    talking or no face is found.
    """
    if not talking:
        return frame

    # Example emphasis: a bounding box around the face region. Other effects
    # (overlay text, re-framing) could be added here.
    box = get_face_coordinates(frame)  # Replace with your face detection logic
    if box is None:
        return frame

    left, top, width, height = box
    cv2.rectangle(frame, (left, top), (left + width, top + height), (0, 255, 0), 2)
    return frame
# Haar cascade shared by every get_face_coordinates() call; created on first
# use so the XML model is not re-read for each frame.
_FACE_CASCADE = None


def get_face_coordinates(frame):
    """Return the box of the first face detected in a BGR frame.

    Returns:
        ``(x, y, w, h)`` as plain ints for the first detection, or ``None``
        when no face is found.
    """
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        # Load the pre-trained Haar cascade classifier for face detection
        _FACE_CASCADE = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    # Convert frame to grayscale for face detection
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Detect faces in the frame
    faces = _FACE_CASCADE.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

    if len(faces) > 0:
        # Return the coordinates of the first detected face
        x, y, w, h = (int(v) for v in faces[0])
        return x, y, w, h

    # If no face detected, return None
    return None
def get_transcript(video_id, *, formatted=False):
    """Fetch the transcript of a YouTube video.

    Args:
        video_id: YouTube video ID.
        formatted: When true, return one text block with a
            ``start --> end : text`` line per entry instead of the raw
            entries.

    Returns:
        The entries as returned by ``YouTubeTranscriptApi.get_transcript``
        (each is read here via its ``start``, ``duration`` and ``text``
        keys), or the formatted string when ``formatted`` is true.
    """
    # Get the transcript for the given YouTube video ID
    transcript = YouTubeTranscriptApi.get_transcript(video_id)

    if not formatted:
        return transcript

    # Format the transcript for feeding into the language model.
    return "".join(
        f"{entry['start']:.2f} --> {entry['start'] + entry['duration']:.2f} : {entry['text']}\n"
        for entry in transcript
    )
#Analyze transcript with GPT-3 function
# Example of the reply format that is embedded into the prompt. It is kept as
# strictly valid JSON (no comments, no trailing commas) because the reply is
# later parsed with json.loads. "duration" is the segment length in seconds.
response_obj = '''[
  {
    "start_time": 97.19,
    "end_time": 127.43,
    "description": "Spoken Text here",
    "duration": 30.24
  },
  {
    "start_time": 169.58,
    "end_time": 199.10,
    "description": "Spoken Text here",
    "duration": 29.52
  }
]'''
def analyze_transcript(transcript, model="gpt-4"):
    """Ask the chat model for the three most viral sections of a transcript.

    Args:
        transcript: Transcript to analyse; it is interpolated into the
            prompt as text.
        model: Chat model name. NOTE(review): the model used originally is
            not visible in this copy of the file; "gpt-4" is taken from the
            file's own comments. Confirm before relying on it.

    Returns:
        The ``message`` entry of the first completion choice.
    """
    prompt = f"This is a transcript of a video. Please identify the 3 most viral sections from the whole, make sure they are more than 30 seconds in duration,Make Sure you provide extremely accurate timestamps respond only in this format {response_obj} \n Here is the Transcription:\n{transcript}"
    messages = [
        {"role": "system", "content": "You are a ViralGPT helpful assistant. You are master at reading youtube transcripts and identifying the most Interesting and Viral Content"},
        {"role": "user", "content": prompt},
    ]

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        n=1,  # only the first choice is read below
    )
    return response.choices[0]['message']
"""Main function and execution"""
interseting_seg='''[{'text': 'happiness through Curiosity on Dr', 'start': 0.0, 'duration': 4.82}, {'text': 'eclipse', 'start': 2.28, 'duration': 2.54}, {'text': 'little rookie question for you okay and', 'start': 6.899, 'duration': 4.021}, {'text': "I'm asking this on behalf of mainstream", 'start': 9.24, 'duration': 3.6}, {'text': 'media how do you feel when you see', 'start': 10.92, 'duration': 5.4}, {'text': 'movies like pathan or tiger or any', 'start': 12.84, 'duration': 5.939}, {'text': "Indian I think we haven't got the art of", 'start': 16.32, 'duration': 4.5}, {'text': 'doing those movies you think they can be', 'start': 18.779, 'duration': 4.321}, {'text': 'done better oh yes I mean they can be', 'start': 20.82, 'duration': 3.42}, {'text': 'realistic', 'start': 23.1, 'duration': 3.12}, {'text': "okay we're not realistic what you see", 'start': 24.24, 'duration': 4.32}, {'text': 'what is not realistic about them huh', 'start': 26.22, 'duration': 4.219}, {'text': "it's not realistic", 'start': 28.56, 'duration': 4.38}, {'text': "you're trying to make a James Bond movie", 'start': 30.439, 'duration': 5.741}, {'text': 'which is also not realistic okay', 'start': 32.94, 'duration': 5.88}, {'text': 'then you have this story of the isi girl', 'start': 36.18, 'duration': 4.74}, {'text': 'in the raw man', 'start': 38.82, 'duration': 4.86}, {'text': 'living happily ever after I mean', 'start': 40.92, 'duration': 4.639}, {'text': 'take a break', 'start': 43.68, 'duration': 7.08}, {'text': 'has that ever happened not really right', 'start': 45.559, 'duration': 7.48}, {'text': 'no the whole atmospherics of the whole', 'start': 50.76, 'duration': 3.54}, {'text': 'thing you know', 'start': 53.039, 'duration': 3.36}, {'text': "I haven't seen batana and I won't see it", 'start': 54.3, 'duration': 5.099}, {'text': "because I don't think it is an accurate", 'start': 56.399, 'duration': 4.98}, {'text': "depiction it's not an accurate I'm not", 'start': 59.399, 'duration': 
4.941}, {'text': 'going to waste my time', 'start': 61.379, 'duration': 2.961}, {'text': 'and I laughed and I enjoyed that because', 'start': 65.18, 'duration': 6.28}, {'text': 'it was so quaint', 'start': 68.04, 'duration': 5.7}, {'text': 'not because it was defeating anything', 'start': 71.46, 'duration': 3.659}, {'text': 'yeah', 'start': 73.74, 'duration': 5.4}, {'text': 'like you had that other movie of um', 'start': 75.119, 'duration': 7.5}, {'text': 'war that they can no this was this', 'start': 79.14, 'duration': 5.82}, {'text': 'fellow Salman Khan going under a tunnel', 'start': 82.619, 'duration': 5.281}, {'text': 'into Pakistan to deliver a girl who had', 'start': 84.96, 'duration': 4.88}, {'text': 'got legendary', 'start': 87.9, 'duration': 4.14}, {'text': 'but whatever', 'start': 89.84, 'duration': 4.86}, {'text': 'I mean', 'start': 92.04, 'duration': 2.66}, {'text': 'could I exaggerated okay this is not you', 'start': 95.46, 'duration': 5.4}, {'text': 'have to have entertainment which is fun', 'start': 99.0, 'duration': 4.079}, {'text': 'and realistic you should see that movie', 'start': 100.86, 'duration': 3.36}, {'text': 'The', 'start': 103.079, 'duration': 4.86}, {'text': 'Bridge of spies hey that is a real movie', 'start': 104.22, 'duration': 6.78}, {'text': 'okay that is how real spy movies are', 'start': 107.939, 'duration': 5.521}, {'text': 'made what does a real spy movie', 'start': 111.0, 'duration': 5.46}, {'text': 'constitute it means dealing with actual', 'start': 113.46, 'duration': 5.64}, {'text': 'facts no no blonde round no nothing', 'start': 116.46, 'duration': 4.74}, {'text': "around it's okay living a lonely life", 'start': 119.1, 'duration': 4.799}, {'text': "you're living on by yourself living your", 'start': 121.2, 'duration': 6.0}, {'text': 'cover story he able uh', 'start': 123.899, 'duration': 5.821}, {'text': 'with goldfish was actually a notice so', 'start': 127.2, 'duration': 3.839}, {'text': 'he was doing paintings he used 
to make', 'start': 129.72, 'duration': 3.78}, {'text': 'him make money out of it and but he was', 'start': 131.039, 'duration': 5.161}, {'text': 'doing this other job also so running is', 'start': 133.5, 'duration': 5.099}, {'text': 'espionage ring', 'start': 136.2, 'duration': 4.92}, {'text': 'and they show all that how a documents', 'start': 138.599, 'duration': 5.22}, {'text': 'are exchanged or document information is', 'start': 141.12, 'duration': 4.86}, {'text': 'exchanged you have things called letter', 'start': 143.819, 'duration': 5.941}, {'text': 'dead letter boxes a dead letter box in', 'start': 145.98, 'duration': 7.2}, {'text': 'in Espionage is a place it could be a', 'start': 149.76, 'duration': 6.42}, {'text': "book let's say or or that statue I put", 'start': 153.18, 'duration': 6.48}, {'text': 'my UBS under it', 'start': 156.18, 'duration': 5.279}, {'text': 'and leave it', 'start': 159.66, 'duration': 5.46}, {'text': 'and leave a sign outside on some tree or', 'start': 161.459, 'duration': 4.801}, {'text': 'a wall', 'start': 165.12, 'duration': 5.759}, {'text': "that I've I've fed the the dead litter", 'start': 166.26, 'duration': 6.42}, {'text': 'box okay so the other chap comes and', 'start': 170.879, 'duration': 3.661}, {'text': 'picks it up and takes it away the two', 'start': 172.68, 'duration': 4.26}, {'text': 'never meet based on the true nature of', 'start': 174.54, 'duration': 3.72}, {'text': 'espionage', 'start': 176.94, 'duration': 4.2}, {'text': "which Indian actor's style would be best", 'start': 178.26, 'duration': 7.259}, {'text': 'suited to portray the character of a spy', 'start': 181.14, 'duration': 6.84}, {'text': 'you know I I saw um', 'start': 185.519, 'duration': 4.921}, {'text': 'three three three actors were three or', 'start': 187.98, 'duration': 4.679}, {'text': 'four actors were very good this kind of', 'start': 190.44, 'duration': 3.299}, {'text': 'a thing', 'start': 192.659, 'duration': 3.901}, {'text': 'who could fit 
into these kind Sorrows', 'start': 193.739, 'duration': 6.481}, {'text': 'not giving any order of preference but', 'start': 196.56, 'duration': 7.02}, {'text': 'I like nawazuddin Siddiqui I used to', 'start': 200.22, 'duration': 5.599}, {'text': 'like Imran Khan', 'start': 203.58, 'duration': 4.439}, {'text': 'Irfan Khan sorry', 'start': 205.819, 'duration': 6.28}, {'text': 'and he was he was a consummate actor', 'start': 208.019, 'duration': 8.821}, {'text': 'Anup anupam care and', 'start': 212.099, 'duration': 8.241}, {'text': 'these two actors', 'start': 216.84, 'duration': 3.5}, {'text': 'the one who played family man um', 'start': 220.62, 'duration': 6.96}, {'text': 'very good okay they could fit into the', 'start': 224.84, 'duration': 8.02}, {'text': 'room and Mishra pankaj Mishra foreign', 'start': 227.58, 'duration': 5.28}, {'text': '[Music]', 'start': 233.72, 'duration': 3.11}, {'text': "spy all right it's a cold war story", 'start': 259.699, 'duration': 6.461}, {'text': "about the the it's actually based on", 'start': 263.52, 'duration': 5.179}, {'text': 'this Cambridge 5.', 'start': 266.16, 'duration': 5.9}, {'text': 'you know the Cambridge five those', 'start': 268.699, 'duration': 6.341}, {'text': 'Kim philby and others who were spying', 'start': 272.06, 'duration': 6.1}, {'text': 'for who were actually with the MI6 but', 'start': 275.04, 'duration': 6.0}, {'text': 'it was actually a KGB agent okay the', 'start': 278.16, 'duration': 5.58}, {'text': 'real mole and he would have been Chief', 'start': 281.04, 'duration': 4.08}, {'text': 'maybe one day', 'start': 283.74, 'duration': 4.08}, {'text': 'at the not been caught out', 'start': 285.12, 'duration': 7.579}, {'text': 'so on that is made a novel Tinker spy', 'start': 287.82, 'duration': 7.26}, {'text': "it's beautifully done the book is", 'start': 292.699, 'duration': 6.241}, {'text': 'marvelous and the acting and the', 'start': 295.08, 'duration': 3.86}, {'text': 'you should watch it okay and watch 
this', 'start': 302.78, 'duration': 6.04}, {'text': 'uh Bridge of spies if you enjoyed this', 'start': 305.88, 'duration': 5.9}, {'text': 'video subscribe TRS clips for more', 'start': 308.82, 'duration': 15.86}, {'text': '[Music]', 'start': 311.78, 'duration': 15.33}, {'text': 'thank you', 'start': 324.68, 'duration': 8.55}, {'text': '[Music]', 'start': 327.11, 'duration': 6.12}]''';
def main(video_id=None):
    """Run the pipeline: download, transcribe, pick segments, cut and crop.

    Args:
        video_id: YouTube video ID to process.

    Raises:
        ValueError: If no ``video_id`` is given.
    """
    # Check first so nothing is downloaded for a call that cannot succeed.
    if not video_id:
        raise ValueError("main() needs a YouTube video id, e.g. main('<video-id>')")

    url = 'https://www.youtube.com/watch?v=' + video_id
    filename = 'input_video.mp4'
    download_video(url, filename)

    transcript = get_transcript(video_id)
    interesting_segment = analyze_transcript(transcript)
    content = interesting_segment["content"]
    parsed_content = json.loads(content)

    # Cut input_video.mp4 into output000.mp4, output001.mp4, ...
    segment_video(parsed_content)

    # Loop through each segment that was actually returned
    for i in range(len(parsed_content)):
        input_file = f'output{str(i).zfill(3)}.mp4'
        output_file = f'output_cropped{str(i).zfill(3)}.mp4'
        faces = detect_faces(input_file)
        crop_video(faces, input_file, output_file)


# Run the main function
if __name__ == "__main__":
    if len(sys.argv) == 2:
        main(sys.argv[1])
    else:
        print("Usage: python autocropper.py <youtube-video-id>")
马建仓 AI 助手


344bd9b3 5694891 D2dac590 5694891