youtube.py
import logging
import os
import re
from datetime import timedelta
from difflib import SequenceMatcher
from urllib.parse import urlparse, parse_qs

from langchain.docstore.document import Document
from youtube_transcript_api import YouTubeTranscriptApi

from src.shared.constants import YOUTUBE_CHUNK_SIZE_SECONDS
from src.shared.llm_graph_builder_exception import LLMGraphBuilderException

def get_youtube_transcript(youtube_id):
    try:
        proxy = os.environ.get("YOUTUBE_TRANSCRIPT_PROXY")
        proxies = {'https': proxy} if proxy else None
        # Returns a list of {'text', 'start', 'duration'} dicts
        # (youtube_transcript_api pre-1.0 get_transcript API).
        transcript_pieces = YouTubeTranscriptApi.get_transcript(youtube_id, proxies=proxies)
        return transcript_pieces
    except Exception as e:
        message = f"Youtube transcript is not available for youtube Id: {youtube_id}"
        raise LLMGraphBuilderException(message)
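
# Usage sketch (the video id below is illustrative, not part of this module):
#   pieces = get_youtube_transcript("dQw4w9WgXcQ")
#   pieces[0]  # e.g. {'text': '...', 'start': 0.0, 'duration': 4.2}
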
def get_youtube_combined_transcript(youtube_id):
    try:
        transcript_pieces = get_youtube_transcript(youtube_id)
        transcript = ''
        for piece in transcript_pieces:
            transcript += piece['text'] + " "
        return transcript
    except Exception as e:
        message = f"Youtube transcript is not available for youtube Id: {youtube_id}"
        raise LLMGraphBuilderException(message)
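
# Usage sketch: flattens the timed pieces into one plain-text string
# (illustrative id):
#   text = get_youtube_combined_transcript("dQw4w9WgXcQ")
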
def create_youtube_url(url):
    you_tu_url = "https://www.youtube.com/watch?v="
    u_pars = urlparse(url)
    quer_v = parse_qs(u_pars.query).get('v')
    if quer_v:
        return you_tu_url + quer_v[0].strip()
    # Fall back to the last path segment, e.g. youtu.be/<id> short links.
    pth = u_pars.path.split('/')
    if pth:
        return you_tu_url + pth[-1].strip()
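
# Usage sketch covering both branches (ids are illustrative):
#   create_youtube_url("https://www.youtube.com/watch?v=abc123def45&t=10")
#   # -> "https://www.youtube.com/watch?v=abc123def45"
#   create_youtube_url("https://youtu.be/abc123def45")
#   # -> "https://www.youtube.com/watch?v=abc123def45"
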
def get_documents_from_youtube(url):
    try:
        match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*', url)
        if not match:
            raise LLMGraphBuilderException(f"Could not extract a video id from url: {url}")
        youtube_id = match.group(1)
        transcript = get_youtube_transcript(youtube_id)
        transcript_content = ''
        counter = YOUTUBE_CHUNK_SIZE_SECONDS
        pages = []
        for td in transcript:
            transcript_content += td['text'] + " "
            # Close the current chunk once a piece starts beyond the window.
            if td['start'] >= counter:
                pages.append(Document(page_content=transcript_content.strip(), metadata={'start_timestamp': str(timedelta(seconds=counter - YOUTUBE_CHUNK_SIZE_SECONDS)).split('.')[0], 'end_timestamp': str(timedelta(seconds=td['start'])).split('.')[0]}))
                counter += YOUTUBE_CHUNK_SIZE_SECONDS
                transcript_content = ''
        # Flush the remainder; this also handles an empty transcript list.
        pages.append(Document(page_content=transcript_content.strip(), metadata={'start_timestamp': str(timedelta(seconds=counter - YOUTUBE_CHUNK_SIZE_SECONDS)).split('.')[0], 'end_timestamp': str(timedelta(seconds=transcript[-1]['start'] if transcript else counter)).split('.')[0]}))
        file_name = youtube_id  # the transcript API does not expose the video title
        return file_name, pages
    except Exception as e:
        error_message = str(e)
        logging.exception(f'Exception in reading transcript from youtube: {error_message}')
        raise LLMGraphBuilderException(error_message)
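
# Usage sketch: each Document spans one YOUTUBE_CHUNK_SIZE_SECONDS window
# (illustrative url; exact timestamps depend on the constant and the video):
#   name, pages = get_documents_from_youtube("https://www.youtube.com/watch?v=abc123def45")
#   pages[0].metadata  # e.g. {'start_timestamp': '0:00:00', 'end_timestamp': '0:01:00'}
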
def get_calculated_timestamps(chunks, youtube_id):
    logging.info('Calculating timestamps for chunks')
    transcript = get_youtube_transcript(youtube_id)
    for chunk in chunks:
        # Match the first and last 40 characters of the chunk against every
        # transcript segment; the best matches give the chunk's start and end.
        max_start_similarity = 0
        max_end_similarity = 0
        start_time = 0
        end_time = 0
        start_content = chunk.page_content[:40].strip().replace('\n', ' ')
        end_content = chunk.page_content[-40:].strip().replace('\n', ' ')
        for segment in transcript:
            segment['text'] = segment['text'].replace('\n', ' ')
            start_similarity = SequenceMatcher(None, start_content, segment['text'])
            end_similarity = SequenceMatcher(None, end_content, segment['text'])
            if start_similarity.ratio() > max_start_similarity:
                max_start_similarity = start_similarity.ratio()
                start_time = segment['start']
            if end_similarity.ratio() > max_end_similarity:
                max_end_similarity = end_similarity.ratio()
                end_time = segment['start'] + segment['duration']
        chunk.metadata['start_timestamp'] = str(timedelta(seconds=start_time)).split('.')[0]
        chunk.metadata['end_timestamp'] = str(timedelta(seconds=end_time)).split('.')[0]
    return chunks
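
# Usage sketch (assumes chunks were produced by a text splitter over the
# combined transcript; the id is illustrative):
#   chunks = get_calculated_timestamps(chunks, "abc123def45")
#   chunks[0].metadata  # gains 'start_timestamp' and 'end_timestamp'
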
def get_chunks_with_timestamps(chunks):
    logging.info('Adding end_timestamp to chunks')
    for chunk in chunks:
        # Each chunk is assumed to span 60 seconds from its 'start_seconds' metadata.
        chunk.metadata['end_timestamp'] = str(timedelta(seconds=chunk.metadata['start_seconds'] + 60)).split('.')[0]
    return chunks
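
# Minimal smoke test, a sketch only: it assumes network access and a
# transcript-enabled video (the url below is illustrative, not from this repo).
if __name__ == "__main__":
    name, docs = get_documents_from_youtube("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
    print(name, len(docs))
    for doc in docs[:3]:
        print(doc.metadata, doc.page_content[:60])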