# analysis.py
import ast
import base64
import configparser
import cv2
import datetime
#import enchant
import io
import json
import librosa
import metatagger
import moviepy.editor as mp
#import nltk
import noisereduce as nr
import numpy as np
import openai
import os
import PIL
import pytesseract
import re
import shutil
import sys
import time
import tkinter as tk
import torch
import torchvision.transforms as transforms
import tqdm
import traceback
import urllib.request
import vlc
#from alive_progress import alive_bar
from difflib import SequenceMatcher
from google.cloud import vision
from moviepy.editor import AudioFileClip
#from nltk.corpus import words
from openai import RateLimitError, BadRequestError, InternalServerError, APITimeoutError
from skimage.metrics import structural_similarity as ssim
#from spellchecker import SpellChecker
from tkinter import messagebox, ttk
from torch import hub
#from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor #Wav2Vec2Tokenizer
scriptPath = os.path.realpath(os.path.dirname(__file__))
config = configparser.ConfigParser()
config.read(os.path.join(scriptPath,'config.ini'))
json_file = config['directories']['json file']
openai_client = openai.OpenAI(api_key=config['analysis']['openai api key'])
#nltk.download('words')
# Define the transformation to apply to video frames
transform = transforms.Compose([
transforms.ToPILImage(),
transforms.Resize((112, 112)),
transforms.ToTensor(),
transforms.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989])
])
# Set up Tesseract OCR
#pytesseract.pytesseract.tesseract_cmd = config['analysis']['tesseract executable']
#tesseract_config = f"--psm 3 -l eng" # Set language to English
#ocr_confidence = float(config['analysis']['tesseract confidence'])
#spell = SpellChecker()
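# Tkinter frame that embeds a libVLC media player: play/pause/stop, single-frame
# stepping, percentage jumps, a click-to-seek progress bar, and volume control.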
class VideoPlayer(tk.Frame):
def __init__(self, master=None, media_path=None):
super().__init__(master)
self.media_path = media_path
self.instance = vlc.Instance("--no-xlib --quiet")
self.player = self.instance.media_player_new()
self.media = self.instance.media_new(media_path)
self.player.set_media(self.media)
self.video_canvas = tk.Canvas(self)
self.video_canvas.pack()
self.player.play()
self.player.set_hwnd(self.video_canvas.winfo_id())
# Create a progress bar and label
self.progress_var = tk.DoubleVar()
self.progress_bar = ttk.Progressbar(self, variable=self.progress_var, maximum=100, length=375)
self.progress_bar.pack(fill=tk.Y, padx=10)
# Bind the click event to seek_to_position method
self.progress_bar.bind("<Button-1>", self.seek_to_position)
# Start checking the video state
self.update_progress()
def update_progress(self):
state = self.player.get_state()
if state == vlc.State.Playing or state == vlc.State.Paused:
total_time = self.player.get_length() / 1000 # Total time in seconds
if total_time > 0:
current_time = self.player.get_time() / 1000 # Current time in seconds
self.progress_var.set((current_time / total_time) * 100)
if state == vlc.State.Ended:
self.player.stop()
self.player.set_time(0)
self.player.play()
self.after(100, self.update_progress)
def load_media(self, new_media_path):
# Load a new media source
new_media = self.instance.media_new(new_media_path)
self.player.set_media(new_media)
self.player.play() # Start playing the new video
def seek_to_position(self, event):
# Get the position where the click occurred
click_position = event.x / self.progress_bar.winfo_width()
# Set the player's position to the target time
total_time = self.player.get_length() / 1000 # Total time in seconds
target_time = total_time * click_position
self.player.set_time(int(target_time * 1000))
def play(self):
self.player.play()
def pause(self):
self.player.pause()
def stop(self):
self.player.stop()
def release(self):
self.player.stop()
self.player.release()
def step_backward(self):
fps = self.player.get_fps() # Get the frames per second of the video
current_time = self.player.get_time() # Get the current playback time in milliseconds
frame_duration_ms = int(1000 / fps) # Duration of each frame in milliseconds
target_time = max(current_time - frame_duration_ms, 0) # Move back one frame (in milliseconds)
self.player.set_time(target_time) # Seek to the new time
self.update_progress()
def step_forward(self):
fps = self.player.get_fps() # Get the frames per second of the video
current_time = self.player.get_time() # Get the current playback time in milliseconds
frame_duration_ms = int(1000 / fps) # Duration of each frame in milliseconds
target_time = min(current_time + frame_duration_ms, self.media.get_duration()) # Move forward one frame (in milliseconds)
self.player.set_time(target_time) # Seek to the new time
self.update_progress()
    def seek_to_frame(self, frame_number):
        fps = self.player.get_fps()  # vlc.Media has no get_fps(); query the player instead
        target_time = int((frame_number / fps) * 1000)
        self.player.set_time(target_time)
        self.update_progress()
def go_to_first_frame(self):
if self.player.is_playing():
self.player.pause()
self.player.set_position(0.0)
self.update_progress()
def go_to_last_frame(self):
if self.player.is_playing():
self.player.pause()
duration_ms = self.player.get_length()
frame_rate = self.player.get_fps()
total_frames = int(duration_ms * frame_rate / 1000)
last_frame_index = total_frames - 1
self.player.set_time(int((last_frame_index / frame_rate) * 1000))
self.update_progress()
    def go_forward_x_percent(self, x=1):
        duration_ms = self.player.get_length()
        step_ms = duration_ms // 100  # 1% of the video duration in milliseconds
        current_position_ms = self.player.get_time()
        new_position_ms = int(current_position_ms + step_ms * x)  # cast: x may be fractional and set_time() expects an int
        if new_position_ms <= duration_ms:
            self.player.set_time(new_position_ms)
            self.update_progress()
    def go_backward_x_percent(self, x=1):
        duration_ms = self.player.get_length()
        step_ms = duration_ms // 100  # 1% of the video duration in milliseconds
        current_position_ms = self.player.get_time()
        new_position_ms = int(current_position_ms - step_ms * x)  # cast: x may be fractional and set_time() expects an int
        if new_position_ms >= 0:
            self.player.set_time(new_position_ms)
            self.update_progress()
def set_volume(self, volume):
self.player.audio_set_volume(volume)
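# Polls the player roughly every 33 ms (rescheduling itself via tkinter's after())
# and writes a "frame/frames | elapsed/total" readout into the timecode StringVar.
# frame_offset shifts displayed frame numbers for clips cut from a longer tape.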
def update_timecode(video_player,timecode,gui_window,frame_offset=0):
current_ms = video_player.player.get_time()
total_ms = video_player.player.get_length()
frame_rate = video_player.player.get_fps()
current_hours, remaining_minutes = divmod(current_ms // 1000, 3600)
current_minutes, current_seconds = divmod(remaining_minutes, 60)
total_hours, total_remaining_minutes = divmod(total_ms // 1000, 3600)
total_minutes, total_seconds = divmod(total_remaining_minutes, 60)
current_frame = str(int(current_ms / 1000 * frame_rate)+frame_offset)
total_frames = str(int(total_ms / 1000 * frame_rate)+frame_offset)
if int(total_hours) > 0:
timecode_string = (
f"{current_frame}/{total_frames} | "
f"{current_hours:02d}:{current_minutes:02d}:{current_seconds:02d}/"
f"{total_hours:02d}:{total_minutes:02d}:{total_seconds:02d}"
)
else:
timecode_string = (
f"{current_frame}/{total_frames} | "
f"{current_minutes:02d}:{current_seconds:02d}/"
f"{total_minutes:02d}:{total_seconds:02d}"
)
timecode.set(timecode_string)
gui_window.after(33, lambda: update_timecode(video_player,timecode,gui_window,frame_offset))
return current_ms,current_frame
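# Builds the player widget plus its volume and transport control frames for the
# given file and returns them for the caller to grid. The clip in/out entry
# widgets and "New Clip" button below are currently disabled (left inside a
# triple-quoted block).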
def show_video_player(window_object,file_path,item,frame_offset=0):
global start_frame_var
global end_frame_var
video_player = VideoPlayer(window_object, media_path=file_path)
# Update the entry widgets with current item values and populate the clip_widget_mapping list
volume_frame = tk.Frame(window_object)
# Add a volume bar
volume_scale = ttk.Scale(volume_frame, from_=100, to=0, orient=tk.VERTICAL, length=250, command=lambda v: video_player.player.audio_set_volume(round((float(v)))))
#volume_scale.set(75) # Set initial volume
volume_scale.grid(row=0, column=0)
# Add a mute button
mute_button = ttk.Button(volume_frame, text="Mute", command=lambda: video_player.player.audio_set_mute(not video_player.player.audio_get_mute()))
mute_button.grid(row=1, column=0, padx=5, sticky="nsew")
control_frame = ttk.Frame(window_object)
# Buttons to step back x%
extra_jump_back_button = ttk.Button(control_frame, text="<<<", command=lambda: video_player.go_backward_x_percent(x=5))
extra_jump_back_button.grid(row=0, column=0, sticky="nsew")
jump_back_button = ttk.Button(control_frame, text="<<", command=lambda: video_player.go_backward_x_percent(x=0.5))
jump_back_button.grid(row=0, column=1, sticky="nsew")
# Add buttons for step backward and step forward
step_backward_button = ttk.Button(control_frame, text="<", command=video_player.step_backward)
step_backward_button.grid(row=0, column=2, sticky="nsew")
pause_button = ttk.Button(control_frame, text="►/‖", command=video_player.pause)
pause_button.grid(row=0, column=3, sticky="nsew")
step_forward_button = ttk.Button(control_frame, text=">", command=video_player.step_forward)
step_forward_button.grid(row=0, column=4, sticky="nsew")
# Buttons to step forward x%
jump_forward_button = ttk.Button(control_frame, text=">>", command=lambda: video_player.go_forward_x_percent(x=0.5))
jump_forward_button.grid(row=0, column=5, sticky="nsew")
extra_jump_forward_button = ttk.Button(control_frame, text=">>>", command=lambda: video_player.go_forward_x_percent(x=5))
extra_jump_forward_button.grid(row=0, column=6, sticky="nsew")
timecode = tk.StringVar()
timecode.set('TIMECODE')
time_label = tk.Label(window_object,textvariable=timecode,font=("Lucida Console", 12, "normal"))
    current_ms, current_frame = update_timecode(video_player, timecode, window_object, frame_offset)
# Entry widgets to show the start and end frames
'''start_frame_var = tk.StringVar()
start_ms_var = tk.StringVar()
start_frame_entry = ttk.Entry(control_frame, textvariable=start_frame_var, width=8)
start_frame_entry.grid(row=1, column=2, padx=5, pady=5)
end_frame_var = tk.StringVar()
end_ms_var = tk.StringVar()
end_frame_entry = ttk.Entry(control_frame, textvariable=end_frame_var, width=8)
end_frame_entry.grid(row=1, column=3, padx=5, pady=5)
def set_current_frame(frame_offset,video_player,start=True):
current_ms = video_player.player.get_time()
frame_rate = video_player.player.get_fps()
current_frame = str(int(current_ms / 1000 * frame_rate)+frame_offset)
if start is True:
start_frame_var.set(current_frame)
start_ms_var.set(current_ms)
start_frame_entry.update()
else:
end_frame_var.set(current_frame)
end_ms_var.set(current_ms)
            end_frame_entry.update()
def set_start_end(frame_offset,video_player,start=True):
total_ms = video_player.player.get_length()
frame_rate = video_player.player.get_fps()
total_frames = str(int(total_ms / 1000 * frame_rate)+frame_offset)
if start is True:
start_frame_var.set(frame_offset)
start_frame_entry.update()
else:
end_frame_var.set(total_frames)
            end_frame_entry.update()
# Buttons to get the current frame and update start and end frame entry widgets
set_first_frame_button = ttk.Button(control_frame, text="First In",command=lambda: set_start_end(frame_offset,video_player,start=True))
set_first_frame_button.grid(row=1, column=0, padx=5, pady=5)
set_start_frame_button = ttk.Button(control_frame, text="Current In",command=lambda: set_current_frame(frame_offset,video_player,start=True))
set_start_frame_button.grid(row=1, column=1, padx=5, pady=5)
set_end_frame_button = ttk.Button(control_frame, text="Current Out",command=lambda: set_current_frame(frame_offset,video_player,start=False))
set_end_frame_button.grid(row=1, column=4, padx=5, pady=5)
set_last_frame_button = ttk.Button(control_frame, text="Last Out",command=lambda: set_start_end(frame_offset,video_player,start=False))
set_last_frame_button.grid(row=1, column=5, padx=5, pady=5)
# Button to initiate the new_video_clip function
def create_new_clip():
new_clip_data={}
new_clip_data['source_file'] = file_path
new_clip_data['tape_id'] = item.get("Tape ID", "")
if new_clip_data['tape_id'] == "":
new_clip_data['tape_id'] = item.get("Tape_ID", "")
new_clip_data['start_frame'] = int(start_frame_var.get())
new_clip_data['end_frame'] = int(end_frame_var.get())
# Calculate start_ms and end_ms based on frame numbers and frame rate
frame_rate = video_player.player.get_fps()
new_clip_data['start_ms'] = int(new_clip_data['start_frame'] / frame_rate * 1000)
new_clip_data['end_ms'] = int(new_clip_data['end_frame'] / frame_rate * 1000)
# Validate start_ms and end_ms
if new_clip_data['start_ms'] < new_clip_data['end_ms']:
new_video_clip(new_clip_data)
else:
error_message="Start Frame ["+str(new_clip_data['start_frame'])+"] is higher than End Frame ["+str(new_clip_data['end_frame'])+"]"
messagebox.showinfo("ERROR!", error_message,icon=messagebox.ERROR)
create_new_clip_button = ttk.Button(control_frame, text="New Clip",
command=create_new_clip)
create_new_clip_button.grid(row=1, column=6, padx=5, pady=5)'''
return video_player,volume_frame,control_frame,time_label
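# Cuts a sub-clip out of the source tape with editor.splitVideo, naming the
# output <tape_id>_<start_frame>-<end_frame>.<ext> next to the JSON file.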
def new_video_clip(new_clip_data):
    file_extension = os.path.splitext(os.path.basename(new_clip_data['source_file']))[1].lstrip('.')  # splitext handles filenames containing extra dots
base_file = os.path.join(os.path.dirname(json_file),new_clip_data['tape_id']+'.'+file_extension)
output_name = new_clip_data['tape_id']+'_'+str(new_clip_data['start_frame'])+'-'+str(new_clip_data['end_frame'])+'.'+file_extension
output_path = os.path.join(os.path.dirname(json_file),new_clip_data['tape_id'])
import editor
try:
editor.splitVideo(base_file,new_clip_data['start_ms']/1000,new_clip_data['end_ms']/1000,output_name,output_path)
except Exception as e:
print(e)
return
completion_message = f"{output_name} created in {output_path}"
messagebox.showinfo("Clip Creation Complete", completion_message,icon=messagebox.INFO)
def enhance_edges(image):
# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Apply Gaussian blur
blurred = cv2.GaussianBlur(gray, (5, 5), 0)
# Perform histogram equalization
equalized = cv2.equalizeHist(blurred)
# Apply adaptive thresholding
threshold = cv2.adaptiveThreshold(equalized, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2)
# Perform morphological operations (optional)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
morphed = cv2.morphologyEx(threshold, cv2.MORPH_CLOSE, kernel)
return morphed
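# Two frames count as "similar" when their grayscale SSIM score meets the threshold.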
def is_frame_similar(frame1, frame2, threshold):
# Convert frames to grayscale
frame1_gray = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
frame2_gray = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
# Calculate SSIM score between frames
score = ssim(frame1_gray, frame2_gray)
# If the SSIM score is above the threshold, frames are considered similar
return score >= threshold
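# Cheap local OCR pass (Tesseract on the grayscale frame) used to decide whether
# a frame is worth sending to the paid Cloud Vision text-detection API.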
def perform_local_text_detection(frame):
# Convert the frame to grayscale for better text recognition
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
# Apply image preprocessing techniques if necessary (e.g., thresholding, denoising)
# Perform text detection using Tesseract OCR
text = pytesseract.image_to_string(gray)
# Return the detected text if any
if text.strip():
return text.strip()
else:
return None
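# Runs Google Cloud Vision label detection on a frame (a raw OpenCV image, or a
# prebuilt vision.Image when is_vision_image is True) and keeps only labels
# scored at 0.9 or higher.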
def label_detection(client, image_content, is_vision_image=False):
    if not is_vision_image:
        _, frame_bytes = cv2.imencode('.jpg', image_content)
        image = vision.Image(content=frame_bytes.tobytes())
    else:
        image = image_content
    response = client.label_detection(image=image)
    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))
    labels = response.label_annotations
    label_descriptions = [label.description for label in labels if label.score >= 0.9]
    return label_descriptions
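# Same pattern as label_detection, but for Cloud Vision logo detection.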
def logo_detection(client, image_content, is_vision_image=False):
    if not is_vision_image:
        _, frame_bytes = cv2.imencode('.jpg', image_content)
        image = vision.Image(content=frame_bytes.tobytes())
    else:
        image = image_content
    response = client.logo_detection(image=image)
    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))
    logos = response.logo_annotations
    logo_descriptions = [logo.description for logo in logos if logo.score >= 0.9]
    return logo_descriptions
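# Walks the extracted frames, skipping the first few, near-duplicates (SSIM above
# similarity_threshold), and frames within 25 of the last hit; OCRs survivors
# locally, then escalates to Cloud Vision text and logo detection. Returns the
# deduplicated on-screen text, the Vision API call count, and detected logos.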
def detect_text_in_video(video_path, frames, x, y, similarity_threshold,client,redirector=None):
print("[ACTION] Extracting text from the video")
if redirector != None:
redirector.progress_var.set(0)
redirector.progress_widget['maximum'] = len(frames)
redirector.progress_label.config(text="")
frame_count = len(frames)
text = []
prev_texts = []
logo_descriptions = []
prev_frame = None # Store the previous frame
prev_f = -50
count = 0
#spell_checker = SpellChecker()
batch_size = 1
start_time = time.time()
p = 0
#with alive_bar(frame_count, force_tty=False) as bar:
for f, frame in enumerate(frames):
if f <= 5:
if redirector != None:
p+=1
progress_list = get_eta(start_time,p,frame_count)
progress(redirector.progress_widget,p,batch_size,redirector.progress_label,progress_list,redirector.progress_var)
#bar()
continue
if prev_frame is not None:
if is_frame_similar(frame, prev_frame, similarity_threshold) or prev_f > f-25:
# Skip text detection if the frame is similar to the previous frame
if redirector != None:
p+=1
progress_list = get_eta(start_time,p,frame_count)
progress(redirector.progress_widget,p,batch_size,redirector.progress_label,progress_list,redirector.progress_var)
#bar()
continue
detected_text = perform_local_text_detection(frame)
if detected_text:
#print("Text Detected!")
prev_f = f
# Convert frame to bytes
_, frame_bytes = cv2.imencode('.jpg', frame)
frame_image = vision.Image(content=frame_bytes.tobytes())
logo_set = set(logo_descriptions)
logo_data = set(logo_detection(client, frame_image, is_vision_image=True))
count += 1
logo_descriptions = list(logo_set.union(logo_data))
response = client.text_detection(image=frame_image)
count += 1
frame_text_data = response.text_annotations
if response.error.message:
raise Exception(
'{}\nFor more info on error messages, check: '
'https://cloud.google.com/apis/design/errors'.format(
response.error.message))
elif frame_text_data != []:
frame_text = frame_text_data[0].description.replace('\n',' ').strip()
                frame_text = re.sub(r'[^\w\s!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]', '', frame_text)  # '[' and ']' escaped so the character class closes where intended
text_width, text_height = cv2.getTextSize(frame_text, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)[0]
if frame_text != '':
frame_words = frame_text.split()
                    english_words = set(words.words())  # NOTE: needs nltk's words corpus; 'from nltk.corpus import words' is commented out above
                    real_frame_words = [word for word in frame_words if word.lower() in english_words or word.isdigit()]
if real_frame_words and len(real_frame_words) > 3:
filtered_text = ' '.join(real_frame_words)
# Check similarity with previous texts
similarity_found = any(similarity_ratio(filtered_text, prev_text) >= 0.90 for prev_text in prev_texts)
if not similarity_found and len(frames) > 1 and len(filtered_text) > 3:
text.append(filtered_text.strip())
prev_texts.append(filtered_text.strip())
print("[TEXT]: " + filtered_text)
#bar()
if redirector != None:
p+=1
progress_list = get_eta(start_time,p,frame_count)
progress(redirector.progress_widget,p,batch_size,redirector.progress_label,progress_list,redirector.progress_var)
prev_frame = frame # Update the previous frame
#video.release()
#cv2.destroyAllWindows()
print("[INFO] Detection Usage Count: "+str(count))
return text, count, logo_descriptions
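# Fuzzy string similarity in [0, 1] via difflib's SequenceMatcher.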
def similarity_ratio(a, b):
return SequenceMatcher(None, a, b).ratio()
'''def extract_audio_transcript(audio_file, language):
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name, language=language)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
desired_sampling_rate = 16000
waveform, sample_rate = librosa.load(audio_file, sr=desired_sampling_rate)
# Convert to mono if necessary
if len(waveform.shape) > 1 and waveform.shape[0] > 1:
waveform = waveform.mean(axis=0, keepdims=True)
# Perform noise reduction
waveform = nr.reduce_noise(y=waveform, sr=sample_rate)
# Preprocess the audio waveform
input_values = processor(waveform, sampling_rate=sample_rate, return_tensors="pt").input_values
# Perform speech-to-text inference
with torch.no_grad():
logits = model(input_values).logits
# Decode the predicted tokens
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
return transcription'''
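# Parses <tape_id>_<start>-<end>.<ext> out of the clip filename, then finds the
# JSON entry for that tape whose Segment Start/End (HH:MM:SS converted to frames
# at frame_rate) spans the clip's frame range; returns None when nothing matches.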
def get_video_metadata(json_file, clip_filename, frame_rate=30):
# Load the JSON data from the file
with open(json_file) as file:
data = json.load(file)
# Extract the tape ID, start frame, and end frame from the clip filename
#print(os.path.basename(clip_filename))
clip_parts = os.path.basename(clip_filename).split('_')
#print(clip_parts)
tape_id = clip_parts[0]
#print(tape_id)
frame_range = clip_parts[1].split('-')
#print(frame_range)
start_frame = int(frame_range[0])
end_frame = int(frame_range[1].split('.')[0])
# Find the matching entry in the JSON data
for entry in data:
if entry['Tape_ID'] == tape_id:
segment_start = entry['Segment Start']
segment_end = entry['Segment End']
start_time = sum(int(x) * 60 ** i for i, x in enumerate(reversed(segment_start.split(':'))))
end_time = sum(int(x) * 60 ** i for i, x in enumerate(reversed(segment_end.split(':'))))
# Convert start and end times to frames
start_frame_json = int(start_time * frame_rate)
end_frame_json = int(end_time * frame_rate)
if start_frame_json <= start_frame and end_frame_json >= end_frame:
return tape_id, entry['Network/Station'], entry['Programs'], entry['Recording Date'], entry['Location'], start_frame, end_frame
# Return None if no matching entry is found
return None
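# Builds a multimodal prompt (tape metadata, audio transcript, and base64 JPEG
# frames) and asks the configured GPT model for a summary, retrying on rate
# limits (waiting the parsed cooldown), content-policy rejections (images are
# dropped from the prompt), timeouts, and server errors.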
def generate_summary(audio_text,metadata,frames):
#screen_text_string = '\n'.join(screen_text)
# Join elements of nested lists, convert all elements to strings
'''logos = ['\n'.join(item) if isinstance(item, list) else item for item in logos]
# Flatten the list and convert elements to strings
logos = [item for sublist in logos for item in sublist]
logos_string = '\n'.join(logos)'''
video_context = ''
response = None
if metadata:
video_context += "This is the metadata for the program this video clip is from:\n Program: "+metadata[2]+'\nStation: '+metadata[1]+'\n Date Recorded: '+metadata[3]+'\n Tape ID: '+metadata[0]
'''if screen_text:
video_context += '\nThe following is text that was detected on screen during the clip, some of it may be repetitive in whole or part:\n'+screen_text_string'''
if audio_text:
video_context += '\nThe following text is a transcript of the audio from the video clip\n'+audio_text
'''if len(logos) > 0:
video_context += '\nThe following text is a list of logos detected in the video, ignore anything that did not exist before the air date.\n'+logos_string
if len(labels) > 0:
video_context += '\nThe following text is a list of labels of objects detected in three frames of the video, at the 10%, 50% and 90% duration points.\n'+logos_string'''
content = [{"type": "text","text": video_context}]
for frame in frames:
content.append({"type":"image_url","image_url":{"url":f"data:image/jpeg;base64,{frame}","detail":config['analysis']['vision detail level']}})
loops = 0
while True:
loops += 1
try:
response=openai_client.chat.completions.create(
model=config['analysis']['gpt model'],
messages=[
{"role": "system", "content": [config['analysis']['chatgpt role']]},
{"role": "assistant", "content": content},
{"role": "user", "content": [config['analysis']['chatgpt prompt']]}
],
max_tokens=4096,
)
break
except RateLimitError as e:
error_message = e.message
try:
match = re.search(r"'message': '(.*?)'", error_message)
if match:
error_text = match.group(1)
print("[ERROR] "+ error_text)
# Implement your retry logic based on the error information
time_pattern = re.compile(r'in (\d+m\d+s)')
match = time_pattern.search(error_text)
if match:
time_to_wait = match.group(1)
seconds_to_wait = int(time_to_wait.split('m')[0]) * 60 + int(time_to_wait.split('m')[1].rstrip('s'))
else:
seconds_to_wait = 15 * 60
retry_at = datetime.datetime.now() + datetime.timedelta(seconds=seconds_to_wait)
retry_at_str = retry_at.strftime('%H:%M:%S')
print("[INFO] Retrying at " + retry_at_str)
time.sleep(seconds_to_wait)
except Exception as e:
print(e)
#print(error_message)
retry_at = datetime.datetime.now()+datetime.timedelta(minutes=15)
retry_at_str = retry_at.strftime('%H:%M:%S')
print("[INFO] Retrying at "+retry_at_str)
time.sleep(900)
except BadRequestError as e:
error_message = e.message
match = re.search(r"'message': '(.*?)'", error_message)
if match:
error_text = match.group(1)
print("[ERROR] "+ error_text)
if loops < 3:
code_match = re.search(r"'code': '(.*?)'", error_message)
if code_match:
code = code_match.group(1)
if code == "content_policy_violation":
content = [{"type": "text","text": video_context}]
retry_at = datetime.datetime.now()+datetime.timedelta(seconds=loops*loops)
retry_at_str = retry_at.strftime('%H:%M:%S')
print("[INFO] Retrying at "+retry_at_str)
time.sleep(loops*loops)
else:
if loops <= 5:
retries = loops - 2
retry_at = datetime.datetime.now()+datetime.timedelta(seconds=retries*retries)
retry_at_str = retry_at.strftime('%H:%M:%S')
print("[INFO] Retrying at "+retry_at_str)
time.sleep(retries*retries)
else:
break
except (APITimeoutError, InternalServerError) as e:
error_message = e.message
match = re.search(r"'message': '(.*?)'", error_message)
if match:
error_text = match.group(1)
print("[ERROR] "+ error_text)
if loops <= 6:
retry_at = datetime.datetime.now()+datetime.timedelta(seconds=loops*loops)
retry_at_str = retry_at.strftime('%H:%M:%S')
print("[INFO] Retrying at "+retry_at_str)
time.sleep(loops*loops)
else:
break
#print(response)
    if response is None:
        raise RuntimeError("Summary generation failed after repeated API errors")
    finish_details = response.choices[0].finish_details['type']
summary = response.choices[0].message.content
tokens_used = response.usage.total_tokens
input_tokens = response.usage.prompt_tokens
output_tokens = response.usage.completion_tokens
#print(response)
return summary, tokens_used, input_tokens, output_tokens, finish_details
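# Returns (time elapsed, frames per second, estimated time remaining) strings
# for the progress readout, given a start time and frames processed so far.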
def get_eta(start_time,f,total_frames):
seconds_elapsed = time.time() - start_time
elapsed_minutes, elapsed_seconds = divmod(seconds_elapsed, 60)
elapsed_hours, elapsed_minutes = divmod(elapsed_minutes, 60)
if elapsed_hours > 0:
time_elapsed = "{:02d}:{:02d}:{:02d}".format(round(elapsed_hours), round(elapsed_minutes), round(elapsed_seconds))
else:
time_elapsed = "{:02d}:{:02d}".format(round(elapsed_minutes), round(elapsed_seconds))
try:
frames_per_second = f / seconds_elapsed
except ZeroDivisionError:
frames_per_second = 0.001
remaining_frames = total_frames - f
if frames_per_second > 0:
remaining_time = remaining_frames / frames_per_second
else:
remaining_time = 0
minutes, seconds = divmod(remaining_time, 60)
hours, minutes = divmod(minutes, 60)
if hours > 0:
time_remaining = "{:02d}:{:02d}:{:02d}".format(round(hours), round(minutes), round(seconds))
else:
time_remaining = "{:02d}:{:02d}".format(round(minutes), round(seconds))
return time_elapsed, frames_per_second, time_remaining
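# Updates the GUI progress bar and its "percent done, n/total, elapsed<remaining, f/s" label.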
def progress(progress_widget,frames_processed,batch_size,progress_label,progress_list,progress_var):
if progress_widget is not None and frames_processed % batch_size == 0:
time_elapsed, frames_per_second, time_remaining = progress_list
#print(frames_processed)
#print(progress_widget['maximum'])
percentage_complete = round((frames_processed/progress_widget['maximum'])*100)
#print(percentage_complete)
frames_per_second = "{:.2f}".format(frames_per_second)
        progress_label.config(text=f"{percentage_complete}% {frames_processed}/{progress_widget['maximum']}, {time_elapsed}<{time_remaining}, {frames_per_second}f/s")
progress_widget['value'] = frames_processed
progress_var.set(frames_processed)
progress_widget.update()
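# Main pipeline for one clip: look up metadata, extract the audio (compressing
# to MP3 when the WAV exceeds the ~25 MB transcription limit), transcribe it
# with Whisper, sample frames at the configured interval, generate a GPT
# summary, and log API usage.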
def analyze_video(video_path,redirector,window=None):
if window:
video_player,volume_frame,control_frame,time_label = show_video_player(window,video_path,{"Tape ID": os.path.basename(video_path).split('_')[0]},int(os.path.basename(video_path).split('_')[1].split('-')[0]))
video_player.grid(row=3, column=5, columnspan=10, rowspan=5, padx=1, pady=1, sticky="nsew")
volume_frame.grid(row=3, column=17, rowspan=5, columnspan=3)
time_label.grid(row=8,column=5,columnspan=10)
control_frame.grid(row=9, column=0, columnspan=20, rowspan=2, sticky=tk.N)
starting_time = datetime.datetime.now()
try:
# Open the video file
directory, video_file_name = os.path.split(video_path)
# Get Metadata
metadata = get_video_metadata(json_file, video_file_name)
if metadata:
tape_id, station, program, recording_date, location, start_frame, end_frame = metadata
print(f"Tape ID: {tape_id}")
print(f"Program: {program}")
print(f"Station: {station}")
print(f"Date Recorded: {recording_date}")
print(f"Location Recorded: {location}")
else:
print("[INFO] Metadata not found, applying default values")
tape_id = "AAA-000"
station = "UNK"
recording_date = "1970-01-01"
program = "Unknown"
location = "USA"
            # Filename convention is <tape_id>_<startframe>-<endframe>.<ext> (see new_video_clip)
            frame_range = os.path.splitext(video_file_name)[0].split('_')[1].split('-')
            start_frame = int(frame_range[0])
            end_frame = int(frame_range[1])
video = cv2.VideoCapture(video_path)
assert video.isOpened()
# Obtain video properties
x_shape = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
y_shape = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = video.get(cv2.CAP_PROP_FPS)
# Create a JSON object to store the results
output = {"frames": []}
frames = []
gray_frames = []
blurred_frames = []
threshold_frames = []
morphed_frames = []
sharpened_frames = []
processed_frames = []
base64_frames = []
video_name = os.path.basename(video_path)
video_name = os.path.splitext(video_name)[0]
print("[INFO] Extracting audio from video using MoviePy")
audio_file = os.path.join(scriptPath, f"{video_name}.wav")
video_mp = mp.VideoFileClip(video_path)
video_mp.audio.write_audiofile(audio_file, verbose=False, logger=None)
video_duration = video_mp.duration
video_mp.close()
wav_size = os.path.getsize(audio_file)
target_size = 26214400
if wav_size > target_size:
            audio = AudioFileClip(audio_file)
            mp3_file = os.path.join(scriptPath, f"{video_name}.mp3")
            audio.write_audiofile(mp3_file, codec='mp3', bitrate='192k', logger="bar")
            audio.close()  # release the .wav file handle so the file can be deleted later
            print(f"File compressed to MP3: {mp3_file}")
print("[ACTION] Extracting audio transcript")
audio_mp3 = open(os.path.join(scriptPath, f"{video_name}.mp3"), 'rb')
audio_text = openai_client.audio.transcriptions.create(model="whisper-1", file=audio_mp3)
audio_mp3.close()
os.remove(os.path.join(scriptPath, f"{video_name}.mp3"))
if redirector != None:
redirector.progress_widget['value'] = 100
redirector.progress_label.config(text="")
else:
print("[ACTION] Extracting audio transcript")
if redirector != None:
redirector.progress_widget['value'] = 0
redirector.progress_label.config(text="")
audio_wav = open(os.path.join(scriptPath, f"{video_name}.wav"), 'rb')
audio_text = openai_client.audio.transcriptions.create(model="whisper-1", file=audio_wav)
audio_wav.close()
print("[TRANSCRIPTION] "+audio_text.text)
os.remove(os.path.join(scriptPath, f"{video_name}.wav"))
if os.path.exists(os.path.join(scriptPath, f"{video_name}.mp3")):
os.remove(os.path.join(scriptPath, f"{video_name}.mp3"))
print("[ACTION] Processing Frames")
cap = cv2.VideoCapture(video_path)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
if redirector != None:
try:
redirector.progress_widget['value'] = 0
redirector.progress_widget['maximum'] = frame_count
redirector.progress_label.config(text="")
except Exception as e:
print("[ERROR] "+str(e))
pass
# Preprocess video frames
transform = transforms.Compose([
transforms.Resize((112, 112)),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
batch_size = 1 # Adjust the batch size as needed
start_time = time.time()
f = 0
#with alive_bar(frame_count, force_tty=False) as bar:
while True:
if redirector != None:
progress_list = get_eta(start_time,f,frame_count)
progress(redirector.progress_widget,f,batch_size,redirector.progress_label,progress_list,redirector.progress_var)
ret, frame = video.read()
            if not ret:
                break  # end of stream (the old alive_bar fill loop here was a no-op and has been dropped)
# Convert to grayscale
'''gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
# Apply Gaussian blur
blurred = cv2.GaussianBlur(gray, (0, 0), 1)
equalized = cv2.equalizeHist(blurred)
# Calculate the sharpening mask
mask = cv2.addWeighted(gray, 1 + 1.5, equalized, -1.5, 0)
# Convert mask to color image
mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)
sharpened = cv2.add(frame, mask)
# Perform histogram equalization
pil_frame = PIL.Image.fromarray(np.uint8(frame))
processed = transform(pil_frame)'''
_, frame_buffer = cv2.imencode('.jpg', frame)
frame_base64 = base64.b64encode(frame_buffer).decode('utf-8')
# Perform morphological operations (optional)
#kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
#morphed = cv2.morphologyEx(equalized, cv2.MORPH_CLOSE, kernel)
'''frames.append(frame)
gray_frames.append(gray)
blurred_frames.append(blurred)
#threshold_frames.append(threshold)
sharpened_frames.append(sharpened)
morphed_frames.append(morphed)
processed_frames.append(processed)'''
base64_frames.append(frame_base64)
#bar()
f += 1
video.release()
cap.release()
        if redirector != None:
            progress_list = get_eta(start_time, frame_count, frame_count)
            progress(redirector.progress_widget, frame_count, batch_size, redirector.progress_label, progress_list, redirector.progress_var)
count = 0
#text = extract_text_from_video(blurred_frames, x_shape, y_shape)
'''client = vision.ImageAnnotatorClient()
text, count, logos = detect_text_in_video(video_path, frames, x_shape, y_shape, 0.9, client,redirector)
for i, t in enumerate(text):
print(str(i)+': '+str(t))
print()'''
chosen_frames = []
labels = []
frame_interval = int(fps*float(config['analysis']['seconds per image']))
for i in range(int(fps), len(base64_frames), frame_interval):
chosen_frames.append(base64_frames[i])
'''# Calculate frame indices for the desired points
interval_size = len(frames) // 3
middle_frame_index = len(frames) // 2
quarter_frame_index = len(frames) // 10
three_quarters_frame_index = 9 * len(frames) // 10
# Choose frames from the calculated indices
chosen_frames.append(base64_frames[middle_frame_index])
chosen_frames.append(base64_frames[quarter_frame_index])
chosen_frames.append(base64_frames[three_quarters_frame_index])'''
'''for choice in chosen_frames:
logos.append(logo_detection(client, choice))
detected_labels = label_detection(client, choice)
for label in detected_labels:
labels.append(label)
count += 2
for i, t in enumerate(logos):
print(str(i)+': '+str(t))
for i, t in enumerate(labels):
print(str(i)+': '+str(t))'''
video.release()
#cv2.destroyAllWindows()
attempt = 0
while True:
try:
attempt += 1
print("[ACTION] Generating a summary using OpenAI")
summary, tokens_used, input_tokens, output_tokens, finish_details = generate_summary(audio_text.text, metadata, chosen_frames)
print("Summary generated with complete code: "+finish_details)
'''if finish_details == "stop":
print(summary)'''
print("Input Tokens Used: "+str(input_tokens),end='\n\n')
print("Output Tokens Used: "+str(output_tokens),end='\n\n')
try:
print("Saving API Usage Data")
api_usage_json = "api_usage.json"
if os.path.exists(api_usage_json):
with open(api_usage_json, "r") as jsonfile:
existing_api_usage = json.load(jsonfile)
else:
existing_api_usage = {}
current_month_year = datetime.datetime.now().strftime("%Y-%m")
chatgpt_usage = int(tokens_used)
chatgpt_input = int(input_tokens)
chatgpt_output = int(output_tokens)
if attempt == 1:
whisper_usage = round(video_duration)
#vision_usage = count
else:
whisper_usage = 0
#vision_usage = 0
if current_month_year in existing_api_usage:
month_data = existing_api_usage[current_month_year]
else:
month_data = {
'chatgpt': {
'usage': 0,
'input': 0,
'output': 0,
'cost': 0
},
'whisper': {
'usage': 0,
'cost': 0
}
}
existing_api_usage[current_month_year] = month_data
except Exception as e:
print("[ERROR]"+str(e))
traceback.print_exc()