🔒 Safely modify shared resources & Explicit count (#143)

PatrickDuncan · web-flow · commit 7ff7a506e4a0 · 2019-03-18T15:14:46.000-04:00
diff --git a/cleansio/censor/censor.py b/cleansio/censor/censor.py
@@ -1,10 +1,15 @@
 """ Censors audio chunks by muting explicit sections """
 
+from multiprocessing import Lock
 from pydub import AudioSegment
 from speech import Timestamp, Transcribe
 
 class Censor():
     """ Superclass of CensorFile and CensorRealtime """
+    lock = Lock()
+    explicit_count = 0
+    muted_timestamps = []
+
     def __init__(self, explicits):
         super().__init__()
         self.explicits = explicits
@@ -23,12 +28,16 @@ def __mute_explicits(self, file_path, audio_segment, timestamps):
         """ Go through each word, if its an explicit, mute the duration """
         muted = False
         for stamp in timestamps:
-            if stamp['word'].lower() in self.explicits: # Explicit found, mute
+            if stamp['word'] in self.explicits: # Explicit found, mute
                 audio_segment = self.__mute_explicit(audio_segment, stamp)
                 muted = True
+                chunk_index = int(file_path.split('-')[-1].split('.')[0])
+                self.__explicit_count(stamp, chunk_index * 5000)
         if muted:
-            # Overwrite the chunk with the mute(s)
+            Censor.lock.acquire()
+            # Overwrite the chunk with the mute(s) safely
             audio_segment.export(file_path, format='wav')
+            Censor.lock.release()
 
     @classmethod
     def __mute_explicit(cls, audio_segment, timestamp):
@@ -51,3 +60,29 @@ def __get_lyrics(cls, file_path, audio_segment):
     @classmethod
     def __get_timestamps(cls, lyrics):
         return Timestamp(lyrics).timestamps
+
+    @classmethod
+    def __explicit_count(cls, stamp, chunk_offset):
+        """ Count the number of explicits safely """
+        stamp['start'] += chunk_offset
+        stamp['end'] += chunk_offset
+        new_stamp = True
+        Censor.lock.acquire()
+        for mut in Censor.muted_timestamps:
+            if cls.__duplicate_stamp(mut, stamp):
+                new_stamp = False
+                break
+        if new_stamp or not Censor.muted_timestamps:
+            Censor.explicit_count += 1
+            Censor.muted_timestamps.append(stamp)
+        Censor.lock.release()
+
+    @classmethod
+    def __duplicate_stamp(cls, stamp1, stamp2):
+        """ If 2 timestamps are the same word and start and end at relatively
+            the same time, then assume they're the same timestamp """
+        if stamp1['word'] == stamp2['word'] and            \
+          abs(stamp1['start'] - stamp2['start']) < 201 and \
+          abs(stamp1['end'] - stamp2['end']) < 201:
+            return True
+        return False
diff --git a/cleansio/censor/censor_file.py b/cleansio/censor/censor_file.py
@@ -41,6 +41,8 @@ def __censor_chunk(self, async_iter):
         return self.censor_audio_chunk(chunk_file_path)
 
     def __create_clean_file(self, clean_file):
+        print('Cleansio found {1}{0}{2} explicit(s)!'.format(
+            Censor.explicit_count, Fore.GREEN, Fore.RESET))
         clean_file.export(self.location, format=self.encoding)
         print(Fore.CYAN + 'Successfully created clean file, it\'s located at:')
         print(Fore.YELLOW + self.location)
diff --git a/cleansio/speech/timestamp.py b/cleansio/speech/timestamp.py
@@ -22,7 +22,7 @@ def __parse_timestamps(self):
         timestamps = []
         for word in self.lyrics:
             timestamps.append({
-                'word': word.word,
+                'word': word.word.lower(),
                 'start': gcs_time_to_ms(word.start_time),
                 'end': gcs_time_to_ms(word.end_time)
             })
diff --git a/cleansio/utils/cleanup.py b/cleansio/utils/cleanup.py
@@ -1,7 +1,5 @@
 """ Cleans up temporary files after the program runs """
 
-# environ - To read the environment variables which we use for communication
-# remove  - To remove the temporary files
 from atexit import register
 from os import environ, remove
 from signal import signal, SIGABRT, SIGILL, SIGINT, SIGSEGV, SIGTERM
diff --git a/tests/censor/test_censor.py b/tests/censor/test_censor.py
@@ -20,7 +20,7 @@ def test_censor():
         file_path = __get_file('/../data/testing.wav')
         audio_segment = AudioSegment.from_file(file_path)
         # Duplicate the audio file and begin muting the new file
-        file_path_duplicate = __get_file('/../data//testing-censored.wav')
+        file_path_duplicate = __get_file('/../data/testing-censored-0.wav')
         duplicate_file = audio_segment.export(file_path_duplicate, format = 'wav')
         audio_segment_duplicate = AudioSegment.from_file(file_path_duplicate)
         # Test that the explicits were successfully removed