unil-ish · Sophwrd · Apr 30, 2024 · Apr 30, 2024 · Apr 30, 2024 · Apr 30, 2024
diff --git a/character.py b/character.py
@@ -0,0 +1,122 @@
+# code pour la classe Character
+import pandas as pd
+
+
+class Character:
+    """Character metadata kept as parallel class-level lists.
+
+    Index i of every ``all_*`` list refers to the same character, so pairing
+    ``all_characters_id`` with another list gives an id -> value mapping.
+    Lookups for an id that is not registered raise ``KeyError``.
+    """
+
+    # Class attributes, shared by all instances.
+    all_characters_id = []  # list of every character id
+    all_names_id = []
+    all_movies_id = []
+    all_genders_id = []
+    all_credits_positions_id = []
+
+    def __init__(self, character_id, name_id, movie_id, gender_id, credits_position_id):
+        """Store the fields of one particular character on the instance."""
+        self.character_id = character_id
+        self.name_id = name_id
+        self.movie_id = movie_id
+        self.gender_id = gender_id
+        self.credits_position_id = credits_position_id
+
+    def get_name_id(self, character_id):
+        """Return the name entry paired with *character_id*."""
+        return dict(zip(self.all_characters_id, self.all_names_id))[character_id]
+
+    def get_movie_id(self, character_id):
+        """Return the movie entry paired with *character_id*."""
+        return dict(zip(self.all_characters_id, self.all_movies_id))[character_id]
+
+    def get_gender_id(self, character_id):
+        """Return the gender entry paired with *character_id*."""
+        return dict(zip(self.all_characters_id, self.all_genders_id))[character_id]
+
+    def get_credits_position_id(self, character_id):
+        """Return the credits-position entry paired with *character_id*."""
+        # The class list is spelled with plural "positions"; no singular variant exists.
+        return dict(zip(self.all_characters_id, self.all_credits_positions_id))[character_id]
+
+    # Read-only instance views of the class-level lists.  They only work on an
+    # instance: ``Character._all_names_id`` is the property object itself.
+    @property
+    def _all_names_id(self):
+        return self.all_names_id
+
+    @property
+    def _all_movies_id(self):
+        return self.all_movies_id
+
+    @property
+    def _all_genders_id(self):
+        return self.all_genders_id
+
+    @property
+    def _all_credits_positions_id(self):
+        return self.all_credits_positions_id
+
+    @property
+    def _all_characters_id(self):
+        return self.all_characters_id
+
+    # Bulk lookups.  They read the lists through ``cls`` because the properties
+    # above are not iterable when accessed on the class (zip raises TypeError).
+    @classmethod
+    def get_all_names(cls, id_list):
+        """Return the name entry of each id in *id_list*, in the same order."""
+        by_id = dict(zip(cls.all_characters_id, cls.all_names_id))
+        return [by_id[char_id] for char_id in id_list]
+
+    @classmethod
+    def get_all_movies_id(cls, id_list):
+        """Return the movie entry of each id in *id_list*, in the same order."""
+        by_id = dict(zip(cls.all_characters_id, cls.all_movies_id))
+        return [by_id[char_id] for char_id in id_list]
+
+    @classmethod
+    def get_all_genders_id(cls, id_list):
+        """Return the gender entry of each id in *id_list*, in the same order."""
+        by_id = dict(zip(cls.all_characters_id, cls.all_genders_id))
+        return [by_id[char_id] for char_id in id_list]
+
+    @classmethod
+    def get_all_credits_positions_id(cls, id_list):
+        """Return the credits-position entry of each id in *id_list*, in order."""
+        by_id = dict(zip(cls.all_characters_id, cls.all_credits_positions_id))
+        return [by_id[char_id] for char_id in id_list]
+
+    @classmethod
+    def create_dataframe(cls, list_ids, attribute_list):
+        """Build a DataFrame from ``dict(zip(list_ids, attribute_list))``.
+
+        Each id becomes a column label.  NOTE(review): pandas rejects a dict of
+        plain scalars when no index is given, so the attributes are presumably
+        list-like -- confirm against callers.
+        """
+        return pd.DataFrame(dict(zip(list_ids, attribute_list)))
+
+
+class CharacterHolder:
+    """Reports whether the ``Character`` class-level lists hold any data."""
+
+    def get_character(self):
+        """Delegate to ``character_dataset`` and return its boolean."""
+        return self.character_dataset()
+
+    @staticmethod
+    def character_dataset():
+        """Return True only when all five ``Character`` lists are non-empty."""
+        # An empty list is falsy, so all() is True only if none is empty.
+        lists = (Character.all_characters_id, Character.all_names_id,
+                 Character.all_movies_id, Character.all_genders_id,
+                 Character.all_credits_positions_id)
+        return all(lists)
+
+    def create_character_dataset(self, provided_data):
+        """Placeholder: ignores *provided_data* and returns None."""
+        return None
diff --git a/movie.py b/movie.py
@@ -0,0 +1,146 @@
+# code pour la classe Movie
+import pandas as pd
+
+
+class Movie:
+    """Movie metadata kept as parallel class-level lists.
+
+    Index i of every ``all_*`` list refers to the same movie, so pairing
+    ``all_movies_id`` with another list gives an id -> value mapping.
+    Lookups for an id that is not registered raise ``KeyError``.
+    """
+
+    # Class attributes, shared by all instances.
+    all_movies_id = []  # list of every movie id
+    all_titles_id = []
+    all_release_years_id = []
+    all_ratings_id = []
+    all_votes_id = []
+    all_genres_id = []
+
+    def __init__(self, movie_id, title_id, release_year_id, ratings_id, votes_id, genres_id):
+        """Store the fields of one particular movie on the instance."""
+        self.movie_id = movie_id
+        self.title_id = title_id
+        self.release_year_id = release_year_id
+        self.ratings_id = ratings_id
+        self.votes_id = votes_id
+        self.genres_id = genres_id
+
+    # Single lookups: pair the id list with one attribute list in a dict,
+    # then index that dict with the requested movie id.
+    def get_title_id(self, movie_id):
+        """Return the title entry paired with *movie_id*."""
+        return dict(zip(self.all_movies_id, self.all_titles_id))[movie_id]
+
+    def get_release_year_id(self, movie_id):
+        """Return the release-year entry paired with *movie_id*."""
+        return dict(zip(self.all_movies_id, self.all_release_years_id))[movie_id]
+
+    def get_ratings_id(self, movie_id):
+        """Return the rating entry paired with *movie_id*."""
+        return dict(zip(self.all_movies_id, self.all_ratings_id))[movie_id]
+
+    def get_votes_id(self, movie_id):
+        """Return the votes entry paired with *movie_id*."""
+        return dict(zip(self.all_movies_id, self.all_votes_id))[movie_id]
+
+    def get_genres_id(self, movie_id):
+        """Return the genres entry paired with *movie_id*."""
+        return dict(zip(self.all_movies_id, self.all_genres_id))[movie_id]
+
+    # Read-only instance views of the class-level lists.  They only work on an
+    # instance: ``Movie._all_titles_id`` is the property object itself.
+    @property
+    def _all_titles_id(self):
+        return self.all_titles_id
+
+    @property
+    def _all_release_years_id(self):
+        return self.all_release_years_id
+
+    @property
+    def _all_ratings_id(self):
+        return self.all_ratings_id
+
+    @property
+    def _all_votes_id(self):
+        return self.all_votes_id
+
+    @property
+    def _all_genres_id(self):
+        return self.all_genres_id
+
+    @property
+    def _all_movies_id(self):
+        return self.all_movies_id
+
+    # Bulk lookups.  They read the lists through ``cls`` because the properties
+    # above are not iterable when accessed on the class (zip raises TypeError).
+    @classmethod
+    def get_all_titles(cls, id_list):
+        """Return the title entry of each id in *id_list*, in the same order."""
+        by_id = dict(zip(cls.all_movies_id, cls.all_titles_id))
+        return [by_id[film_id] for film_id in id_list]
+
+    @classmethod
+    def get_all_release_years(cls, id_list):
+        """Return the release-year entry of each id in *id_list*, in order."""
+        by_id = dict(zip(cls.all_movies_id, cls.all_release_years_id))
+        return [by_id[film_id] for film_id in id_list]
+
+    @classmethod
+    def get_all_ratings_id(cls, id_list):
+        """Return the rating entry of each id in *id_list*, in the same order."""
+        by_id = dict(zip(cls.all_movies_id, cls.all_ratings_id))
+        return [by_id[film_id] for film_id in id_list]
+
+    @classmethod
+    def get_all_votes_id(cls, id_list):
+        """Return the votes entry of each id in *id_list*, in the same order."""
+        by_id = dict(zip(cls.all_movies_id, cls.all_votes_id))
+        return [by_id[film_id] for film_id in id_list]
+
+    @classmethod
+    def get_all_genres_id(cls, id_list):
+        """Return the genres entry of each id in *id_list*, in the same order."""
+        by_id = dict(zip(cls.all_movies_id, cls.all_genres_id))
+        return [by_id[film_id] for film_id in id_list]
+
+    @classmethod
+    def create_dataframe(cls, list_ids, attribute_list):
+        """Build a DataFrame from ``dict(zip(list_ids, attribute_list))``.
+
+        Each id becomes a column label.  NOTE(review): pandas rejects a dict of
+        plain scalars when no index is given, so the attributes are presumably
+        list-like -- confirm against callers.
+        """
+        return pd.DataFrame(dict(zip(list_ids, attribute_list)))
+
+
+class MovieHolder:
+    """Reports whether the ``Movie`` class-level lists hold any data."""
+
+    def get_movie(self):
+        """Delegate to ``movie_dataset`` and return its boolean."""
+        return self.movie_dataset()
+
+    @staticmethod
+    def movie_dataset():
+        """Return True only when all six ``Movie`` lists are non-empty."""
+        # An empty list is falsy, so all() is True only if none is empty.
+        lists = (
+            Movie.all_movies_id,
+            Movie.all_titles_id,
+            Movie.all_release_years_id,
+            Movie.all_ratings_id,
+            Movie.all_votes_id,
+            Movie.all_genres_id,
+        )
+        return all(lists)
+
+    def create_movie_dataset(self, provided_data):
+        """Placeholder: ignores *provided_data* and returns None."""
+        # Sketch from the original docstring (not implemented):
+        #     movie_data = read_data(provided_data); for movie in movie_data: ...
+        return None
diff --git a/movie_dialog/README.md b/movie_dialog/README.md
@@ -0,0 +1 @@
+Dossier pour les Data unzipé
diff --git a/movie_dialog/README.txt b/movie_dialog/README.txt
@@ -0,0 +1,113 @@
+Cornell Movie-Dialogs Corpus
+
+Distributed together with:
+
+"Chameleons in imagined conversations: A new approach to understanding coordination of linguistic style in dialogs"
+Cristian Danescu-Niculescu-Mizil and Lillian Lee
+Proceedings of the Workshop on Cognitive Modeling and Computational Linguistics, ACL 2011.
+
+(this paper is included in this zip file)
+
+NOTE: If you have results to report on these corpora, please send email to cristian@cs.cornell.edu or llee@cs.cornell.edu so we can add you to our list of people using this data.  Thanks!
+
+
+Contents of this README:
+
+	A) Brief description
+	B) Files description
+	C) Details on the collection procedure
+	D) Contact
+
+
+A) Brief description:
+
+This corpus contains a metadata-rich collection of fictional conversations extracted from raw movie scripts:
+
+- 220,579 conversational exchanges between 10,292 pairs of movie characters
+- involves 9,035 characters from 617 movies
+- in total 304,713 utterances
+- movie metadata included:
+	- genres
+	- release year
+	- IMDB rating
+	- number of IMDB votes
+	- IMDB rating
+- character metadata included:
+	- gender (for 3,774 characters)
+	- position on movie credits (3,321 characters)
+
+
+B) Files description:
+
+In all files the field separator is " +++$+++ "
+
+- movie_titles_metadata.txt
+	- contains information about each movie title
+	- fields: 
+		- movieID, 
+		- movie title,
+		- movie year, 
+	   	- IMDB rating,
+		- no. IMDB votes,
+ 		- genres in the format ['genre1','genre2',...,'genreN']
+
+- movie_characters_metadata.txt
+	- contains information about each movie character
+	- fields:
+		- characterID
+		- character name
+		- movieID
+		- movie title
+		- gender ("?" for unlabeled cases)
+		- position in credits ("?" for unlabeled cases) 
+
+- movie_lines.txt
+	- contains the actual text of each utterance
+	- fields:
+		- lineID
+		- characterID (who uttered this phrase)
+		- movieID
+		- character name
+		- text of the utterance
+
+- movie_conversations.txt
+	- the structure of the conversations
+	- fields
+		- characterID of the first character involved in the conversation
+		- characterID of the second character involved in the conversation
+		- movieID of the movie in which the conversation occurred
+		- list of the utterances that make the conversation, in chronological 
+			order: ['lineID1','lineID2',...,'lineIDN']
+			has to be matched with movie_lines.txt to reconstruct the actual content
+
+- raw_script_urls.txt
+	- the urls from which the raw sources were retrieved
+
+C) Details on the collection procedure:
+
+We started from raw publicly available movie scripts (sources acknowledged in 
+raw_script_urls.txt).  In order to collect the metadata necessary for this study 
+and to distinguish between two script versions of the same movie, we automatically
+ matched each script with an entry in movie database provided by IMDB (The Internet
+ Movie Database; data interfaces available at http://www.imdb.com/interfaces). Some
+ amount of manual correction was also involved. When more than one movie with the same
+ title was found in IMDB, the match was made with the most popular title 
+(the one that received most IMDB votes)  
+
+After discarding all movies that could not be matched or that had less than 5 IMDB 
+votes, we were left with 617 unique titles with metadata including genre, release 
+year, IMDB rating and no. of IMDB votes and cast distribution.  We then identified 
+the pairs of characters that interact and separated their conversations automatically 
+using simple data processing heuristics. After discarding all pairs that exchanged 
+less than 5 conversational exchanges there were 10,292 left, exchanging 220,579 
+conversational exchanges (304,713 utterances).  After automatically matching the names 
+of the 9,035 involved characters to the list of cast distribution, we used the 
+gender of each interpreting actor to infer the fictional gender of a subset of 
+3,321 movie characters (we raised the number of gendered characters to 3,774 through
+ manual annotation). Similarly, we collected the end credit position of a subset 
+of 3,321 characters as a proxy for their status.
+
+
+D) Contact:
+
+Please email any questions to: cristian@cs.cornell.edu (Cristian Danescu-Niculescu-Mizil)