# justloopingaround-scripts/utils.py at master · 0ctagon/justloopingaround-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import math
import openpyxl
import numpy as np
import datetime
import pandas as pd


# Maps a three-letter English month abbreviation ("Jan" ... "Dec") to its
# zero-padded, two-character month number ("01" ... "12").
months_to_num = {
    abbreviation: f"{number:02d}"
    for number, abbreviation in enumerate(
        (
            "Jan",
            "Feb",
            "Mar",
            "Apr",
            "May",
            "Jun",
            "Jul",
            "Aug",
            "Sep",
            "Oct",
            "Nov",
            "Dec",
        ),
        start=1,
    )
}


class Song:
    """
    A song entry with normalised attributes derived from raw spreadsheet values.

    The instance attributes are assigned in a fixed order in ``__init__``;
    code that turns instances into rows with ``vars(song)`` gets its column
    order from that assignment order, so it must not be rearranged.

    Attributes:
    -----------
    songID : str
        Identifier derived from the song name (punctuation and spaces removed).
    rank : str
        The rank of the song, normalised by ``format_rank``.
    htmlID : str
        Identifier built from the live date and the media type.
    length : str
        The length of the song as text (e.g. "3'45").
    length_DT : datetime.timedelta
        The length of the song as a timedelta object.
    media : str
        The media type of the song ("YouTube", "Live" or "Twitch").
    date_str : str
        The "YYMMDD" date string extracted from the htmlID.
    date_DT : datetime.datetime
        The date of the song as a datetime object.
    date_YM_DT : datetime.datetime
        First day of the month of ``date_DT``.
    name : str
        The name of the song with double quotes removed.
    tempo : str
        Tempo label: "Slow", "Smed", "Med", "Fmed" or "Fast".
    comment : str
        The comment associated with the song, cleaned by ``format_comment``.
    choral : str
        The choral information of the song, "-" when absent.
    genre : str
        The genre of the song (stored unchanged).
    URL : str
        The URL of the song (stored unchanged).
    when_ranked_DT : datetime.datetime
        The date when the song was ranked as a datetime object.
    when_ranked_YM_DT : datetime.datetime
        First day of the month of ``when_ranked_DT``.
    live_title : str
        The live title of the song (stored unchanged).
    live_comment : str
        The live comment associated with the song, "-" when absent.
    """

    # Exact spreadsheet tempo wording -> short label used by the rest of the code.
    _TEMPO_LABELS = {
        "medium": "Med",
        "medium ": "Med",
        "medium fast": "Fmed",
        "slow fast": "Fmed",
        "medium slow": "Smed",
        "fast": "Fast",
        "fast af": "Fast",
        "fast fast": "Fast",
        "ultra fast": "Fast",
        "ULTRA FAST": "Fast",
        "jsp": "Fast",
        "slow": "Slow",
        "slow ": "Slow",
        "sloooow": "Slow",
        "hors du temps (14'30 en vrai)": "Slow",
        "out of time (14'30 in reality)": "Slow",
        "-": "Slow",
    }

    # Characters dropped from a song name to build its songID.
    _SONG_ID_STRIP = str.maketrans("", "", "- '().\",&:!?\\/")

    def __init__(
        self,
        name,
        media,
        live_date,
        rank,
        genre,
        tempo,
        length,
        comment,
        choral,
        URL,
        when_ranked,
        live_title,
        live_comment,
    ):
        # NOTE: keep this assignment order, it defines the order of vars(self).
        self.songID = self.get_songID(name)
        self.rank = self.format_rank(rank)
        self.htmlID = self.get_htmlID(live_date, media)
        self.length = self.format_length(length, self.songID)
        self.length_DT = self.get_length_DT(self.length)
        self.media = media
        self.date_str = self.get_date_str(self.htmlID, media)
        self.date_DT = self._parse_yymmdd(self.date_str)
        self.date_YM_DT = self.date_DT.replace(day=1)
        self.name = self.format_name(name)
        self.tempo = self.format_tempo(tempo)
        self.comment = self.format_comment(comment)
        self.choral = self.format_choral(choral)
        self.genre = genre
        self.URL = URL
        # when_ranked may be a number (e.g. 210105); only its first six
        # characters once converted to text are used.
        self.when_ranked_DT = self._parse_yymmdd(str(when_ranked))
        self.when_ranked_YM_DT = self.when_ranked_DT.replace(day=1)
        self.live_title = live_title
        self.live_comment = self.format_live_comment(live_comment)

    @staticmethod
    def _is_nan(value):
        """Return True when value is a float NaN (any NaN object, not only np.nan)."""
        return isinstance(value, float) and math.isnan(value)

    @staticmethod
    def _parse_yymmdd(text):
        """
        Builds a datetime from the first six characters of a "YYMMDD" string.

        The two-digit year is prefixed with "20".

        Parameters:
        text (str): Text starting with a "YYMMDD" date.

        Returns:
        datetime.datetime: The corresponding date at midnight.
        """
        return datetime.datetime(
            year=int("20" + text[0:2]),
            month=int(text[2:4]),
            day=int(text[4:6]),
        )

    def get_htmlID(self, live_date, media):
        """
        Generates the htmlID from the live date and media type.

        For "YouTube" and "Live" the first three characters of live_date are
        looked up in months_to_num, characters 4-5 are read as the day (a
        one-digit day followed by a comma is zero-padded) and the last two
        characters are used as the year; "Live" IDs get a trailing "l".
        For "Twitch" the live date is used as is, prefixed with "s".

        Parameters:
        live_date (str): The live date of the song.
        media (str): The media type of the song (e.g., YouTube, Live, Twitch).

        Returns:
        str: The generated htmlID, or None for any other media value.
        """
        if media in ("YouTube", "Live"):
            day_part = live_date[4:6]
            day = f"0{live_date[4]}" if "," in day_part else day_part
            html_id = f"s{live_date[-2:]}{months_to_num[live_date[:3]]}{day}"
            return html_id if media == "YouTube" else html_id + "l"
        if media == "Twitch":
            return f"s{live_date}"
        return None

    def get_date_str(self, htmlID, media):
        """
        Extracts the date string from the htmlID based on the media type.

        The leading "s" is always dropped; for "Live" and "Twitch" the last
        character is dropped as well.

        Parameters:
        htmlID (str): The htmlID of the song.
        media (str): The media type of the song (e.g., YouTube, Live, Twitch).

        Returns:
        str: The extracted date string, or None for any other media value.
        """
        if media == "YouTube":
            return htmlID[1:]
        if media in ("Live", "Twitch"):
            return htmlID[1:-1]
        return None

    def get_length_DT(self, length):
        """
        Converts the length of the song from a string to a timedelta object.

        Recognised forms have the apostrophe at index 1 or 2 ("M'SS" or
        "MM'SS"); at most two characters after the apostrophe are read as
        seconds. Anything else (missing value, non-string, text of three
        characters or fewer, no apostrophe at those positions) gives a zero
        duration.

        Parameters:
        length (str): The length of the song as a string.

        Returns:
        datetime.timedelta: The length of the song as a timedelta object.

        Raises:
        ValueError: If the minute or second part is not an integer.
        """
        zero = datetime.timedelta(minutes=0, seconds=0)
        if not isinstance(length, str) or len(length) <= 3:
            return zero

        if length[1] == "'":
            quote_at = 1
        elif length[2] == "'":
            quote_at = 2
        else:
            return zero

        # Slicing (instead of indexing) tolerates a single seconds digit such
        # as "12'3", which would otherwise run past the end of the string.
        return datetime.timedelta(
            minutes=int(length[:quote_at]),
            seconds=int(length[quote_at + 1 : quote_at + 3]),
        )

    def format_tempo(self, tempo):
        """
        Maps the spreadsheet tempo wording to a short label.

        When the text contains "into", only the part before " into " is used.

        Parameters:
        tempo (str): The tempo of the song.

        Returns:
        str: One of "Med", "Fmed", "Smed", "Fast" or "Slow".

        Raises:
        ValueError: If the tempo wording is not a known one.
        """
        if "into" in tempo:
            tempo = tempo.split(" into ")[0]

        try:
            return Song._TEMPO_LABELS[tempo]
        except KeyError:
            raise ValueError(f"Unknown tempo: {tempo}") from None

    def format_comment(self, comment):
        """
        Formats the comment associated with the song.

        Newlines and double quotes are removed and the single-character
        ellipsis is replaced by three dots.

        Parameters:
        comment (str): The comment associated with the song.

        Returns:
        str: The formatted comment.
        """
        for char in ("\n", '"'):
            comment = comment.replace(char, "")
        # str.replace returns a new string; its result has to be kept.
        return comment.replace("…", "...")

    def format_choral(self, choral):
        """
        Formats the choral information of the song.

        Parameters:
        choral (str): The choral information of the song.

        Returns:
        str: The text without newlines, or "-" when the value is not text
             (e.g. the NaN of an empty cell).
        """
        if not isinstance(choral, str):
            return "-"
        return choral.replace("\n", "")

    def format_name(self, name):
        """
        Formats the name of the song by removing double quotes.

        Parameters:
        name (str): The name of the song.

        Returns:
        str: The formatted name.
        """
        return name.replace('"', "")

    def get_songID(self, name):
        """
        Generates a songID from the name of the song.

        Spaces and the characters - ' ( ) . " , & : ! ? \\ / are removed; an
        "n" is prepended when the result starts with a digit.

        Parameters:
        name (str): The name of the song.

        Returns:
        str: The generated songID (empty if every character was removed).
        """
        song_id = name.translate(Song._SONG_ID_STRIP)
        # Slice rather than index so a fully stripped name does not raise.
        if song_id[:1].isdigit():
            song_id = "n" + song_id
        return song_id

    def format_rank(self, rank):
        """
        Formats the rank of the song.

        Every "-" becomes "I"; the resulting value "S I D" is reduced to "D".

        Parameters:
        rank (str): The rank of the song.

        Returns:
        str: The formatted rank.
        """
        rank = rank.replace("-", "I")
        return "D" if rank == "S I D" else rank

    def format_length(self, length, songID):
        """
        Formats the length of the song based on the songID.

        The song whose ID is "Dramaticevent" always gets the length "14'30";
        every other length is returned unchanged.

        Parameters:
        length (str): The length of the song as a string.
        songID (str): The unique identifier of the song.

        Returns:
        str: The formatted length.
        """
        if songID == "Dramaticevent":
            return "14'30"
        return length

    def format_live_comment(self, live_comment):
        """
        Formats the live comment associated with the song.

        Parameters:
        live_comment (str): The live comment associated with the song.

        Returns:
        str: The live comment unchanged, or "-" when it is NaN.
        """
        if Song._is_nan(live_comment):
            return "-"
        return live_comment


def _is_numeric_cell(value):
    """Return True when math.isnan accepts value: a real number, including the NaN used for empty cells."""
    try:
        math.isnan(value)
    except (TypeError, OverflowError):
        return False
    return True


def get_df_from_xls(xls_file, media=None, EN=False):
    """
    Extracts data from an Excel file and returns it as a pandas DataFrame.

    Rows 5 to 1500 and columns 1 to 10 of the selected sheet are read. Column
    positions (0-based) are: 0 title or date, 1 live comment, 2 song name
    (with hyperlink), 3 rank, 4 genre, 5 tempo, 6 length, 7 comment, 8 choral,
    9 "when ranked". Rows without a song name are skipped. Songs whose
    formatted name is "DELETETHISSONG" are dropped from the result.

    Parameters:
    xls_file (str): The path to the Excel file.
    media (str): The media type of the songs (e.g., YouTube, Live, Twitch).
                 When None it is inferred from the file name: "yt" -> YouTube,
                 "tw" -> Twitch, "live" -> Live (checked in that order).
    EN (bool): A flag to determine which sheet to read from the workbook.
               If False, reads from "Sheet1" (French). If True, reads from "Sheet2" (English).

    Returns:
    pd.DataFrame: A DataFrame with one row per song and one column per Song attribute.

    Raises:
    ValueError: If media is given but unknown, or cannot be inferred from the file name.
    """
    if media is None:
        file_name = str(xls_file)
        if "yt" in file_name:
            media = "YouTube"
        elif "tw" in file_name:
            media = "Twitch"
        elif "live" in file_name:
            media = "Live"
        else:
            # Fail early with a clear message instead of later, inside Song,
            # with an unrelated TypeError.
            raise ValueError(
                "media could not be inferred from the file name; "
                "media must be one of 'YouTube', 'Twitch', or 'Live'"
            )
    elif media not in ["YouTube", "Twitch", "Live"]:
        raise ValueError("media must be one of 'YouTube', 'Twitch', or 'Live'")

    # Load the workbook and select the appropriate sheet
    wrkbk = openpyxl.load_workbook(xls_file)
    sh = wrkbk["Sheet2"] if EN else wrkbk["Sheet1"]

    line_start = 5
    song_list = []
    # True right after a title row was consumed: the next row carrying a value
    # in column 0 is then the date row of the same live, not a new title.
    date_set = False

    # Iterate through the rows of the sheet; n_line is the 1-based sheet row.
    for n_line, row in enumerate(
        sh.iter_rows(min_row=line_start, min_col=1, max_row=1500, max_col=10),
        line_start,
    ):
        # Empty cells are represented by NaN from here on.
        line = [np.nan if cell.value is None else cell.value for cell in row]

        if row[2].value is not None:
            # NOTE(review): raises AttributeError when the song cell has no
            # hyperlink - confirm every named song is expected to carry one.
            song_URL = row[2].hyperlink.target

        title_or_date = line[0]
        song_name = line[2]
        # Purely numeric song names come back as int; bool is deliberately
        # left alone (it is an int subclass).
        if isinstance(song_name, int) and not isinstance(song_name, bool):
            song_name = str(song_name)
        song_rank = line[3]
        song_genre = line[4]
        song_tempo = line[5]
        song_length = line[6]
        song_comment = line[7]
        song_choral = line[8]

        # Skip rows without a song name (any numeric content, NaN included,
        # is treated as "no name").
        if _is_numeric_cell(song_name):
            continue

        # A non-numeric value in column 0 is either a live title or its date.
        if not _is_numeric_cell(title_or_date):
            if date_set:
                date_set = False
            else:
                live_title = line[0]
                live_comment = line[1]
                # The date is read from column 1 of the row below the title.
                song_date = sh.cell(n_line + 1, 1).value
                song_when_ranked = line[9]
                if math.isnan(song_when_ranked):
                    song_when_ranked = 990101
                date_set = True

        # A double quote in the choral cell stands for the content of the
        # choral cell of the row above.
        if not _is_numeric_cell(song_choral) and '"' in song_choral:
            song_choral = song_choral.replace('"', sh.cell(n_line - 1, 9).value)

        # Append the song information to the list
        song_list.append(
            Song(
                song_name,
                media,
                song_date,
                song_rank,
                song_genre,
                song_tempo,
                song_length,
                song_comment,
                song_choral,
                song_URL,
                song_when_ranked,
                live_title,
                live_comment,
            )
        )

    # Build a new list rather than removing while iterating, which would skip
    # the element following each removed one.
    kept_songs = [song for song in song_list if song.name != "DELETETHISSONG"]

    # Convert the list of Song objects to a DataFrame
    return pd.DataFrame([vars(song) for song in kept_songs])


def get_unique_songID(df):
    """
    Appends an occurrence number to repeated songIDs.

    The first occurrence of a songID is kept as is; the n-th occurrence
    (n >= 2) becomes "<songID><n>". The DataFrame is modified in place and
    also returned. Progress is reported on standard output.

    Parameters:
    df (pd.DataFrame): The DataFrame containing song data with a "songID" column.

    Returns:
    pd.DataFrame: The same DataFrame with the renamed songIDs.
    """
    print("\nChecking songIDs duplicates")
    # Running count per original songID (one dict lookup per row instead of
    # re-counting a growing list).
    seen = {}
    # Iterate over a snapshot of the column so in-place writes cannot affect
    # the values still to be visited.
    for index, songID in zip(df.index, df["songID"].tolist()):
        occurrences = seen.get(songID, 0) + 1
        seen[songID] = occurrences
        if occurrences > 1:
            print(f"\tMultiple {songID} ({occurrences})")
            df.at[index, "songID"] = songID + str(occurrences)
    print("Done")
    return df


def get_unique_htmlID(df):
    """
    Appends a letter to htmlIDs shared by several live titles.

    The distinct (live_title, htmlID) pairs are visited in the sorted order
    produced by groupby. The first live title seen for an htmlID keeps it;
    the second one gets the suffix "B", the third "C", and so on. Only the
    rows belonging to the renamed (live_title, htmlID) pair are updated.
    The DataFrame is modified in place and also returned. Progress is
    reported on standard output.

    Parameters:
    df (pd.DataFrame): The DataFrame with "live_title" and "htmlID" columns.

    Returns:
    pd.DataFrame: The same DataFrame with the renamed htmlIDs.
    """
    print("\nChecking htmlIDs duplicates")
    df_gb = (
        df[["live_title", "htmlID"]]
        .groupby(["live_title", "htmlID"])
        .size()
        .reset_index(name="count")
    )

    seen = {}  # htmlID -> number of live titles met so far with that htmlID
    renamed = {}  # (live_title, original htmlID) -> new htmlID
    for live_title, htmlID in zip(
        df_gb["live_title"].tolist(), df_gb["htmlID"].tolist()
    ):
        occurrences = seen.get(htmlID, 0) + 1
        seen[htmlID] = occurrences
        if occurrences > 1:
            print(f"\tMultiple {htmlID} for {live_title} ({occurrences})")
            # chr(64 + 2) == "B": the second live title gets "B", then "C", ...
            renamed[(live_title, htmlID)] = htmlID + chr(64 + occurrences)

    if renamed:
        # Key on both title and original htmlID so that a title owning several
        # htmlIDs only has the duplicated one rewritten.
        for index, live_title, htmlID in zip(
            df.index, df["live_title"].tolist(), df["htmlID"].tolist()
        ):
            new_htmlID = renamed.get((live_title, htmlID))
            if new_htmlID is not None:
                df.at[index, "htmlID"] = new_htmlID
    print("Done")
    return df


def get_unique_IDs(df):
    """
    Runs both de-duplication passes on the DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame containing song data with "songID" and "htmlID" columns.

    Returns:
    pd.DataFrame: The result of get_unique_songID followed by get_unique_htmlID.
    """
    # songIDs are handled first, htmlIDs second.
    return get_unique_htmlID(get_unique_songID(df))


def _format_hms(duration):
    """Format a timedelta as zero-padded HH:MM:SS (fractions dropped, hours not wrapped at 24)."""
    total_seconds = int(duration.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def print_simple_stats(df):
    """
    Prints simple statistics about the songs in the DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame containing song data. The columns read
                       are "rank", "htmlID", "live_title", "length_DT" and "name".

    Prints:
    The count of songs by rank, number of live streams, total music length,
    average music time per stream, longest music time for a stream, average song length, longest song length,
    average number of songs per stream, highest number of songs for a stream, and estimated live streams and songs left to rank.
    For an empty DataFrame only the header and a short notice are printed.
    """
    print("\n_______ Simple stats _______")

    if len(df) == 0:
        # Every statistic below divides by the number of songs or streams.
        print("\nNo songs to report.")
        return

    # Print song counts by rank
    ranks = ["S", "A+", "A", "B+", "B", "C+", "C", "D", "I"]
    print(f"\nAny rank : ({len(df)})")
    for rank in ranks:
        # Plain boolean mask: avoids a nested f-string with backslashes,
        # which is a syntax error before Python 3.12.
        rank_count = int((df["rank"] == rank).sum())
        print(f"{rank:>2} : ({rank_count})")

    # Per-stream aggregates; a stream is identified by its htmlID, while the
    # displayed names come from the grouping by live_title.
    length_by_htmlID = df.groupby("htmlID")["length_DT"].sum()
    length_by_title = df.groupby("live_title")["length_DT"].sum()
    songs_by_htmlID = df.groupby("htmlID").size()
    songs_by_title = df.groupby("live_title").size()

    # Print live stream statistics
    num_streams = len(length_by_htmlID)
    total_length = df["length_DT"].sum()
    avg_length_per_stream = total_length / num_streams
    longest_stream_length = length_by_htmlID.max()
    longest_stream_title = length_by_title.idxmax()
    longest_stream_htmlID = length_by_htmlID.idxmax()

    print(f"\nNumber of Live Streams done: {num_streams} streams")
    print(f"Total music ranked: {len(df)} songs")
    print(f"Total music length ranked: {str(total_length)}\n")
    print(f"Average music time per streams: {_format_hms(avg_length_per_stream)}")
    print(
        f"Longest music time for a stream: {_format_hms(longest_stream_length)}, {longest_stream_title}"
    )
    # "s201209" is the hard-coded htmlID the longest stream is compared with.
    if longest_stream_htmlID != "s201209":
        print("NEW RECORD!\n")
    else:
        print()

    # Print song length statistics
    avg_song_length = total_length / len(df)
    longest_song_length = df["length_DT"].max()
    longest_song_name = df.loc[df["length_DT"].idxmax(), "name"]

    print(f"Average song length: {_format_hms(avg_song_length)}")
    print(
        f"Longest song length: {_format_hms(longest_song_length)} with {longest_song_name}\n"
    )

    # Print song count per stream statistics
    avg_songs_per_stream = len(df) / num_streams
    max_songs_per_stream = songs_by_title.max()
    max_songs_stream_title = songs_by_title.idxmax()
    max_songs_stream_htmlID = songs_by_htmlID.idxmax()

    print(f"Average number of songs per stream: {avg_songs_per_stream:.2f} songs")
    print(
        f"Highest number of songs for a stream: {max_songs_per_stream} songs, {max_songs_stream_title}"
    )
    # Compare the stream holding the most songs (not the longest one) with
    # the hard-coded htmlID "s180211".
    if max_songs_stream_htmlID != "s180211":
        print("NEW RECORD!\n")
    else:
        print()

    # Print estimated remaining statistics
    # NOTE(review): 108 + 25 is presumably the total number of streams to
    # rank - confirm where these two figures come from.
    estimated_live_left = 108 + 25 - num_streams
    estimated_songs_left = estimated_live_left * avg_songs_per_stream
    estimated_length_left = estimated_live_left * avg_length_per_stream

    print(f"Number of Live Stream left to rank (estimation): {estimated_live_left}")
    print(f"Number of songs left to rank (estimation): {estimated_songs_left:.0f}")
    print(
        f"Total music length left to rank (estimation): {str(estimated_length_left)}\n"
    )