tmthyjames · tmthyjames · Oct 13, 2017 · Oct 13, 2017 · Oct 13, 2017 · Oct 21, 2017
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,13 @@
+# general things to ignore
+build/
+dist/
+*.egg-info/
+*.egg
+*.py[cod]
+__pycache__/
+*.so
+*~
+
+# due to using tox and pytest
+.tox
+.cache
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,22 @@
+# this file is *not* meant to cover or endorse the use of travis, but rather to
+# help confirm pull requests to this project.
+
+language: python
+
+matrix:
+  include:
+    - python: 2.7
+      env: TOXENV=py27
+    - python: 3.4
+      env: TOXENV=py34
+    - python: 3.5
+      env: TOXENV=py35
+    - python: 3.6
+      env: TOXENV=py36
+
+install: pip install tox
+
+script: tox
+
+notifications:
+  email: false
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -0,0 +1,19 @@
+Copyright (c) 2018 Timothy James Dobbins
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,5 @@
+# Include the license file
+include LICENSE.txt
+
+# Include the data files
+recursive-include data *
diff --git a/README.md b/README.md
@@ -1,2 +1,103 @@
-# cypher
-You shall see.
+# Lyrics Wikia has been shut down and this project has been deprecated. 
+
+Welcome to the Cypher. The introductory blog post is [here](https://tmthyjames.github.io/2018/january/Cypher/).
+
+Here are a couple of other blog posts that use Cypher to get data: [Analyzing Rap Lyrics Using Word Vectors](https://tmthyjames.github.io/2018/january/Analyzing-Rap-Lyrics-Using-Word-Vectors/) and [Using Lyrics to Predict Genre](https://tmthyjames.github.io/2018/february/Predicting-Musical-Genres/).
+
+=======================
+
+Easily get music lyrics
+
+
+To install, use `pip`:
+
+`pip install thecypher`
+
+Example:
+
+```python
+>>> import thecypher as cy
+>>> coasts = cy.get_lyrics('coasts')
+>>> coasts[0]
+{'album': 'Coasts (2016)',
+ 'artist': 'coasts',
+ 'genre': 'Indie_Pop',
+ 'id': 0,
+ 'lyric': 'We fell in love',
+ 'song': 'Oceans',
+ 'year': '2016'}
+```
+
+Convert it to a pandas DataFrame like so:
+
+```python
+>>> import pandas as pd
+>>> coasts_df = pd.DataFrame(coasts)
+>>> coasts_df.head()
+```
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right">
+      <th></th>
+      <th>album</th>
+      <th>artist</th>
+      <th>genre</th>
+      <th>id</th>
+      <th>lyric</th>
+      <th>song</th>
+      <th>year</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>Coasts (2016)</td>
+      <td>coasts</td>
+      <td>Indie_Pop</td>
+      <td>0</td>
+      <td>We fell in love</td>
+      <td>Oceans</td>
+      <td>2016</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>Coasts (2016)</td>
+      <td>coasts</td>
+      <td>Indie_Pop</td>
+      <td>1</td>
+      <td>Right by the ocean</td>
+      <td>Oceans</td>
+      <td>2016</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>Coasts (2016)</td>
+      <td>coasts</td>
+      <td>Indie_Pop</td>
+      <td>2</td>
+      <td>Made all our plans</td>
+      <td>Oceans</td>
+      <td>2016</td>
+    </tr>
+    <tr>
+      <th>3</th>
+      <td>Coasts (2016)</td>
+      <td>coasts</td>
+      <td>Indie_Pop</td>
+      <td>3</td>
+      <td>Down on the sand</td>
+      <td>Oceans</td>
+      <td>2016</td>
+    </tr>
+    <tr>
+      <th>4</th>
+      <td>Coasts (2016)</td>
+      <td>coasts</td>
+      <td>Indie_Pop</td>
+      <td>4</td>
+      <td>And from the tips of your fingers</td>
+      <td>Oceans</td>
+      <td>2016</td>
+    </tr>
+  </tbody>
+</table>
diff --git a/build/lib/thecypher/__init__.py b/build/lib/thecypher/__init__.py
@@ -0,0 +1,120 @@
+import re
+import requests
+from bs4 import BeautifulSoup as BS
+from bs4.element import NavigableString
+import logging
+import random
+import six
+
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+
+
+def uniqueid():
+    seed = random.getrandbits(1)
+    while True:
+        yield seed
+        seed += 1
+
+
+# Single module-level id generator: get_lyrics() pulls the 'id' of every
+# lyric row from it, so ids keep increasing across successive calls.
+uid = uniqueid()
+
+
+def get_lyrics(artist, get_album_genre=False):
+
+    """
+    Example:
+        lyrics = get_lyrics('madonna')
+    """
+
+    base_url = 'http://lyrics.wikia.com'
+    url_ext = '/wiki/'
+
+    search_url = base_url + url_ext + 'Special:Search?query=' + artist.replace(' ', '+')
+    search_resp = requests.get(search_url)
+    soup = BS(search_resp.content)
+    results = soup.find_all('a', {'class': 'result-link'})
+
+    artist_url = results[0].attrs['href']
+
+    logging.info('GET Artist URL: ' + artist_url)
+
+    req = requests.get(artist_url)
+    resp = BS(req.content, 'html')
+
+    genre = ''
+    genre_tag = resp.find_all('table', {'class': 'artist-info-box'})
+    if genre_tag:
+        for atag in resp.find_all('table', {'class': 'artist-info-box'})[0].find_all('a'):
+            if 'Category:Genre' in atag.attrs['href']:
+                genre += ('|' if genre else genre) + atag.text
+
+    albums = {}
+    nodes = resp.find_all('div', {'id': 'mw-content-text'})[0].find_all()
+    for node in nodes:
+        if node.name == 'h2':
+            if node.find_all('span'):
+                a_tag = node.find_all('a')
+                album_url_span = node.findChild('span', {'class': 'mw-headline'})
+                album_url_a = album_url_span if not album_url_span else album_url_span.findChild('a')
+                album_url = album_url_a if not album_url_a else album_url_a.attrs.get('href')
+                title = 'Misc (0000)' if not a_tag else a_tag[0].text
+                year_search = re.search('([0-9]{4})', title)
+                album_year = None if not year_search else year_search.group(0)
+                albums[title] = {}
+                albums[title]['year'] = album_year
+                albums[title]['album_url'] = None if not album_url else (base_url + album_url)
+        if node.name == 'ol':
+            for song in node:
+                track_a = song.find_all('a')
+                if not track_a:
+                    continue
+                track_node = track_a[0]
+                track_name = track_node.text
+                track_href = track_node.get('href')
+                if 'tracks' not in albums[title]:
+                    albums[title]['tracks'] = {}
+                albums[title]['tracks'][track_name] = track_href
+
+    lyrics_obj = []
+    try:
+        album_keys = albums.keys()
+        for album in album_keys:
+            album_url = albums[album]['album_url']
+            album_genre = ''
+            if get_album_genre and album_url:
+
+                album_req = requests.get(album_url)
+                album_soup = BS(album_req.content)
+                genre_tag = album_soup.find_all('div', {'id': 'mw-content-text'})
+                if genre_tag:
+                    for atag in album_soup.findChild('div', {'id': 'mw-content-text'}).find_all('a'):
+                        if 'Category:Genre' in atag.attrs['href']:
+                            album_genre += ('|' if album_genre else album_genre) + atag.text
+
+            logging.info('GET Artist Album: ' + album)
+            track_keys = albums[album].get('tracks')
+            if track_keys:
+                year = albums[album]['year']
+                for song in track_keys:
+                    resp = requests.get(base_url + track_keys[song])
+                    lyric_soup = BS(resp.content)
+                    lyrics_div = lyric_soup.find_all('div', {'class': 'lyricbox'})
+                    lyrics_div = None if not lyrics_div else lyrics_div[0]
+                    if lyrics_div:
+                        for lyric in lyrics_div.childGenerator():
+                            if isinstance(lyric, NavigableString) and lyric.strip():
+                                lyric_dict = {
+                                    'artist': artist,
+                                    'lyric': lyric,
+                                    'song': song,
+                                    'year': year,
+                                    'album': album,
+                                    'id': six.next(uid),
+                                    'genre': genre,
+                                    'album_genre': album_genre
+                                }
+                                lyrics_obj.append(lyric_dict)
+            logging.info('GET Artist Album Successful: ' + album)
+        return lyrics_obj
+    except KeyboardInterrupt:
+        return lyrics_obj
diff --git a/build/lib/thecypher/package_data.dat b/build/lib/thecypher/package_data.dat
@@ -0,0 +1 @@
+some data
diff --git a/data/data_file b/data/data_file
@@ -0,0 +1 @@
+some data
diff --git a/data/lyrics_test.csv.zip b/data/lyrics_test.csv.zip
diff --git a/data/lyrics_train.csv.zip b/data/lyrics_train.csv.zip
diff --git a/data/lyrics_whole.csv.zip b/data/lyrics_whole.csv.zip
diff --git a/dist/thecypher-1.2.0-py2.py3-none-any.whl b/dist/thecypher-1.2.0-py2.py3-none-any.whl
diff --git a/dist/thecypher-1.2.0.tar.gz b/dist/thecypher-1.2.0.tar.gz