commit 7e9f2f586282ddf649a996b0d1167972f8216d36
parent eab9a8a834323863e823b0782c67f45f2f4ce12b
Author: AsherMorgan <59518073+AsherMorgan@users.noreply.github.com>
Date: Sun, 22 Mar 2020 19:00:51 -0700
Improve artist and song preprocessing.
Diffstat:
1 file changed, 22 insertions(+), 7 deletions(-)
diff --git a/Songs2Slides.py b/Songs2Slides.py
@@ -1,19 +1,34 @@
# Import dependencies
from bs4 import BeautifulSoup
import os
-import requests
from pptx import Presentation
from pptx.enum.text import PP_ALIGN
from pptx.util import Inches, Pt
+import requests
# Gets the lyrics
-def getLyrics(artist, song):
-    artist = artist.replace(" ", "-")
-    song = song.replace(" ", "-")
+def GetLyrics(artist, song):
+    """Return the lyrics text scraped from the Genius page for artist/song.
+
+    Both names are lowercased, whitespace-collapsed and mapped to URL slug
+    characters. Raises requests.HTTPError on a 4xx/5xx response and
+    LookupError when the page has no <div class="lyrics"> element.
+    """
+    # Per-character substitutions, applied in one translate() pass (no output contains a key).
+    slug = str.maketrans({" ": "-", "!": "", "@": "", "#": "", "$": "s", "%": "", "^": "-", "&": "and", "*": "", "(": "", ")": "", "+": "-", "=": "-", "'": "", "?": "", "/": "", "|": "", "\\": "", ".": "", ",": "", "á": "a", "é": "e", "í": "i", "ó": "o", "ñ": "n", "ú": "u"})
+    artist = ' '.join(artist.lower().split()).translate(slug)
+    song = ' '.join(song.lower().split()).translate(slug)
+
+    # Get lyrics
     page = requests.get("https://genius.com/{0}-{1}-lyrics".format(artist, song))
-    html = BeautifulSoup(page.text, 'html.parser')
-    lyrics = html.find('div', class_='lyrics').get_text()
+    page.raise_for_status()
+    block = BeautifulSoup(page.text, 'html.parser').find('div', class_='lyrics')
+    if block is None:
+        raise LookupError("No lyrics found for {0} - {1}".format(artist, song))
+    lyrics = block.get_text()
+
+    # Return lyrics
     return lyrics
@@ -81,7 +96,7 @@ if (__name__ == "__main__"):
# Get song lyrics
try:
- lyrics += ParseLyrics(getLyrics(artist, title))
+ lyrics += ParseLyrics(GetLyrics(artist, title))
lyrics += [""]
except:
print("We couldn't find the lyrics to that song.")