import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
Data Collection
In this notebook will collect all of the data required for this project including:
- Top ten singles by year from Wikipedia
- Matching song entities using the Spotify API with the Spotipy package
Collect Top Ten Singles by Year
The top ten singles are available from 1958 to 2022 using a predictable URL pattern.
= []
urls for year in range(1958, 2023):
f"https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_{year}") urls.append(
20] urls[
'https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1978'
= []
rows for url, year in zip(urls, range(1958, 2023)):
print(year)
= pd.read_html(url)
dfs
= []
row_num for df in dfs:
0])
row_num.append(df.shape[
= dfs[row_num.index(max(row_num))]
year_df = year_df.iloc[:,:6]
year_df
= ['entry_date','title','artist','peak','peak_date','weeks_top_ten']
columns = columns
year_df.columns
= year_df[~pd.to_numeric(year_df['peak'], errors='coerce').isna()].reset_index(drop=True)
year_df 'year'] = year
year_df[='records')) rows.extend(year_df.to_dict(orient
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
= pd.DataFrame(rows) df
20) df.sample(
entry_date | title | artist | peak | peak_date | weeks_top_ten | year | |
---|---|---|---|---|---|---|---|
3173 | September 22 | "Praying for Time" | George Michael | 1 | October 13 | 6 | 1990 |
4658 | August 1 | "The Hills"[G] (#10) | The Weeknd | 1 | October 3 | 21 | 2015 |
3880 | November 10 | "Only Time" | Enya | 10 | November 10 | 3 | 2001 |
850 | September 10 | "Bus Stop" | The Hollies | 5 | September 17 | 4 | 1966 |
3351 | August 29 | "Humpin' Around" | Bobby Brown | 3 | September 12 | 8 | 1992 |
4079 | July 2 | "Inside Your Heaven" | Carrie Underwood | 1 | July 2 | 4 | 2005 |
4027 | July 10 | "I Believe" | Fantasia | 1 | July 10 | 2 | 2004 |
1766 | September 13 | "Wasted Days and Wasted Nights" | Freddy Fender | 8 | September 27 | 3 | 1975 |
3235 | April 20 | "Here We Go (Let's Rock & Roll)" | C+C Music Factory | 3 | May 18 | 6 | 1991 |
1204 | February 14 | "Arizona" | Mark Lindsay | 10 | February 14 | 1 | 1970 |
281 | July 17 | "San Antonio Rose" | Floyd Cramer | 8 | July 17 | 1 | 1961 |
228 | December 31[1] | "Angel Baby" | Rosie and the Originals | 5 | January 23 | 7 | 1961 |
2867 | January 16 | "Could've Been" | Tiffany | 1 | February 6 | 6 | 1988 |
4877 | October 26 | "Bandit" | Juice Wrld and YoungBoy Never Broke Again | 10 | October 26 | 1 | 2019 |
2707 | August 9 | "Higher Love" | Steve Winwood | 1 | August 30 | 6 | 1986 |
4422 | December 11 | "Grenade" (#6) | Bruno Mars | 1 | January 8 | 17 | 2011 |
4170 | November 4 | "Come to Me" | Diddy featuring Nicole Scherzinger | 9 | November 4 | 1 | 2006 |
1288 | January 9 | "Lonely Days" | Bee Gees | 3 | January 30 | 7 | 1971 |
1592 | March 2 | "Dark Lady" | Cher | 1 | March 23 | 6 | 1974 |
862 | October 15 | "Walk Away Renée" | The Left Banke | 5 | October 29 | 4 | 1966 |
Find Matching Spotify Songs
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
= spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials()) spotify
Some of the song titles had backslashes in them, which caused issues with createing the URLs for the Spotify API. We will get rid of all slashes to solve this problem.
'title'] = df['title'].str.replace("/","", regex=False)
df['title'] = df['title'].str.replace("\\","", regex=False)
df['title'] = df['title'].str.replace("(","", regex=False)
df['title'] = df['title'].str.replace(")","", regex=False)
df['title'] = df['title'].str.replace("\'","", regex=False)
df['title'] = df['title'].str.replace(":","", regex=False)
df['title'] = df['title'].str.replace(".","", regex=False) df[
= df.to_dict(orient='records') list_df
0] list_df[
{'entry_date': 'August 4',
'title': '"Poor Little Fool"',
'artist': 'Ricky Nelson',
'peak': 1,
'peak_date': 'August 4',
'weeks_top_ten': 6,
'year': 1958}
for item in tqdm(list_df):
try:
= spotify.search(f"""track:{item['title']} artist:{item['artist']}""", type="track", limit=1)
result if len(result['tracks']['items']) > 0:
'spotify_uri'] = result['tracks']['items'][0]['uri']
item[else:
'spotify_uri'] = np.nan
item[except spotipy.client.SpotifyException as e:
'spotify_uri'] = str(e.http_status) + " - " + e.msg item[
72%|███████▏ | 3644/5050 [06:11<02:14, 10.44it/s]HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'track:"Been Around the World" "Its All About the Benjamins" artist:Puff Daddy featuring The Notorious B.I.G. and Mase', 'limit': 1, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.
78%|███████▊ | 3914/5050 [06:38<01:44, 10.85it/s]HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'track:"I Need a Girl Part Two" artist:P. Diddy featuring Tammy Ruggieri, Ginuwine, Loon, and Mario Winans', 'limit': 1, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.
78%|███████▊ | 3916/5050 [06:38<01:37, 11.61it/s]HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'track:"Down 4 U" artist:Irv Gotti presents The Inc. featuring Ja Rule, Ashanti, Charli Baltimore and Vita', 'limit': 1, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.
99%|█████████▉| 5013/5050 [08:27<00:03, 10.12it/s]HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'track:"We Dont Talk About Bruno" artist:Carolina Gaitán, Mauro Castillo, Adassa, Rhenzy Feliz, Diane Guerrero, Stephanie Beatriz and the Encanto cast', 'limit': 1, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.
100%|██████████| 5050/5050 [08:30<00:00, 9.89it/s]
Small cleaning
Not all of our songs had matches, let’s remove those.
= pd.DataFrame(list_df) df
= df.dropna() df
= df[~df['spotify_uri'].str.contains("Not found")] df
'songs.csv', index=False) df.to_csv(
Extract Song Features
= pd.read_csv("songs.csv") df
df.head()
entry_date | title | artist | peak | peak_date | weeks_top_ten | year | spotify_uri | |
---|---|---|---|---|---|---|---|---|
0 | August 4 | "Poor Little Fool" | Ricky Nelson | 1 | August 4 | 6 | 1958 | spotify:track:5ayybTSXNwcarDtxQKqvWX |
1 | August 4 | "Patricia" | Pérez Prado | 2 | August 4 | 6 | 1958 | spotify:track:2bwhOdCOLgQ8v6xStAqnju |
2 | August 4 | "Splish Splash" | Bobby Darin | 3 | August 4 | 3 | 1958 | spotify:track:40fD7ct05FvQHLdQTgJelG |
3 | August 4 | "Hard Headed Woman" | Elvis Presley | 4 | August 4 | 2 | 1958 | spotify:track:3SU1TXJtAsf8jCKdUeYy53 |
4 | August 4 | "When" | Kalin Twins | 5 | August 4 | 5 | 1958 | spotify:track:3HZJ9BLBpDya4p71VfXSWp |
= df.to_dict(orient='records') list_df
88]['spotify_uri'])[0] spotify.audio_features(list_df[
item
{'entry_date': 'July 2',
'title': '"Falling Back"',
'artist': 'Drake',
'peak': 7,
'peak_date': 'July 2',
'weeks_top_ten': '1*',
'year': 2022,
'spotify_uri': 'spotify:track:1vbn9fEyw1IYhqgZJdu9ZB'}
= []
new_list for item in tqdm(list_df):
= spotify.audio_features(item['spotify_uri'])[0]
feature_dict if feature_dict is not None:
= {**item, **feature_dict}
combo_dict else:
= item
combo_dict new_list.append(combo_dict)
100%|██████████| 4251/4251 [05:27<00:00, 12.97it/s]
= pd.DataFrame(new_list)
df df.head()
entry_date | title | artist | peak | peak_date | weeks_top_ten | year | spotify_uri | danceability | energy | ... | liveness | valence | tempo | type | id | uri | track_href | analysis_url | duration_ms | time_signature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | August 4 | "Poor Little Fool" | Ricky Nelson | 1 | August 4 | 6 | 1958 | spotify:track:5ayybTSXNwcarDtxQKqvWX | 0.474 | 0.338 | ... | 0.1300 | 0.810 | 154.596 | audio_features | 5ayybTSXNwcarDtxQKqvWX | spotify:track:5ayybTSXNwcarDtxQKqvWX | https://api.spotify.com/v1/tracks/5ayybTSXNwca... | https://api.spotify.com/v1/audio-analysis/5ayy... | 153933.0 | 4.0 |
1 | August 4 | "Patricia" | Pérez Prado | 2 | August 4 | 6 | 1958 | spotify:track:2bwhOdCOLgQ8v6xStAqnju | 0.699 | 0.715 | ... | 0.0704 | 0.810 | 137.373 | audio_features | 2bwhOdCOLgQ8v6xStAqnju | spotify:track:2bwhOdCOLgQ8v6xStAqnju | https://api.spotify.com/v1/tracks/2bwhOdCOLgQ8... | https://api.spotify.com/v1/audio-analysis/2bwh... | 140000.0 | 4.0 |
2 | August 4 | "Splish Splash" | Bobby Darin | 3 | August 4 | 3 | 1958 | spotify:track:40fD7ct05FvQHLdQTgJelG | 0.645 | 0.943 | ... | 0.3700 | 0.965 | 147.768 | audio_features | 40fD7ct05FvQHLdQTgJelG | spotify:track:40fD7ct05FvQHLdQTgJelG | https://api.spotify.com/v1/tracks/40fD7ct05FvQ... | https://api.spotify.com/v1/audio-analysis/40fD... | 131720.0 | 4.0 |
3 | August 4 | "Hard Headed Woman" | Elvis Presley | 4 | August 4 | 2 | 1958 | spotify:track:3SU1TXJtAsf8jCKdUeYy53 | 0.616 | 0.877 | ... | 0.1840 | 0.919 | 97.757 | audio_features | 3SU1TXJtAsf8jCKdUeYy53 | spotify:track:3SU1TXJtAsf8jCKdUeYy53 | https://api.spotify.com/v1/tracks/3SU1TXJtAsf8... | https://api.spotify.com/v1/audio-analysis/3SU1... | 114240.0 | 4.0 |
4 | August 4 | "When" | Kalin Twins | 5 | August 4 | 5 | 1958 | spotify:track:3HZJ9BLBpDya4p71VfXSWp | 0.666 | 0.468 | ... | 0.1190 | 0.946 | 93.018 | audio_features | 3HZJ9BLBpDya4p71VfXSWp | spotify:track:3HZJ9BLBpDya4p71VfXSWp | https://api.spotify.com/v1/tracks/3HZJ9BLBpDya... | https://api.spotify.com/v1/audio-analysis/3HZJ... | 146573.0 | 4.0 |
5 rows × 26 columns
'songs.csv', index=False) df.to_csv(