import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException, ElementNotSelectableException, TimeoutException, SessionNotCreatedException
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm.notebook import tqdm
import time
from datetime import datetime
Data Collection
This notebook scrapes Airbnb listings for several cities around the world. The goal is to better understand the fee structure of these listings. More cities can be added later on request.
# Take a single snapshot of "now" so year, month and next_month are always
# derived from the same instant (three separate now() calls could straddle a
# month or year boundary).
_now = datetime.now()
year = _now.year
month = _now.month
# Month following the current one, wrapping December (12) around to January (1).
next_month = (month % 12) + 1
# Chrome launch options shared by every browser session in this notebook.
options = Options()
options.add_argument("--headless=new")
options.add_argument("--disable-extensions")
options.add_argument("--no-sandbox")
# City slugs in the form Airbnb uses in its search URLs (/s/<slug>/homes).
cities = [
    'Dallas--TX--United-States',
    'Austin--TX--United-States',
    'Los-Angeles--United-States',
    'New-York-City--Manhattan--United-States',
    'San-Francisco--CA--United-States',
]
Airbnb only shows 15 pages of listings, with 20 listings per page. That is 300 listings per city without extra filtering or moving the map around. This seems like a good enough sample for our purposes, and I believe this will approximate a normal user experience when using Airbnb.
len(cities) * 300
1500
def extract_features(url):
    """Scrape the price breakdown and basic details from one Airbnb listing page.

    A fresh Chrome session (configured by the notebook-level ``options``) loads
    ``url`` and waits up to 10 seconds for the price element to appear; the
    rendered HTML is then parsed with BeautifulSoup. The browser is always
    closed before returning, whether or not scraping succeeded.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        A dict of scraped text values on success. ``weekly_discount`` and
        ``long_stay_discount`` (``''`` when the page has no such row),
        ``price_minus_fees``, ``title`` and ``url`` are always present;
        ``cleaning_fee``, ``service_fee``, ``guest``, ``bedrooms``, ``beds``
        and ``baths`` are present only when found on the page.
        ``None`` when the page times out, the browser session cannot be
        created, or any other error occurs while scraping.
    """
    # Discounts default to '' and are only overwritten by a matching row, so a
    # later, unrelated price row can no longer wipe out a discount that was found.
    features = {'weekly_discount': '', 'long_stay_discount': ''}
    # Label of a price row -> key it is stored under.
    fee_keys = {
        'Cleaning fee': 'cleaning_fee',
        'Service fee': 'service_fee',
        'Weekly discount': 'weekly_discount',
        'Long stay discount': 'long_stay_discount',
    }
    driver = None
    try:
        driver = Chrome(options=options)
        driver.get(url)
        wait = WebDriverWait(
            driver,
            timeout=10,
            poll_frequency=1,
            ignored_exceptions=[ElementNotVisibleException, ElementNotSelectableException],
        )

        # Wait for the price list to render.
        # This class is the price box; any element of the price area would do.
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, """span[class='_18x3iiu']""")))
        # Name the parser explicitly so results do not depend on which optional
        # parsers happen to be installed.
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Get price list.
        # This class should be the one where each price is listed; each row's
        # text is "<label>Show price breakdown<amount>".
        price_list = [
            div.text.split("Show price breakdown")
            for div in soup.find_all("div", {'class': '_tr4owt'})
        ]
        for item in price_list:
            key = fee_keys.get(item[0])
            if key is not None:
                features[key] = item[1]
        # The first row's amount is stored as the price before fees.
        features['price_minus_fees'] = price_list[0][1]

        # Basic info
        features['title'] = soup.find('h2').text

        # The first <ol> holds the "N guests · N bedrooms · ..." summary; the
        # leading word of each entry is kept (a count, or e.g. "Shared").
        for item in soup.find_all("ol")[0].text.split("· ·"):
            value = item.strip().split(" ")[0]
            if 'guest' in item:
                features['guest'] = value
            elif 'bedroom' in item:
                features['bedrooms'] = value
            elif ('1 bed' in item) or ('beds' in item):
                features['beds'] = value
            elif 'bath' in item:
                features['baths'] = value

        # Add url
        features['url'] = url
        return features
    # If the dates aren't available then it will time out.
    except TimeoutException:
        return None
    except SessionNotCreatedException:
        print("session error")
        return None
    # Exception (not a bare except) so Ctrl-C / kernel interrupt still stops a
    # long scraping run.
    except Exception as exc:
        print(f"unhandled exception: {exc!r}")
        return None
    finally:
        # Always release the browser; previously it leaked on every failure.
        if driver is not None:
            driver.quit()
Scrape flexible times
Scrape listing URLs for various cities
Looks good. Let’s scrape!
# Collect every listing link from each results page of each city's
# flexible-date ("one week") search. Duplicates are removed afterwards.
links = []
for city in tqdm(cities, position=0, desc="City"):
    url = f"https://www.airbnb.com/s/{city}/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&date_picker_type=flexible_dates&source=structured_search_input_header&search_type=autocomplete_click&pagination_search=true"
    # Load root city listings
    driver = Chrome(options=options)
    try:
        driver.get(url)
        wait = WebDriverWait(
            driver,
            timeout=10,
            poll_frequency=1,
            ignored_exceptions=[ElementNotVisibleException, ElementNotSelectableException],
        )
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, """nav[aria-label='Search results pagination']""")))

        # Determine number of pages in listings: the second-to-last link in the
        # last <nav> is read as the highest page number.
        pages = int(driver.find_elements(By.TAG_NAME, "nav")[-1].find_elements(By.TAG_NAME, "a")[-2].text)

        # Navigate through each successive page in the listings
        for page in range(pages):
            # Grab all links on page. We can filter them later.
            for a in driver.find_elements(By.TAG_NAME, "a"):
                href = a.get_property('href')
                if href and 'rooms' in href:
                    links.append({'city': city, 'url': href})

            # Go to next page and wait until it is loaded. Nothing is read
            # after the final page, so skip the click and the wait there.
            if page < pages - 1:
                driver.find_elements(By.TAG_NAME, "nav")[-1].find_elements(By.TAG_NAME, "a")[-1].click()
                time.sleep(10)
    finally:
        # One browser per city; close it even if the page failed to load.
        driver.quit()
City: 100%|██████████| 5/5 [14:12<00:00, 170.41s/it]
Let’s deduplicate and filter the link list, then write to CSV for use later. Because ain’t nobody got time for that.
# One row per scraped link, with columns 'city' and 'url'.
urls_flexible = pd.DataFrame(links)
urls_flexible.shape
(9442, 2)
# Drop repeated (city, url) rows and renumber the index from 0.
urls_flexible = urls_flexible.drop_duplicates().reset_index(drop=True)
urls_flexible
city | url | |
---|---|---|
0 | Dallas--TX--United-States | https://www.airbnb.com/rooms/11288912634451714... |
1 | Dallas--TX--United-States | https://www.airbnb.com/rooms/63351740544145347... |
2 | Dallas--TX--United-States | https://www.airbnb.com/rooms/15110568?adults=1... |
3 | Dallas--TX--United-States | https://www.airbnb.com/rooms/12759576?adults=1... |
4 | Dallas--TX--United-States | https://www.airbnb.com/rooms/11268542802470994... |
... | ... | ... |
1345 | San-Francisco--CA--United-States | https://www.airbnb.com/rooms/33457138?adults=1... |
1346 | San-Francisco--CA--United-States | https://www.airbnb.com/rooms/38884411?adults=1... |
1347 | San-Francisco--CA--United-States | https://www.airbnb.com/rooms/75576040384622287... |
1348 | San-Francisco--CA--United-States | https://www.airbnb.com/rooms/62383325230559388... |
1349 | San-Francisco--CA--United-States | https://www.airbnb.com/rooms/1098003?adults=1&... |
1350 rows × 2 columns
# Save the de-duplicated links so the slow search scrape need not be repeated.
urls_flexible.to_csv(f'urls_flexible_{month}_{year}.csv', index=False)
Scrape listing details
# Reload the saved links for this month/year.
urls_flexible = pd.read_csv(f'urls_flexible_{month}_{year}.csv')
urls_flexible.shape
(1350, 2)
from urllib.parse import parse_qs, urlsplit

# Scrape the details of every saved listing URL. Listings that could not be
# scraped (extract_features returned nothing) are skipped.
data = []
for _, row in tqdm(urls_flexible.iterrows(), total=urls_flexible.shape[0]):
    features = extract_features(row['url'])
    if features:
        features['location'] = row['city']
        # The stay dates are carried in the listing URL's query string. A
        # missing parameter is recorded as None instead of raising IndexError.
        query = parse_qs(urlsplit(row['url']).query)
        features['check_in'] = query.get('check_in', [None])[0]
        features['check_out'] = query.get('check_out', [None])[0]
        data.append(features)
# One row per successfully scraped listing.
df = pd.DataFrame(data)
df
weekly_discount | long_stay_discount | cleaning_fee | price_minus_fees | title | guest | bedrooms | beds | baths | url | location | check_in | check_out | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | $75 | $360 | Entire rental unit in Dallas, Texas, United St... | 4 | 1 | 1 | 1 | https://www.airbnb.com/rooms/11288912634451714... | Dallas--TX--United-States | 2024-05-19 | 2024-05-24 | ||
1 | $60 | $705 | Entire guest suite in Dallas, Texas, United St... | 2 | 1 | 1 | 1 | https://www.airbnb.com/rooms/63351740544145347... | Dallas--TX--United-States | 2024-06-25 | 2024-06-30 | ||
2 | $30 | $301 | Room in Dallas, Texas, United States | NaN | NaN | NaN | Dedicated | https://www.airbnb.com/rooms/15110568?adults=1... | Dallas--TX--United-States | 2024-06-26 | 2024-07-01 | ||
3 | $45 | $425 | Entire guesthouse in Dallas, Texas, United States | 2 | 1 | 1 | 1 | https://www.airbnb.com/rooms/12759576?adults=1... | Dallas--TX--United-States | 2024-07-07 | 2024-07-12 | ||
4 | $35 | $500 | Entire rental unit in Dallas, Texas, United St... | 4 | 1 | 1 | 1 | https://www.airbnb.com/rooms/11268542802470994... | Dallas--TX--United-States | 2024-05-19 | 2024-05-24 | ||
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1338 | $55 | $779 | Room in San Francisco, California, United States | NaN | NaN | NaN | Dedicated | https://www.airbnb.com/rooms/33457138?adults=1... | San-Francisco--CA--United-States | 2024-06-11 | 2024-06-16 | ||
1339 | $45 | $620 | Room in San Francisco, California, United States | NaN | 1 | NaN | Shared | https://www.airbnb.com/rooms/38884411?adults=1... | San-Francisco--CA--United-States | 2024-05-18 | 2024-05-23 | ||
1340 | $55 | $695 | Room in San Francisco, California, United States | NaN | NaN | 1 | Shared | https://www.airbnb.com/rooms/75576040384622287... | San-Francisco--CA--United-States | 2024-06-02 | 2024-06-07 | ||
1341 | $29 | $699 | Entire condo in San Francisco, California, Uni... | 3 | 1 | NaN | 1 | https://www.airbnb.com/rooms/62383325230559388... | San-Francisco--CA--United-States | 2024-12-17 | 2024-12-22 | ||
1342 | $170 | $3,500 | Entire home in San Francisco, California, Unit... | 6 | 3 | 4 | 3 | https://www.airbnb.com/rooms/1098003?adults=1&... | San-Francisco--CA--United-States | 2024-06-16 | 2024-06-21 |
1343 rows × 13 columns
df.shape
(1343, 13)
# Save the flexible-date listing details for this month/year.
df.to_csv(f"flexible_listings_{month}_{year}.csv", index=False)
Scrape one night listings
Now we will do the same thing, but for a one-night stay.
# Collect every listing link from each results page of each city's search for
# a fixed one-night stay (the 2nd to the 3rd of next month).
# Dates are built as zero-padded YYYY-MM-DD and roll over into the following
# year when next month is January; previously the year was hard-coded to 2024
# and the month was not padded (e.g. "2024-6-02").
# NOTE(review): the URL still carries flexible_trip_lengths=one_week and
# price_filter_num_nights=5 — confirm these do not override the one-night dates.
stay_year = year + 1 if next_month == 1 else year
check_in = f'{stay_year}-{next_month:02d}-02'
check_out = f'{stay_year}-{next_month:02d}-03'

links = []
for city in tqdm(cities, position=0, desc="City"):
    url = f"https://www.airbnb.com/s/{city}/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&date_picker_type=calendar&checkin={check_in}&checkout={check_out}"
    # Load root city listings
    driver = Chrome(options=options)
    try:
        driver.get(url)
        wait = WebDriverWait(
            driver,
            timeout=10,
            poll_frequency=1,
            ignored_exceptions=[ElementNotVisibleException, ElementNotSelectableException],
        )
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, """nav[aria-label='Search results pagination']""")))

        # Determine number of pages in listings: the second-to-last link in the
        # last <nav> is read as the highest page number.
        pages = int(driver.find_elements(By.TAG_NAME, "nav")[-1].find_elements(By.TAG_NAME, "a")[-2].text)

        # Navigate through each successive page in the listings
        for page in range(pages):
            # Grab all links on page. We can filter them later.
            for a in driver.find_elements(By.TAG_NAME, "a"):
                href = a.get_property('href')
                if href and 'rooms' in href:
                    links.append({'city': city, 'url': href})

            # Go to next page and wait until it is loaded. Nothing is read
            # after the final page, so skip the click and the wait there.
            if page < pages - 1:
                driver.find_elements(By.TAG_NAME, "nav")[-1].find_elements(By.TAG_NAME, "a")[-1].click()
                time.sleep(10)
    finally:
        # One browser per city; close it even if the page failed to load.
        driver.quit()
# One row per scraped link, with columns 'city' and 'url'.
urls_one_day = pd.DataFrame(links)
urls_one_day.shape
(9475, 2)
# Drop repeated (city, url) rows and renumber the index from 0.
urls_one_day = urls_one_day.drop_duplicates().reset_index(drop=True)
# Show a random sample of 10 links as a sanity check.
urls_one_day.sample(10)
city | url | |
---|---|---|
410 | Austin--TX--United-States | https://www.airbnb.com/rooms/26293461?adults=1... |
720 | Los-Angeles--United-States | https://www.airbnb.com/rooms/38905402?adults=1... |
826 | New-York-City--Manhattan--United-States | https://www.airbnb.com/rooms/4936254?adults=1&... |
810 | New-York-City--Manhattan--United-States | https://www.airbnb.com/rooms/5406041?adults=1&... |
180 | Dallas--TX--United-States | https://www.airbnb.com/rooms/43819359?adults=1... |
469 | Austin--TX--United-States | https://www.airbnb.com/rooms/43913939?adults=1... |
335 | Austin--TX--United-States | https://www.airbnb.com/rooms/19496423?adults=1... |
295 | Austin--TX--United-States | https://www.airbnb.com/rooms/71901894796619394... |
564 | Los-Angeles--United-States | https://www.airbnb.com/rooms/35969924?adults=1... |
727 | Los-Angeles--United-States | https://www.airbnb.com/rooms/11355480565728754... |
# Save the de-duplicated links so the slow search scrape need not be repeated.
urls_one_day.to_csv(f'urls_one_day_{month}_{year}.csv', index=False)
Scrape listing details
# Reload the saved one-night links for this month/year.
urls_one_day = pd.read_csv(f'urls_one_day_{month}_{year}.csv')
from urllib.parse import parse_qs, urlsplit

# Scrape the details of every saved one-night listing URL. Listings that could
# not be scraped (extract_features returned nothing) are skipped.
data = []
for _, row in tqdm(urls_one_day.iterrows(), total=urls_one_day.shape[0]):
    features = extract_features(row['url'])
    if features:
        features['location'] = row['city']
        # The stay dates are carried in the listing URL's query string. A
        # missing parameter is recorded as None instead of raising IndexError.
        query = parse_qs(urlsplit(row['url']).query)
        features['check_in'] = query.get('check_in', [None])[0]
        features['check_out'] = query.get('check_out', [None])[0]
        data.append(features)
# One row per successfully scraped listing, saved for this month/year.
df = pd.DataFrame(data)
df.to_csv(f"one_day_listings_{month}_{year}.csv", index=False)
df.shape
(1323, 13)
df.head()
weekly_discount | long_stay_discount | cleaning_fee | price_minus_fees | title | guest | bedrooms | beds | baths | url | location | check_in | check_out | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | $45 | $425 | Entire guesthouse in Dallas, Texas, United States | 2 | 1 | 1 | 1 | https://www.airbnb.com/rooms/12759576?adults=1... | Dallas--TX--United-States | 2024-07-07 | 2024-07-12 | ||
1 | $65 | $905 | Entire cabin in Heath, Texas, United States | 3 | 1 | 2 | 1 | https://www.airbnb.com/rooms/10085991956732392... | Dallas--TX--United-States | 2024-06-16 | 2024-06-21 | ||
2 | $75 | $450 | Entire guesthouse in Dallas, Texas, United States | 2 | 1 | 2 | 1 | https://www.airbnb.com/rooms/46852636?adults=1... | Dallas--TX--United-States | 2024-05-19 | 2024-05-24 | ||
3 | $60 | $749 | Entire guest suite in Dallas, Texas, United St... | 2 | 1 | 1 | 1 | https://www.airbnb.com/rooms/63351740544145347... | Dallas--TX--United-States | 2024-06-25 | 2024-06-30 | ||
4 | $30 | $322 | Room in Dallas, Texas, United States | NaN | NaN | NaN | Dedicated | https://www.airbnb.com/rooms/15110568?adults=1... | Dallas--TX--United-States | 2024-05-14 | 2024-05-19 |