Data Collection

This notebook will scrape Airbnb listings for several cities around the world. The purpose is to understand the fee structure for these listings better. If other cities are requested, I can add them later.

import pandas as pd
import requests

from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException, ElementNotSelectableException, TimeoutException, SessionNotCreatedException
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from tqdm.notebook import tqdm
import time
from datetime import datetime
# Take a single timestamp so year / month / next_month always describe the
# same instant, even if the notebook starts right at a month boundary.
_now = datetime.now()
year = _now.year
month = _now.month
# Number (1-12) of the month after the current one; December wraps to 1.
# Only the month number is computed here -- the year is not advanced.
next_month = (month % 12) + 1

# Chrome options shared by every driver created in this notebook.
options = Options()
options.add_argument("--headless=new")
options.add_argument("--disable-extensions")
options.add_argument("--no-sandbox")

# City slugs substituted into the Airbnb search URL path (/s/<slug>/homes).
cities = [
    'Dallas--TX--United-States',
    'Austin--TX--United-States',
    'Los-Angeles--United-States',
    'New-York-City--Manhattan--United-States',
    'San-Francisco--CA--United-States',
]

Airbnb only shows 15 pages of listings, with 20 listings per page. That is 300 listings per city without extra filtering or moving the map around. This seems like a good enough sample for our purposes, and I believe this will approximate a normal user experience when using Airbnb.

# Upper bound on listings collected: 300 per city (15 pages x 20 listings).
len(cities) * 300
1500
def _parse_price_rows(price_rows):
    """Turn the price-breakdown rows of a listing into feature entries.

    Args:
        price_rows: list of ``[label, value]`` lists, one per price row, in
            page order. The first row is taken to be the base price.

    Returns:
        dict with ``weekly_discount`` and ``long_stay_discount`` (empty string
        when the listing has none), ``cleaning_fee`` / ``service_fee`` when
        present, and ``price_minus_fees``.

    Raises:
        IndexError: if there are no rows, or a needed row has no value part.
    """
    # Discounts default to '' exactly once. Resetting them inside the loop
    # would wipe a discount found on an earlier row.
    parsed = {'weekly_discount': '', 'long_stay_discount': ''}
    label_to_key = {
        'Cleaning fee': 'cleaning_fee',
        'Service fee': 'service_fee',
        'Weekly discount': 'weekly_discount',
        'Long stay discount': 'long_stay_discount',
    }
    for row in price_rows:
        key = label_to_key.get(row[0])
        if key is not None:
            parsed[key] = row[1]
    parsed['price_minus_fees'] = price_rows[0][1]
    return parsed


def _parse_capacity(summary_parts):
    """Extract guest / bedroom / bed / bath counts from the summary fragments.

    Each fragment is expected to look like ``"4 guests"``; the leading token
    is stored as-is (so ``"Shared bath"`` yields ``"Shared"``).
    """
    parsed = {}
    for part in summary_parts:
        leading_token = part.strip().split(" ")[0]
        if 'guest' in part:
            parsed['guest'] = leading_token
        elif 'bedroom' in part:
            parsed['bedrooms'] = leading_token
        elif ('1 bed' in part) or ('beds' in part):
            parsed['beds'] = leading_token
        elif 'bath' in part:
            parsed['baths'] = leading_token
    return parsed


def extract_features(url):
    """Scrape price and basic listing details from one Airbnb listing page.

    Opens the page in a fresh Chrome driver, waits for the price box to be
    present, and parses the rendered HTML.

    Args:
        url: full URL of the listing page.

    Returns:
        dict of scraped features (price rows, ``title``, capacity fields and
        ``url``), or ``None`` if the page timed out or scraping failed.
    """
    driver = None
    try:
        driver = Chrome(options=options)
        driver.get(url)
        wait = WebDriverWait(
            driver,
            timeout=10,
            poll_frequency=1,
            ignored_exceptions=[ElementNotVisibleException, ElementNotSelectableException],
        )

        # Wait for the price box before reading the page.
        # The class names below are copied from the live page and have to be
        # kept in sync with it; any element of the price box would do here.
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, """span[class='_18x3iiu']""")))
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Each price row's text is "<label>Show price breakdown<value>".
        price_rows = [
            node.text.split("Show price breakdown")
            for node in soup.find_all("div", {'class': '_tr4owt'})
        ]
        features = _parse_price_rows(price_rows)

        # Basic info
        features['title'] = soup.find('h2').text
        features.update(_parse_capacity(soup.find_all("ol")[0].text.split("·  ·")))

        # Add url
        features['url'] = url
        return features
    # If the dates aren't available then it will time out.
    except TimeoutException:
        return None
    except SessionNotCreatedException:
        print("session error")
    except Exception as exc:
        # Catch Exception, not everything, so Ctrl-C / kernel interrupt still
        # stops a long scraping loop. Report what failed instead of hiding it.
        print(f"unhandled exception for {url}: {exc!r}")
    finally:
        # Always release the browser; otherwise each failed listing leaves a
        # Chrome process behind.
        if driver is not None:
            driver.quit()
    return None

Scrape flexible times

Scrape listing URLs for various cities

Looks good. Let’s scrape!

# Collect every listing link shown in the flexible-date search results of
# each city. Duplicates are expected here and are removed afterwards.
links = []
for city in tqdm(cities, position=0, desc="City"):
    url = f"https://www.airbnb.com/s/{city}/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&date_picker_type=flexible_dates&source=structured_search_input_header&search_type=autocomplete_click&pagination_search=true"
    # Load root city listings
    driver = Chrome(options=options)
    try:
        driver.get(url)
        wait = WebDriverWait(driver, timeout=10, poll_frequency=1, ignored_exceptions=[ElementNotVisibleException, ElementNotSelectableException])
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, """nav[aria-label='Search results pagination']""")))

        # Determine number of pages in listings: the second-to-last link of
        # the last <nav> carries the highest page number.
        pagination_links = driver.find_elements(By.TAG_NAME, "nav")[-1].find_elements(By.TAG_NAME, "a")
        pages = int(pagination_links[-2].text)

        # Navigate through each successive page in the listings
        for page in range(pages):
            # Grab all links on page. We can filter them later.
            for a in driver.find_elements(By.TAG_NAME, "a"):
                # Read href once (each call is a browser round-trip) and
                # skip anchors that have none.
                href = a.get_property('href')
                if href and 'rooms' in href:
                    links.append({'city': city, 'url': href})

            # Go to next page and wait until it has loaded. Nothing follows
            # the last page, so skip the click and the 10 s pause there.
            if page < pages - 1:
                driver.find_elements(By.TAG_NAME, "nav")[-1].find_elements(By.TAG_NAME, "a")[-1].click()
                time.sleep(10)
    finally:
        # Close the browser for this city even if scraping failed, so Chrome
        # processes do not pile up.
        driver.quit()
City: 100%|██████████| 5/5 [14:12<00:00, 170.41s/it]

Let’s deduplicate and filter the link list, then write it to CSV so the slow link scrape does not have to be repeated later.

# Build a two-column (city, url) frame from the collected links and show its size.
urls_flexible = pd.DataFrame(links)
urls_flexible.shape
(9442, 2)
# Drop rows that repeat the same city/url pair, then renumber the index from 0.
urls_flexible = urls_flexible.drop_duplicates().reset_index(drop=True)
urls_flexible
city url
0 Dallas--TX--United-States https://www.airbnb.com/rooms/11288912634451714...
1 Dallas--TX--United-States https://www.airbnb.com/rooms/63351740544145347...
2 Dallas--TX--United-States https://www.airbnb.com/rooms/15110568?adults=1...
3 Dallas--TX--United-States https://www.airbnb.com/rooms/12759576?adults=1...
4 Dallas--TX--United-States https://www.airbnb.com/rooms/11268542802470994...
... ... ...
1345 San-Francisco--CA--United-States https://www.airbnb.com/rooms/33457138?adults=1...
1346 San-Francisco--CA--United-States https://www.airbnb.com/rooms/38884411?adults=1...
1347 San-Francisco--CA--United-States https://www.airbnb.com/rooms/75576040384622287...
1348 San-Francisco--CA--United-States https://www.airbnb.com/rooms/62383325230559388...
1349 San-Francisco--CA--United-States https://www.airbnb.com/rooms/1098003?adults=1&...

1350 rows × 2 columns

# Save the deduplicated links; the file name carries the scrape month and year.
urls_flexible.to_csv(f'urls_flexible_{month}_{year}.csv', index=False)

Scrape listing details

# Reload the saved links so this section can run without repeating the link scrape.
urls_flexible = pd.read_csv(f'urls_flexible_{month}_{year}.csv')
urls_flexible.shape
(1350, 2)
# Local import: this cell is the only user of the query-string helpers.
from urllib.parse import parse_qs, urlsplit

# Scrape the details of every saved listing URL. Listings for which
# extract_features returns nothing (timeout / failure) are skipped.
data = []
for row in tqdm(urls_flexible.itertuples(index=False), total=len(urls_flexible)):
    features = extract_features(row.url)
    if not features:
        continue
    # Read the stay dates from the URL's query string. A URL without these
    # parameters yields None instead of raising IndexError and aborting the
    # whole (long) scrape.
    query = parse_qs(urlsplit(row.url).query)
    features['location'] = row.city
    features['check_in'] = query.get('check_in', [None])[0]
    features['check_out'] = query.get('check_out', [None])[0]
    data.append(features)
df = pd.DataFrame(data)
df
weekly_discount long_stay_discount cleaning_fee price_minus_fees title guest bedrooms beds baths url location check_in check_out
0 $75 $360 Entire rental unit in Dallas, Texas, United St... 4 1 1 1 https://www.airbnb.com/rooms/11288912634451714... Dallas--TX--United-States 2024-05-19 2024-05-24
1 $60 $705 Entire guest suite in Dallas, Texas, United St... 2 1 1 1 https://www.airbnb.com/rooms/63351740544145347... Dallas--TX--United-States 2024-06-25 2024-06-30
2 $30 $301 Room in Dallas, Texas, United States NaN NaN NaN Dedicated https://www.airbnb.com/rooms/15110568?adults=1... Dallas--TX--United-States 2024-06-26 2024-07-01
3 $45 $425 Entire guesthouse in Dallas, Texas, United States 2 1 1 1 https://www.airbnb.com/rooms/12759576?adults=1... Dallas--TX--United-States 2024-07-07 2024-07-12
4 $35 $500 Entire rental unit in Dallas, Texas, United St... 4 1 1 1 https://www.airbnb.com/rooms/11268542802470994... Dallas--TX--United-States 2024-05-19 2024-05-24
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1338 $55 $779 Room in San Francisco, California, United States NaN NaN NaN Dedicated https://www.airbnb.com/rooms/33457138?adults=1... San-Francisco--CA--United-States 2024-06-11 2024-06-16
1339 $45 $620 Room in San Francisco, California, United States NaN 1 NaN Shared https://www.airbnb.com/rooms/38884411?adults=1... San-Francisco--CA--United-States 2024-05-18 2024-05-23
1340 $55 $695 Room in San Francisco, California, United States NaN NaN 1 Shared https://www.airbnb.com/rooms/75576040384622287... San-Francisco--CA--United-States 2024-06-02 2024-06-07
1341 $29 $699 Entire condo in San Francisco, California, Uni... 3 1 NaN 1 https://www.airbnb.com/rooms/62383325230559388... San-Francisco--CA--United-States 2024-12-17 2024-12-22
1342 $170 $3,500 Entire home in San Francisco, California, Unit... 6 3 4 3 https://www.airbnb.com/rooms/1098003?adults=1&... San-Francisco--CA--United-States 2024-06-16 2024-06-21

1343 rows × 13 columns

# Row / column count of the scraped listing details.
df.shape
(1343, 13)
# Save the flexible-date listing details; the file name carries the scrape month and year.
df.to_csv(f"flexible_listings_{month}_{year}.csv", index=False)

Scrape one night listings

Now we will repeat the same process, this time searching for a one-night stay.

# One-night stay on the 2nd of next month. The year follows the clock instead
# of being hard-coded, and rolls over when next month is January.
stay_year = year + 1 if next_month == 1 else year
# Zero-pad the month so the dates are valid ISO (YYYY-MM-DD) strings.
# NOTE(review): the saved one-day output shows five-night stays, which
# suggests the unpadded dates were not applied by the site -- confirm.
check_in = f'{stay_year}-{next_month:02d}-02'
check_out = f'{stay_year}-{next_month:02d}-03'

# Collect every listing link shown in the one-night search results of each
# city. Duplicates are expected here and are removed afterwards.
links = []
for city in tqdm(cities, position=0, desc="City"):
    # NOTE(review): the query still carries flexible_trip_lengths=one_week and
    # price_filter_num_nights=5 although this is a one-night search -- confirm
    # these are intended before changing them.
    url = f"https://www.airbnb.com/s/{city}/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&date_picker_type=calendar&checkin={check_in}&checkout={check_out}"
    # Load root city listings
    driver = Chrome(options=options)
    try:
        driver.get(url)
        wait = WebDriverWait(driver, timeout=10, poll_frequency=1, ignored_exceptions=[ElementNotVisibleException, ElementNotSelectableException])
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, """nav[aria-label='Search results pagination']""")))

        # Determine number of pages in listings: the second-to-last link of
        # the last <nav> carries the highest page number.
        pagination_links = driver.find_elements(By.TAG_NAME, "nav")[-1].find_elements(By.TAG_NAME, "a")
        pages = int(pagination_links[-2].text)

        # Navigate through each successive page in the listings
        for page in range(pages):
            # Grab all links on page. We can filter them later.
            for a in driver.find_elements(By.TAG_NAME, "a"):
                # Read href once (each call is a browser round-trip) and
                # skip anchors that have none.
                href = a.get_property('href')
                if href and 'rooms' in href:
                    links.append({'city': city, 'url': href})

            # Go to next page and wait until it has loaded. Nothing follows
            # the last page, so skip the click and the 10 s pause there.
            if page < pages - 1:
                driver.find_elements(By.TAG_NAME, "nav")[-1].find_elements(By.TAG_NAME, "a")[-1].click()
                time.sleep(10)
    finally:
        # Close the browser for this city even if scraping failed, so Chrome
        # processes do not pile up.
        driver.quit()
# Build a two-column (city, url) frame from the collected links and show its size.
urls_one_day = pd.DataFrame(links)
urls_one_day.shape
(9475, 2)
# Drop rows that repeat the same city/url pair, renumber the index, and
# show 10 randomly chosen rows.
urls_one_day = urls_one_day.drop_duplicates().reset_index(drop=True)
urls_one_day.sample(10)
city url
410 Austin--TX--United-States https://www.airbnb.com/rooms/26293461?adults=1...
720 Los-Angeles--United-States https://www.airbnb.com/rooms/38905402?adults=1...
826 New-York-City--Manhattan--United-States https://www.airbnb.com/rooms/4936254?adults=1&...
810 New-York-City--Manhattan--United-States https://www.airbnb.com/rooms/5406041?adults=1&...
180 Dallas--TX--United-States https://www.airbnb.com/rooms/43819359?adults=1...
469 Austin--TX--United-States https://www.airbnb.com/rooms/43913939?adults=1...
335 Austin--TX--United-States https://www.airbnb.com/rooms/19496423?adults=1...
295 Austin--TX--United-States https://www.airbnb.com/rooms/71901894796619394...
564 Los-Angeles--United-States https://www.airbnb.com/rooms/35969924?adults=1...
727 Los-Angeles--United-States https://www.airbnb.com/rooms/11355480565728754...
# Save the deduplicated links; the file name carries the scrape month and year.
urls_one_day.to_csv(f'urls_one_day_{month}_{year}.csv', index=False)

Scrape listing details

# Local import: this cell is the only user of the query-string helpers.
from urllib.parse import parse_qs, urlsplit

# Reload the saved links so this section can run without repeating the link scrape.
urls_one_day = pd.read_csv(f'urls_one_day_{month}_{year}.csv')

# Scrape the details of every saved listing URL. Listings for which
# extract_features returns nothing (timeout / failure) are skipped.
data = []
for row in tqdm(urls_one_day.itertuples(index=False), total=len(urls_one_day)):
    features = extract_features(row.url)
    if not features:
        continue
    # Read the stay dates from the URL's query string. A URL without these
    # parameters yields None instead of raising IndexError and aborting the
    # whole (long) scrape.
    query = parse_qs(urlsplit(row.url).query)
    features['location'] = row.city
    features['check_in'] = query.get('check_in', [None])[0]
    features['check_out'] = query.get('check_out', [None])[0]
    data.append(features)
df = pd.DataFrame(data)
df.to_csv(f"one_day_listings_{month}_{year}.csv", index=False)
df.shape
(1323, 13)
# Preview the first rows of the one-day listing details.
df.head()
weekly_discount long_stay_discount cleaning_fee price_minus_fees title guest bedrooms beds baths url location check_in check_out
0 $45 $425 Entire guesthouse in Dallas, Texas, United States 2 1 1 1 https://www.airbnb.com/rooms/12759576?adults=1... Dallas--TX--United-States 2024-07-07 2024-07-12
1 $65 $905 Entire cabin in Heath, Texas, United States 3 1 2 1 https://www.airbnb.com/rooms/10085991956732392... Dallas--TX--United-States 2024-06-16 2024-06-21
2 $75 $450 Entire guesthouse in Dallas, Texas, United States 2 1 2 1 https://www.airbnb.com/rooms/46852636?adults=1... Dallas--TX--United-States 2024-05-19 2024-05-24
3 $60 $749 Entire guest suite in Dallas, Texas, United St... 2 1 1 1 https://www.airbnb.com/rooms/63351740544145347... Dallas--TX--United-States 2024-06-25 2024-06-30
4 $30 $322 Room in Dallas, Texas, United States NaN NaN NaN Dedicated https://www.airbnb.com/rooms/15110568?adults=1... Dallas--TX--United-States 2024-05-14 2024-05-19