# import necessary libraries
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver import ChromeOptions
import timeNote: This is an adapted version of the original scraper.py script optimized for Quarto.
Accordingly, the number of bands exported from this file to metallum_bands.db or metallum_bands.csv may differ slightly from those shown in subsequent analyses.
# set options to fine-tune ChromeOptions()
options = ChromeOptions()
# run headless ChromeDriver to avoid popup
options.add_argument('--headless=new')# instantiate Chrome driver with headless browser
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),
options=options)
# define base url
base_url = 'https://www.metal-archives.com/lists/'
# instantiate driver from base_url
driver.get(base_url)# define function to extract band info from each individual page
def extract_band_info(page_source):
# create BeautifulSoup object to parse HTMl aspects of page
soup = BeautifulSoup(page_source, 'html.parser')
# locate band info rows and select
rows = soup.select('#bandListAlpha tbody tr')
# create empty list to append data onto
band_info = []
# iterate through rows (bands) in page
for row in rows:
# define entry as dictionary that gets only text aspects
# nth preserves increasing tr[] object
entry = {
'band_name': row.select_one('td:nth-of-type(1) a').text.strip(),
'country': row.select_one('td:nth-of-type(2)').text.strip(),
'genre': row.select_one('td:nth-of-type(3)').text.strip(),
'status': row.select_one('td:nth-of-type(4) span').text.strip()
}
# append band_info with each entry
band_info.append(entry)
# return the final band_info
return band_info# define function to extract all bands from letter
def extract_bands_from_letter(page_source):
# create empty list for all bands in each letter
entire_letter_bands = []
# set page_source equal to the driver's page_source
page_source = driver.page_source
# call extract_band_info function on each page and add on to entire_letter_bands
entire_letter_bands.extend(extract_band_info(page_source=page_source))
# instantiate logic
while True:
try:
# define the next button
next_button = driver.find_element(By.XPATH, '//*[@id="bandListAlpha_next"]')
# if on last page (no next_button), break
if 'next paginate_button paginate_button_disabled' in next_button.get_attribute('class'):
break
# otherwise, click to next page
else:
next_button.click()
# give content a second to load
time.sleep(2)
# define page_source as the driver's page_source
page_source = driver.page_source
# recall extract_band_info function as long as still on same letter
entire_letter_bands.extend(extract_band_info(page_source=page_source))
# add logic to print exception and break
except Exception as e:
print(f"Error: {e}")
break
# return all bands from the letter
return entire_letter_bands# define function to get every band on metallum
def extract_all_letters(base_url):
# define base url
base_url = base_url
# define letter list
letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# define character list (NBR would have been evaluated as 'N', 'B', and 'R')
characters = ['NBR', '~']
# set letters_finished (loop-breaking arg) to False
letters_finished = False
# set characters_finished (loop-breaking arg) to False
characters_finished = False
# create empty list for all bands
all_bands = []
# loop through list of letters
for letter in letters:
# unless already looped through, then break
if letters_finished:
break
try:
# set letter_page as base_url plus the letter
letter_page = f"{base_url}{letter}"
# print the url to monitor progress
print(f"Accessing URL: {letter_page}")
# get driver to access the letter page
driver.get(letter_page)
# give it 10 if it needs it
driver.implicitly_wait(10)
# extract the bands in each letter using previous function
bands_in_letter = extract_bands_from_letter(driver.page_source)
# add entries to all_bands
all_bands.extend(bands_in_letter)
# if letter is Z (end reached), set loop to break
if letter == 'Z':
letters_finished = True
# print any exceptions
except Exception as e:
print(f"Error accessing {letter_page}: {e}")
continue
# character function is identical, but loops over the characters instead
for character in characters:
if characters_finished:
break
try:
# set character_page as base_url plus the character
character_page = f"{base_url}{character}"
# print the url to monitor progress
print(f"Accessing URL: {character_page}")
# get driver to access the character page
driver.get(character_page)
# give it 10 if it needs it
driver.implicitly_wait(10)
# extract the bands in each character using previous function
bands_in_character = extract_bands_from_letter(driver.page_source)
# add entries to all_bands
all_bands.extend(bands_in_character)
# if character is ~ (end reached), set loop to break
if character == '~':
characters_finished = True
# print any exceptions
except Exception as e:
print(f"Error accessing {character_page}: {e}")
continue
# return all the bands from all the letters and characters
return all_bands # using extract_all_letters, save all bands to all_bands
all_bands = extract_all_letters(base_url=base_url)Accessing URL: https://www.metal-archives.com/lists/A
Accessing URL: https://www.metal-archives.com/lists/B
Accessing URL: https://www.metal-archives.com/lists/C
Accessing URL: https://www.metal-archives.com/lists/D
Accessing URL: https://www.metal-archives.com/lists/E
Accessing URL: https://www.metal-archives.com/lists/F
Accessing URL: https://www.metal-archives.com/lists/G
Accessing URL: https://www.metal-archives.com/lists/H
Accessing URL: https://www.metal-archives.com/lists/I
Accessing URL: https://www.metal-archives.com/lists/J
Accessing URL: https://www.metal-archives.com/lists/K
Accessing URL: https://www.metal-archives.com/lists/L
Accessing URL: https://www.metal-archives.com/lists/M
Accessing URL: https://www.metal-archives.com/lists/N
Accessing URL: https://www.metal-archives.com/lists/O
Accessing URL: https://www.metal-archives.com/lists/P
Accessing URL: https://www.metal-archives.com/lists/Q
Accessing URL: https://www.metal-archives.com/lists/R
Accessing URL: https://www.metal-archives.com/lists/S
Accessing URL: https://www.metal-archives.com/lists/T
Accessing URL: https://www.metal-archives.com/lists/U
Accessing URL: https://www.metal-archives.com/lists/V
Accessing URL: https://www.metal-archives.com/lists/W
Accessing URL: https://www.metal-archives.com/lists/X
Accessing URL: https://www.metal-archives.com/lists/Y
Accessing URL: https://www.metal-archives.com/lists/Z
Accessing URL: https://www.metal-archives.com/lists/NBR
Accessing URL: https://www.metal-archives.com/lists/~
# quit driver
driver.quit()# convert all_bands to df
metallum_df = pd.DataFrame.from_dict(all_bands)
# create band_id column from index
metallum_df['band_id'] = metallum_df.index
# output df to csv
metallum_df.to_csv('data/metallum_bands.csv')# import sqlite and create database 'metallum_bands.db'
import sqlite3
conn = sqlite3.connect('metallum_bands.db')
# add metallum_df to database
metallum_df.to_sql('metal_archives_table', con=conn, if_exists='replace', index=False)180137
# commit database changes and close sqlite connection
conn.commit()
conn.close()