Web Scraping to Build Wordlists
To obtain wordlists focused on specific topics, you can rely on the technique of web scraping, extracting the useful information directly from websites.
The example Python script below can be used to build wordlists of Italian first names, extracting them directly from the site nomix.it.
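The script relies on the requests and beautifulsoup4 packages; if they are not already available in your environment, they can be installed with pip, for example: pip install requests beautifulsoup4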
Python script: webscraping-names.py
from bs4 import BeautifulSoup
import time  # Optional delay between requests
import requests
import argparse  # Import argparse for handling command-line arguments

# Set up argparse to handle command-line arguments
parser = argparse.ArgumentParser(description="Scrape names from the website.")
parser.add_argument('gender', choices=['maschili', 'femminili'], help="Select 'maschili' or 'femminili'")
args = parser.parse_args()

# Define the letter groupings according to the site's rules
letter_groups = [
    "A", "B", "C", "D", "E", "F", "G", "I", "L",
    "M", "NO", "PQ", "R", "S", "TUV", "WZ"
]


# Function to extract names from a given section
def extract_names(section):
    names = []  # List to hold the extracted names
    rows = section.find_all('tr')  # Find all rows in the table
    for row in rows:
        # Check for <a> tags
        link = row.find('a')
        if link and link.has_attr('title'):
            title = link.get('title').strip()  # Get the title attribute
            name = link.text.strip()  # Get the text content
            names.append((title, name))
        # Check for names in <td> without <a> tags
        td = row.find('td')
        if td and not link:  # Ensure it's not already added
            name = td.text.strip()
            if name:  # Ensure the name is not empty
                names.append((None, name))  # No title for these names
    return names


# Loop through each letter group
for group in letter_groups:
    # Construct the URL for the current group
    url = f"https://www.nomix.it/nomi-italiani-lettera-{group}.php"
    # print(f"Fetching {args.gender} names for group '{group}' from {url}...")

    # Send a GET request to the website
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Select the section based on the gender input
        section = None
        for div in soup.find_all('div', class_='pure-u-1 pure-u-md-1-2'):
            header = div.find('h3')
            if header:
                if (args.gender == 'maschili' and header.text.strip() == "Maschili") or \
                   (args.gender == 'femminili' and header.text.strip() == "Femminili"):
                    section = div  # Set section to the matched div
                    break  # Exit the loop

        # Check if section is found
        if section is not None:
            names = extract_names(section)
            # Print names
            if names:
                for title, name in names:
                    title_output = title if title else "No Title"  # Only used by the DEBUG print below
                    # DEBUG
                    # print(f"{args.gender.capitalize()} - Title: {title_output}\nName: {name}\n")
                    print(f"{name}")
            else:
                print(f"No {args.gender} names found.")
        else:
            print(f"No section found for '{args.gender}' in group '{group}'.")
    else:
        print(f"Failed to retrieve the page for group '{group}'. Status code: {response.status_code}")

    # Optional: brief pause between requests to avoid hammering the server
    time.sleep(1)