
Web Scraping Meetup
Posted by mariona (17 posts) on 19/12/2023 09:05:48
Hi, I'm trying to scrape the Meetup events for Madrid and export them to a CSV with the following variables: image_url, event_name, event_date, event_time, event_group, description_text, location_name, location_info, map_link, event_categories.
I have the code below, but it doesn't work. I'm not a programmer and I don't really know how to go about this, so I'd be grateful for any help; I need it for a project. Thanks in advance.
I don't know whether the problem lies in the pagination or whether something in the code is badly defined. In case it is the pagination, I have also included a scroll-based sketch after the code.
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import os
import time
import random

# URL of the page with Meetup events in Madrid
base_url = "https://www.meetup.com/es-ES/find/?location=es--Madrid&source=EVENTS&eventType=inPerson"

# Limit of events to collect
event_limit = 100

# Directory to store the event images
image_dir = "event_images"
os.makedirs(image_dir, exist_ok=True)

# List to store event data
data = []

# Initialize a Selenium web driver (requires Chrome and chromedriver)
driver = webdriver.Chrome()
# Function to get the details of one event, including its image
def get_event_details(event_url):
    driver.get(event_url)
    # Wait 1-2 seconds so the event page can finish rendering before parsing it
    time.sleep(1 + random.uniform(0, 1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find the HTML element containing the event image within the event page
    image_element = soup.find("img", {"alt": True, "data-nimg": True})
    image_url = image_element['src'] if image_element else "Image URL not available"

    # Guard against a missing description block before calling find_all on it
    description_element = soup.find("div", {"class": "break-words"})
    if description_element:
        paragraphs = description_element.find_all("p")
        description_text = "\n".join(p.text.strip() for p in paragraphs)
    else:
        description_text = "Description not available"

    location_name_element = soup.select_one("a[data-testid='venue-name-link']")
    location_name = location_name_element.text if location_name_element else "Information not available"

    location_info_element = soup.find("div", {"class": "text-gray6", "data-testid": "location-info"})
    location_info = location_info_element.text.strip() if location_info_element else "Information not available"

    map_link_element = soup.find("a", {"data-testid": "map-link"})
    map_link = map_link_element['href'] if map_link_element else "Map link not available"

    # Reuse the page that is already parsed instead of loading it a second time
    categories = get_event_categories(soup)
    return image_url, description_text, location_name, location_info, map_link, categories
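
# Note (untested alternative): time.sleep is a blunt wait. Selenium's explicit
# waits return as soon as the element appears, which is faster and more robust:
#
#   from selenium.webdriver.common.by import By
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#
#   WebDriverWait(driver, 10).until(
#       EC.presence_of_element_located((By.CSS_SELECTOR, "div.break-words"))
#   )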
# Function to get the event categories from an already parsed event page
def get_event_categories(soup):
    categories_element = soup.find("div", {"id": "topics"})
    if categories_element:
        categories = categories_element.find_all("a")
        categories_text = ", ".join(category.text for category in categories)
    else:
        categories_text = "Categories not available"
    return categories_text
# Counter for the number of collected events
event_count = 0

# Set of event URLs that have already been collected, to avoid duplicates
collected_event_urls = set()

# Current page number
page_number = 1

while event_count < event_limit:
    # Build the URL of the current page
    url = f"{base_url}&page={page_number}"

    # Load the results page in the browser and parse it
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find all elements containing event cards
    event_elements = soup.find_all("div", {"data-element-name": "categoryResults-eventCard"})
    if not event_elements:
        break

    for event_element in event_elements:
        # Find the link to the event page; skip cards without one
        event_url_element = event_element.find("a", {"data-event-label": "Event card"})
        if not event_url_element:
            continue
        event_url = event_url_element["href"]

        # Check whether we have already collected this event
        if event_url in collected_event_urls:
            continue

        image_url, description_text, location_name, location_info, map_link, event_categories = get_event_details(event_url)

        # Card fields can be missing, so guard every lookup before reading .text
        event_name_element = event_element.find("h2", class_="text-gray7 font-medium text-base pb-1 pt-0 line-clamp-3")
        event_name = event_name_element.text.strip() if event_name_element else "Name not available"

        event_date_element = event_element.find("span")
        event_date = event_date_element.text.strip() if event_date_element else "Date not available"

        event_time_element = event_element.find("time")
        if event_time_element:
            time_spans = event_time_element.find_all("span")
            event_time = time_spans[1].text.strip() if len(time_spans) > 1 else event_time_element.text.strip()
        else:
            event_time = "Time not available"

        event_group_element = event_element.find("p", class_="text-gray6")
        event_group = event_group_element.text.strip() if event_group_element else "Group not available"

        # Add the event URL to the set of collected events
        collected_event_urls.add(event_url)
        data.append([image_url, event_name, event_date, event_time, event_group, description_text,
                     location_name, location_info, map_link, event_categories])
        event_count += 1

        if event_count >= event_limit:
            break

    page_number += 1

driver.quit()
# Save the data to a CSV file
with open('meetup_events_madrid.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Image URL", "Name", "Date", "Time", "Group", "Description",
                     "Location Name", "Location Info", "Map Link", "Categories"])
    writer.writerows(data)
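In case the problem really is the pagination: I am not sure the find page honours a &page= parameter at all; it may only load more results as you scroll (an assumption I have not been able to verify). Here is a minimal sketch of collecting the cards by scrolling instead, under that assumption:

# Sketch (assumption, unverified): Meetup's find page appends event cards on
# scroll, so "&page=N" may return the same first batch every time.
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()
driver.get("https://www.meetup.com/es-ES/find/?location=es--Madrid&source=EVENTS&eventType=inPerson")

seen = 0
for _ in range(10):  # scroll at most 10 times
    # Scroll to the bottom of the page to trigger loading of more cards
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the page time to append the new cards
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    cards = soup.find_all("div", {"data-element-name": "categoryResults-eventCard"})
    if len(cards) == seen:
        break  # no new cards appeared; stop scrolling
    seen = len(cards)

print(f"{seen} event cards loaded")
driver.quit()

If that loads more than the first batch of cards, the per-event extraction above could run over those cards instead of over page numbers.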