Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# ======== Configs ========
.idea
.venv
.DS_Store

# ======== Pour les Macs ========
.DS_Store
# ======== Others ========
/webscraping/Loblaws_htmls
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
pytest
duckdb==1.4.3
scikit-learn

#webscraping
beautifulsoup4
requests
tqdm
lxml
selenium
176 changes: 176 additions & 0 deletions webscraping/loblaw-scraping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
from bs4 import BeautifulSoup
from tqdm import tqdm

import re
import time
import requests
import os

# Sitemap document that the __main__ block hands to filter_loblaws_xml().
SITEMAP_URL = "https://www.loblaws.ca/sitemap.xml"
# Text report created by write_output_headers() and appended to by the other
# write_* helpers.
OUTPUT_TXT = "webscraping/loblaws_products_output.txt"

# HTTP headers sent with the sitemap request (passed in by the __main__ block).
HEADERS = {
"User-Agent": "Mozilla/5.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# URL fragments used as an allow-list: filter_loblaws_xml() keeps a sitemap
# entry only when it contains at least one of these substrings.
CATEGORY_FILTERS = ["en/food/international-foods/european-foods/bakery",
"en/food/frozen/appetizers-snacks/a"]


def filter_loblaws_xml(path, headers, timeout=30):
    """Download a sitemap and return the category URLs selected for scraping.

    The XML document at ``path`` is fetched and every ``<loc>`` entry is kept
    when its text contains the marker ``'L3'`` and at least one of the
    substrings listed in ``CATEGORY_FILTERS``.

    Args:
        path: URL of the sitemap XML document.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The matching URLs, whitespace-stripped, in document order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(path, headers=headers, timeout=timeout)
    response.raise_for_status()
    sitemap = BeautifulSoup(response.content, "lxml-xml")

    matching_urls = []
    for loc in sitemap.find_all("loc"):
        raw_text = loc.get_text()
        if 'L3' not in raw_text:
            continue
        if not any(category in raw_text for category in CATEGORY_FILTERS):
            continue
        matching_urls.append(loc.get_text(strip=True))

    return matching_urls

def scrape_all_pages(category_url: str, max_pages: int = 50, delay_seconds: float = 5):
    """Scrape every result page of one category and append it to the report.

    Pagination stops at the first of: a failed request, a page that reports
    no items, a page without a product grid, or ``max_pages`` pages scraped.

    Args:
        category_url: URL of the first page of the category listing.
        max_pages: Upper bound on the number of pages requested.
        delay_seconds: Pause between two consecutive page requests.
    """
    write_products_header(category_url)

    # Page 1 is the bare category URL; later pages add a ``page`` parameter.
    # The first query parameter must be introduced with "?", not "&".
    separator = "&" if "?" in category_url else "?"

    for page_number in range(1, max_pages + 1):
        if page_number == 1:
            url = category_url
        else:
            url = f"{category_url}{separator}page={page_number}"

        print(url)
        r = request_url(url)
        if r is None:
            # request_url() already logged the failures; nothing to parse.
            print("request_failed")
            break

        # To keep the raw pages for debugging, call
        # save_html(r.content, url, <directory>) here.
        html = BeautifulSoup(r.content, 'html.parser')

        if is_products_page_empty(html):
            print("page_empty")
            break

        products = scrape_category_products(html)
        if products is None:
            # No listing container / product grid in the markup: the following
            # pages would look the same, so stop instead of crashing.
            print("product_grid_missing")
            break

        write_products(products)

        # Be polite to the server between page requests.
        time.sleep(delay_seconds)

def _text_or_none(anchor, selector):
    """Return the stripped text of the first match of ``selector``, or None."""
    node = anchor.select_one(selector)
    return node.get_text(strip=True) if node else None


def scrape_category_products(html_content):
    """Extract the products shown in the grid of a category listing page.

    Args:
        html_content: Parsed page exposing the BeautifulSoup search API.

    Returns:
        ``None`` when the listing container or the product grid is missing.
        Otherwise a list with one dict per product link, holding the keys
        ``url``, ``brand``, ``title`` and ``package_size``; a field is
        ``None`` when the corresponding element is absent.
    """
    container = html_content.find("div", {"data-testid": "listing-page-container"})
    if not container:
        return None

    grid = container.find("div", {"data-testid": "product-grid-component"})
    if not grid:
        return None

    return [
        {
            "url": anchor.get("href"),
            "brand": _text_or_none(anchor, '[data-testid="product-brand"]'),
            "title": _text_or_none(anchor, '[data-testid="product-title"]'),
            "package_size": _text_or_none(anchor, '[data-testid="product-package-size"]'),
        }
        for anchor in grid.select("a.chakra-linkbox__overlay")
    ]



def request_url(url, max_retries=2, retry_delay=5, request_timeout=15, headers=None):
    """GET ``url`` with a bounded number of attempts.

    Args:
        url: Address to download.
        max_retries: Total number of attempts made before giving up.
        retry_delay: Seconds to wait between two attempts.
        request_timeout: Per-request timeout in seconds.
        headers: Optional HTTP headers forwarded to ``requests.get``;
            ``None`` keeps the library defaults.

    Returns:
        The successful ``requests.Response``, or ``None`` when every attempt
        failed (connection error, timeout or HTTP error status).
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print(f"[{url}: attempt {attempt}/{max_retries}] Request failed: {e}")
            # Only wait when another attempt follows; sleeping after the
            # final failure would just delay the caller.
            if attempt < max_retries:
                time.sleep(retry_delay)

    print(f"[{url}] Max retries reached, returning None")
    return None


def save_html(html_content, url, directory="webscraping/"):
    """Store the raw bytes of a downloaded page under a name derived from its URL.

    Args:
        html_content: Page body as bytes (the file is opened in binary mode).
        url: Source URL; converted to a file name by sanitize_html_filename().
        directory: Destination folder; created when it does not exist yet.
    """
    filename = sanitize_html_filename(url)

    # An empty string means "current directory", which os.makedirs rejects.
    if directory:
        os.makedirs(directory, exist_ok=True)

    file_path = os.path.join(directory, f"{filename}.html")
    with open(file_path, "wb") as f:
        f.write(html_content)


def sanitize_html_filename(path_name):
    """Turn a URL into a flat string usable as a file name.

    Every occurrence of ``https://`` and ``http://`` is removed, then each of
    the characters ``/``, ``?``, ``=`` and ``&`` is replaced by ``_``.
    """
    without_scheme = path_name.replace("https://", "").replace("http://", "")
    # Single pass over the string instead of one replace() per character.
    separators_to_underscore = str.maketrans("/?=&", "____")
    return without_scheme.translate(separators_to_underscore)

def is_products_page_empty(html):
    """Tell whether a listing page announces that it has no products.

    Returns ``True`` as soon as one ``<p data-testid="sub-heading">`` element
    contains the text "No items are available", ``False`` otherwise.
    """
    for paragraph in html.find_all("p", {"data-testid": "sub-heading"}):
        if "No items are available" in paragraph.get_text(strip=True):
            return True
    return False


def write_output_headers(urls, filename=OUTPUT_TXT, mode="w"):
    """Write the report banner followed by the numbered list of searched URLs.

    Args:
        urls: Iterable of category URLs that are going to be scraped.
        filename: Path of the report file.
        mode: ``open()`` mode; the default ``"w"`` starts a fresh report.
    """
    banner = "=" * 80

    # The banner contains accented characters, so the encoding is pinned
    # rather than left to the platform default.
    with open(filename, mode, encoding="utf-8") as f:
        f.write(f"{banner}\n")
        f.write("RÉSULTATS DU SCRAPING - PRODUITS LOBLAWS.CA\n")
        f.write(f"{banner}\n\n")

        f.write("URLS CHERCHÉS\n")

        f.writelines(f"{i}. {url}\n" for i, url in enumerate(urls, start=1))

def write_products_header(url, filename=OUTPUT_TXT):
    """Append the section title announcing the products of one category.

    Args:
        url: Category URL whose products follow in the report.
        filename: Path of the report file (defaults to ``OUTPUT_TXT``).
    """
    # Encoding is pinned so the output does not depend on the platform
    # default.
    with open(filename, mode="a", encoding="utf-8") as f:
        f.write("\n")
        f.write(f"Produits pour {url}\n")


def write_products(products, filename=OUTPUT_TXT):
    """Append one numbered line per product URL to the report.

    Args:
        products: Sequence of dicts carrying at least a ``"url"`` key.
            ``None`` or an empty sequence writes nothing.
        filename: Path of the report file (defaults to ``OUTPUT_TXT``).
    """
    # scrape_category_products() returns None when the grid is missing;
    # treat that like "no products" instead of failing inside the loop.
    if not products:
        return

    with open(filename, mode="a", encoding="utf-8") as f:
        for i, product in enumerate(tqdm(products), start=1):
            f.write(f"{i}.{product['url']}\n")

if __name__ == "__main__":
    # 1. Collect the category URLs selected from the sitemap.
    urls = filter_loblaws_xml(SITEMAP_URL, HEADERS)

    # 2. Start a fresh report listing what is about to be scraped.
    write_output_headers(urls)

    # 3. Walk through every category, showing overall progress.
    for category_url in tqdm(urls):
        scrape_all_pages(category_url)
File renamed without changes.
38 changes: 38 additions & 0 deletions webscraping/selenium_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import time

from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
from time import sleep
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def _find_product_links(driver):
    """Return the product-grid links once at least one is rendered, else False."""
    listing_container = driver.find_element(
        By.CSS_SELECTOR, "div[data-testid='listing-page-container']"
    )
    product_grid = listing_container.find_element(
        By.CSS_SELECTOR, "div[data-testid='product-grid-component']"
    )
    return product_grid.find_elements(By.CSS_SELECTOR, "a") or False


def main():
    """Open the page in Chrome and dump the product-grid links to a text file.

    The links are written, numbered from 1, to ``webscraping/urls.txt``.
    The browser is always closed, even when the grid never shows up.
    """
    # Imported locally: the module header only brings in NoSuchElementException.
    from selenium.common import TimeoutException

    driver = webdriver.Chrome()
    try:
        # NOTE(review): this URL looks like a single-product page, while the
        # selectors below target a listing page - confirm the intended target.
        driver.get("https://www.loblaws.ca/en/old-fashioned-beef-salami-thin-sliced/p/20737126_KG?source=nspt")

        try:
            # Poll until the grid links exist instead of sleeping a fixed 20 s;
            # WebDriverWait ignores NoSuchElementException while polling.
            # NOTE(review): this returns on the first link found; if the grid
            # renders cards incrementally, the old fixed sleep may have
            # captured more of them.
            product_cards = WebDriverWait(driver, 30).until(_find_product_links)
        except (TimeoutException, NoSuchElementException) as exc:
            print(f"Product grid not found: {exc}")
            return

        hrefs = (card.get_attribute("href") for card in product_cards)
        # Anchors without an href yield None and cannot be written out.
        urls = [href for href in hrefs if href]

        with open("webscraping/urls.txt", "w", encoding="utf-8") as f:
            f.writelines(f"{i}. {url}\n" for i, url in enumerate(urls, start=1))
    finally:
        driver.quit()


# Guarded so that importing this module (e.g. pytest collecting *_test.py
# files) does not launch a browser.
if __name__ == "__main__":
    main()
3 changes: 2 additions & 1 deletion webscrapping/test.py → webscraping/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,5 +208,6 @@ def main():

f.write("\n")


if __name__ == "__main__":
main()

75 changes: 0 additions & 75 deletions webscrapping/loblaw-scraping.py

This file was deleted.