offCanada · sandrinecomeau · Jan 28, 2026 · Jan 28, 2026 · Jan 28, 2026 · Jan 28, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,7 @@
+# ======== Configs ========
 .idea
 .venv
+.DS_Store
 
-# ======== Pour les Macs ========
-.DS_Store
+# ======== Others ========
+/webscraping/Loblaws_htmls
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,10 @@
 pytest
 duckdb==1.4.3
 scikit-learn
+
+# webscraping
 beautifulsoup4
 requests
 tqdm
+lxml
+selenium
diff --git a/webscraping/loblaw-scraping.py b/webscraping/loblaw-scraping.py
@@ -0,0 +1,176 @@
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+
+import re
+import time
+import requests
+import os
+
+# Sitemap passed to filter_loblaws_xml() in the __main__ block.
+SITEMAP_URL = "https://www.loblaws.ca/sitemap.xml"
+# Text report created by write_output_headers() and appended to by the
+# other write_* helpers.
+OUTPUT_TXT = "webscraping/loblaws_products_output.txt"
+
+# HTTP headers sent with the sitemap request in the __main__ block.
+HEADERS = {
+    "User-Agent": "Mozilla/5.0",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+}
+
+# URL fragments: filter_loblaws_xml() keeps a sitemap <loc> only when it
+# contains "L3" and at least one of these fragments.
+CATEGORY_FILTERS = ["en/food/international-foods/european-foods/bakery",
+                    "en/food/frozen/appetizers-snacks/a"]
+
+
+def filter_loblaws_xml(path, headers, timeout=30):
+    """Download a sitemap and return the category URLs selected for scraping.
+
+    A ``<loc>`` entry is kept when its text contains the marker ``L3`` and
+    at least one fragment listed in ``CATEGORY_FILTERS``.
+
+    Args:
+        path: URL of the sitemap XML document.
+        headers: HTTP headers sent with the request.
+        timeout: Request timeout in seconds.
+
+    Returns:
+        List of the matching URLs, with surrounding whitespace stripped.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+    """
+    response = requests.get(path, headers=headers, timeout=timeout)
+    response.raise_for_status()
+    sitemap = BeautifulSoup(response.content, "lxml-xml")
+
+    matching_urls = []
+    for loc in sitemap.find_all("loc"):
+        raw_text = loc.get_text()
+        if 'L3' not in raw_text:
+            continue
+        if any(category in raw_text for category in CATEGORY_FILTERS):
+            matching_urls.append(loc.get_text(strip=True))
+
+    return matching_urls
+
+def scrape_all_pages(category_url: str, max_pages: int = 50, delay: float = 5):
+    """Scrape the result pages of one category and append them to the report.
+
+    Pages are requested one after another until the site reports that no
+    items are available, a request fails, a page has no product grid, or
+    ``max_pages`` pages have been read.
+
+    Args:
+        category_url: URL of the first page of the category.
+        max_pages: Safety cap on the number of pages requested.
+        delay: Seconds to wait between two page requests.
+    """
+    write_products_header(category_url)
+    # Append the page parameter correctly whether or not the URL already
+    # carries a query string.
+    separator = "&" if "?" in category_url else "?"
+
+    for page_number in range(1, max_pages + 1):
+        if page_number == 1:
+            url = category_url
+        else:
+            url = f"{category_url}{separator}page={page_number}"
+        print(url)
+
+        r = request_url(url)
+        if r is None:
+            # request_url() returns None once its retries are exhausted;
+            # there is nothing to parse, so stop this category.
+            print("request_failed")
+            break
+
+        # To keep a copy of the page, call
+        # save_html(r.content, url, "webscraping/L3_htmls") here.
+        html = BeautifulSoup(r.content, 'html.parser')
+
+        if is_products_page_empty(html):
+            print("page_empty")
+            break
+
+        products = scrape_category_products(html)
+        if products is None:
+            # scrape_category_products() returns None when the listing
+            # container or the product grid is missing from the page.
+            print("no_product_grid")
+            break
+
+        write_products(products)
+
+        # Pause only when another page will actually be requested.
+        if page_number < max_pages:
+            time.sleep(delay)
+
+def scrape_category_products(html_content):
+    """Extract the products listed in the product grid of a category page.
+
+    Args:
+        html_content: Parsed page (a BeautifulSoup document or tag).
+
+    Returns:
+        A list of dicts with the keys ``url``, ``brand``, ``title`` and
+        ``package_size`` (a field is None when its element is absent), or
+        None when the listing container or the product grid is missing.
+    """
+    def text_of(anchor, test_id):
+        # Stripped text of the descendant carrying this data-testid, if any.
+        node = anchor.select_one(f'[data-testid="{test_id}"]')
+        return node.get_text(strip=True) if node else None
+
+    container = html_content.find(
+        "div", attrs={"data-testid": "listing-page-container"}
+    )
+    if not container:
+        return None
+
+    grid = container.find("div", attrs={"data-testid": "product-grid-component"})
+    if not grid:
+        return None
+
+    return [
+        {
+            "url": anchor.get("href"),
+            "brand": text_of(anchor, "product-brand"),
+            "title": text_of(anchor, "product-title"),
+            "package_size": text_of(anchor, "product-package-size"),
+        }
+        for anchor in grid.select("a.chakra-linkbox__overlay")
+    ]
+
+
+
+def request_url(url, max_retries=2, retry_delay=5, request_timeout=15, headers=None):
+    """Send a GET request to ``url``, retrying when it fails.
+
+    Args:
+        url: Address to fetch.
+        max_retries: Total number of attempts.
+        retry_delay: Seconds to wait between two attempts.
+        request_timeout: Timeout of each attempt, in seconds.
+        headers: Optional HTTP headers forwarded to ``requests.get``.
+
+    Returns:
+        The response of the first successful attempt, or None when every
+        attempt failed (connection error, timeout or HTTP error status).
+    """
+    for attempt in range(1, max_retries + 1):
+        try:
+            response = requests.get(url, headers=headers, timeout=request_timeout)
+            response.raise_for_status()
+            return response
+        except requests.RequestException as e:
+            print(f"[{url}: attempt {attempt}/{max_retries}] Request failed: {e}")
+            # Wait only when another attempt follows; sleeping after the
+            # last failure would just delay the caller.
+            if attempt < max_retries:
+                time.sleep(retry_delay)
+
+    print(f"[{url}] Max retries reached, returning None")
+    return None
+
+
+def save_html(html_content, url, directory="webscraping/"):
+    """Write a downloaded page to ``directory`` under a name derived from ``url``.
+
+    Args:
+        html_content: Raw page content as bytes.
+        url: Address the page was fetched from; used to build the file name.
+        directory: Target directory, created when it does not exist yet.
+    """
+    # Create the target directory on demand so callers need no manual setup.
+    if directory:
+        os.makedirs(directory, exist_ok=True)
+
+    filename = sanitize_html_filename(url)
+    file_path = os.path.join(directory, f"{filename}.html")
+
+    with open(file_path, "wb") as f:
+        f.write(html_content)
+
+
+def sanitize_html_filename(path_name):
+    """Turn a URL into a string usable as a file name.
+
+    The ``https://`` and ``http://`` prefixes are removed, then every
+    ``/``, ``?``, ``=`` and ``&`` is replaced by an underscore.
+    """
+    without_scheme = path_name.replace("https://", "").replace("http://", "")
+    underscore_table = str.maketrans("/?=&", "____")
+    return without_scheme.translate(underscore_table)
+
+def is_products_page_empty(html):
+    """Tell whether a listing page shows the "No items are available" notice.
+
+    Args:
+        html: Parsed page (a BeautifulSoup document or tag).
+
+    Returns:
+        True when any ``<p data-testid="sub-heading">`` contains the notice.
+    """
+    for sub_heading in html.find_all("p", attrs={"data-testid": "sub-heading"}):
+        if "No items are available" in sub_heading.get_text(strip=True):
+            return True
+    return False
+
+
+def write_output_headers(urls, filename=OUTPUT_TXT, mode="w"):
+    """Write the report banner followed by the numbered list of searched URLs.
+
+    Args:
+        urls: Category URLs that are going to be scraped.
+        filename: Path of the report file.
+        mode: Text mode used to open the file ("w" starts a fresh report).
+    """
+    banner = "=" * 80
+    # The banner text contains accented characters, so the encoding is fixed
+    # to UTF-8 instead of depending on the platform default.
+    with open(filename, mode, encoding="utf-8") as f:
+        f.write(f"{banner}\n")
+        f.write("RÉSULTATS DU SCRAPING - PRODUITS LOBLAWS.CA\n")
+        f.write(f"{banner}\n\n")
+
+        f.write("URLS CHERCHÉS\n")
+
+        f.writelines(f"{i}. {url}\n" for i, url in enumerate(urls, start=1))
+
+def write_products_header(url):
+    """Append a blank line and the "Produits pour <url>" heading to the report."""
+    with open(OUTPUT_TXT, mode="a") as report:
+        report.write(f"\nProduits pour {url}\n")
+
+
+def write_products(products):
+    """Append the numbered product URLs of one page to the report.
+
+    Args:
+        products: List of product dicts (each with a ``url`` key), or None
+            when the page had no product grid; None writes nothing.
+    """
+    # scrape_category_products() returns None when no grid was found;
+    # iterating over None would raise a TypeError.
+    if products is None:
+        return
+
+    with open(OUTPUT_TXT, mode="a") as f:
+        for i, product in enumerate(tqdm(products), start=1):
+            f.write(f"{i}.{product['url']}\n")
+
+if __name__ == "__main__":
+    # Collect the category URLs from the sitemap and start a fresh report.
+    category_urls = filter_loblaws_xml(SITEMAP_URL, HEADERS)
+    write_output_headers(category_urls)
+
+    # Scrape the categories one by one, with a progress bar.
+    for category_url in tqdm(category_urls):
+        scrape_all_pages(category_url)
diff --git a/webscrapping/loblaws-sitemap.xml → webscraping/loblaws-sitemap.xml b/webscrapping/loblaws-sitemap.xml → webscraping/loblaws-sitemap.xml
diff --git a/webscraping/selenium_test.py b/webscraping/selenium_test.py
@@ -0,0 +1,38 @@
+"""Manual Selenium check: open a Loblaws page and dump the product-grid links."""
+import time
+
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException, TimeoutException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+driver = webdriver.Chrome()
+
+try:
+    driver.get("https://www.loblaws.ca/en/old-fashioned-beef-salami-thin-sliced/p/20737126_KG?source=nspt")
+
+    # NOTE(review): fixed pause kept from the original, presumably to let the
+    # page finish rendering before the explicit wait - confirm it is needed.
+    time.sleep(20)
+
+    listing_container = WebDriverWait(driver, 10).until(
+        EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='listing-page-container']"))
+    )
+
+    product_grid = listing_container.find_element(By.CSS_SELECTOR, "div[data-testid='product-grid-component']")
+    product_cards = product_grid.find_elements(By.CSS_SELECTOR, "a")
+
+    # get_attribute() returns None for anchors without an href; skip those
+    # instead of failing while building the output line.
+    hrefs = (card.get_attribute("href") for card in product_cards)
+    urls = [href for href in hrefs if href]
+
+    with open("webscraping/urls.txt", "w", encoding="utf-8") as f:
+        for i, url in enumerate(urls, start=1):
+            f.write(f"{i}. {url}\n")
+
+except (NoSuchElementException, TimeoutException) as e:
+    # WebDriverWait.until() raises TimeoutException and find_element()
+    # raises NoSuchElementException; report the failure instead of hiding it.
+    print(f"Product grid not found: {e}")
+
+finally:
+    # Pause kept from the original (presumably leaves the browser visible for
+    # inspection), then always close the browser, even after an error.
+    time.sleep(10)
+    driver.quit()
diff --git a/webscrapping/test.py → webscraping/test.py b/webscrapping/test.py → webscraping/test.py
@@ -208,5 +208,6 @@ def main():
 
             f.write("\n")
 
-
+if __name__ == "__main__":
+    main()
 
diff --git a/webscrapping/loblaw-scraping.py b/webscrapping/loblaw-scraping.py
Original file line number	Diff line number	Diff line change
Expand Up		@@ -208,5 +208,6 @@ def main():

		f.write("\n")


		if __name__ == "__main__":
		main()