offCanada · sandrinecomeau · May 1, 2026 · Feb 27, 2026 · Feb 27, 2026 · Mar 20, 2026
diff --git a/.gitignore b/.gitignore
@@ -29,3 +29,12 @@ off_project/.user.yml
 __pycache__/
 
 .coverage
+
+# Scraping
+loblaws_mapping_output.csv
+loblaws_mapping_output.txt
+loblaws_products_output.jsonl
+logs_loblaw_scraping.txt
+logs_loblaw_mapping.txt
+output.json
+iga_category_product_urls.txt
diff --git a/webscraping/README.md b/webscraping/README.md
@@ -2,13 +2,9 @@
 
 ## Languages
 
-- [English](#english)
 - [Français](#français)
 
 
-## English
-📌 TODO
-
 
 ## Français
 
@@ -57,8 +53,9 @@ uv run flake8 .
 
 ## Épiceries
 
-- [Loblaws](#Lowblaws)
+- [Loblaws](#Loblaws)
 - [Maxi](#Maxi)
+- [IGA](#IGA)
 
 
 ### Loblaws
@@ -149,4 +146,39 @@ Arguments:
 |-------------|-------|------------|------------------|
 | `--start`   | `int` | `0`        | Index de début   |
 | `--end`     | `int` | `1000`     | Index de fin     |
-| `--website` | `str` | `"Loblaws"` | Choix : `"Loblaws"`, `"Maxi"` |
+| `--website` | `str` | `"Loblaws"` | Choix : `"Loblaws"`, `"Maxi"` |
+
+### IGA
+
+Contrairement à Loblaws et Maxi, le site d’IGA ne propose pas de sitemap et repose fortement sur du contenu dynamique chargé via JavaScript.
+
+Pour cette raison, une approche différente, basée sur Scrapy + Playwright, a été utilisée.
+
+Le scraping IGA repose sur le script `iga_products.py` et est divisé en deux étapes :
+
+- `Exploration d’une catégorie`: 
+  - Parcours des pages de catégorie
+  - Gestion de la pagination (?page=n)
+  - Extraction des URLs produits
+- `Scraping des produits`: 
+  - Visite de chaque page produit
+  - Extraction des données via JSON-LD
+
+Retourne : un JSON structuré de produits alimentaires IGA, avec leurs données nutritionnelles et descriptives.
+
+#### Exécution:
+
+`iga_products.py`:
+```bash
+cd webscraping/iga_scraper
+uv run scrapy runspider iga_products.py -O output.json
+```
+
+#### Limitations:
+Le site IGA utilise des mécanismes de protection anti-bot.
+
+Après un certain volume de requêtes, le serveur peut retourner `403 Forbidden`.
+
+Ce comportement :
+  - n’est pas lié au code
+  - provient d’un blocage côté serveur
+  - dépend du volume de requêtes et de l’IP
+<br>
diff --git a/webscraping/iga_scraper/iga_products.py b/webscraping/iga_scraper/iga_products.py
@@ -0,0 +1,278 @@
+import scrapy
+import json
+from urllib.parse import urljoin
+from scrapy_playwright.page import PageMethod
+
+
+class IgaCategoryProductsSpider(scrapy.Spider):
+    """Scrape the products of a single IGA category.
+
+    Category listing pages are requested with Playwright enabled and paged
+    with ``?page=N``. The product URLs collected there are then requested,
+    and the JSON-LD ``Product`` record of each page becomes one item.
+    """
+
+    name = "iga_category_products"
+    allowed_domains = ["iga.ca"]
+
+    # Label copied into every scraped item as "category_name".
+    category_name = "Produits_surgeles"
+    # First listing page of the category; later pages append "?page=N".
+    base_category_url = "https://www.iga.ca/fr/produits/cat%C3%A9gorie/Produits__surgel%C3%A9s"
+
+    custom_settings = {
+        # Download handler provided by scrapy-playwright, for both schemes.
+        "DOWNLOAD_HANDLERS": {
+            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+        },
+        # asyncio-based Twisted reactor (required by scrapy-playwright
+        # according to its documentation).
+        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
+        # robots.txt rules are not enforced for this spider.
+        "ROBOTSTXT_OBEY": False,
+        "PLAYWRIGHT_BROWSER_TYPE": "chromium",
+        "PLAYWRIGHT_LAUNCH_OPTIONS": {
+            "headless": True,
+        },
+        # Headers sent with every request: French-Canadian content first,
+        # and a desktop Chrome user agent string.
+        "DEFAULT_REQUEST_HEADERS": {
+            "Accept-Language": "fr-CA,fr;q=0.9,en;q=0.8",
+            "User-Agent": (
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/122.0.0.0 Safari/537.36"
+            ),
+        },
+        # Upper bound on the number of requests Scrapy runs concurrently.
+        "CONCURRENT_REQUESTS": 8,
+    }
+
+    def start_requests(self):
+        """Yield the request for the first page of the category listing.
+
+        The request is rendered by Playwright and hands the live page object
+        to ``parse_category_page`` (``playwright_include_page``), which is
+        responsible for closing it. ``page_number`` and ``all_product_urls``
+        seed the pagination state that travels from page to page in ``meta``.
+        """
+        yield scrapy.Request(
+            url=self.base_category_url,
+            meta={
+                "playwright": True,
+                "playwright_include_page": True,
+                "playwright_page_methods": [
+                    PageMethod("wait_for_load_state", "domcontentloaded"),
+                    # Fixed grace period for client-side rendering.
+                    PageMethod("wait_for_timeout", 2500),
+                ],
+                "page_number": 1,
+                "all_product_urls": [],
+            },
+            callback=self.parse_category_page,
+        )
+
+    async def start(self):
+        """Entry point used by Scrapy >= 2.13; delegates to start_requests().
+
+        Scrapy 2.13 deprecated ``start_requests()`` in favour of the
+        asynchronous ``start()``. Defining both keeps the spider working on
+        older releases (which ignore this method) and on newer ones.
+        """
+        for request in self.start_requests():
+            yield request
+
+    async def parse_category_page(self, response):
+        """Collect product URLs from one listing page, then paginate or finish.
+
+        Reads the Playwright page, the current page number and the URLs
+        accumulated so far from ``response.meta``. After the page content has
+        settled, the product URLs found on it are merged into the running set.
+
+        While a page still contributes new URLs (or for page 1, whatever it
+        contributes), the next ``?page=N`` listing is requested with the
+        updated state. Once a page beyond the first adds nothing, all URLs are
+        written to ``iga_category_product_urls.txt`` and one request per
+        product is yielded for ``parse_product``.
+        """
+        meta = response.meta
+        browser_page = meta["playwright_page"]
+        page_number = meta["page_number"]
+        all_product_urls = set(meta.get("all_product_urls", []))
+
+        try:
+            self.logger.info(
+                "Category page %s URL: %s", page_number, response.url
+            )
+
+            await self.wait_for_products_to_settle(browser_page)
+            page_urls = await self.extract_product_urls_from_current_page(
+                browser_page
+            )
+            self.logger.info(
+                "Page %s: %s URLs produit trouvées",
+                page_number,
+                len(page_urls),
+            )
+
+            before_count = len(all_product_urls)
+            all_product_urls.update(page_urls)
+            after_count = len(all_product_urls)
+            self.logger.info(
+                "Page %s: cumul avant=%s, après=%s, ajoutées=%s",
+                page_number,
+                before_count,
+                after_count,
+                after_count - before_count,
+            )
+        finally:
+            # The page object was handed to this callback, so it is closed
+            # here whether or not the extraction succeeded.
+            await browser_page.close()
+
+        ordered_urls = sorted(all_product_urls)
+        found_new_urls = after_count != before_count
+
+        if found_new_urls or page_number <= 1:
+            # Keep paginating: carry the accumulated URLs to the next page.
+            following_page = page_number + 1
+            yield scrapy.Request(
+                url=f"{self.base_category_url}?page={following_page}",
+                meta={
+                    "playwright": True,
+                    "playwright_include_page": True,
+                    "playwright_page_methods": [
+                        PageMethod("wait_for_load_state", "domcontentloaded"),
+                        PageMethod("wait_for_timeout", 2500),
+                    ],
+                    "page_number": following_page,
+                    "all_product_urls": ordered_urls,
+                },
+                callback=self.parse_category_page,
+            )
+            return
+
+        self.logger.info(
+            "Aucune nouvelle URL à la page %s. Fin de pagination.",
+            page_number,
+        )
+
+        with open(
+            "iga_category_product_urls.txt", "w", encoding="utf-8"
+        ) as url_file:
+            url_file.writelines(f"{url}\n" for url in ordered_urls)
+
+        for product_url in ordered_urls:
+            yield scrapy.Request(
+                url=product_url,
+                meta={
+                    "category_name": self.category_name,
+                    "source_category_url": self.base_category_url,
+                },
+                callback=self.parse_product,
+            )
+
+    async def wait_for_products_to_settle(
+        self,
+        page,
+        *,
+        max_rounds=15,
+        poll_interval_ms=700,
+        required_stable_rounds=2,
+    ):
+        """Scroll the page until the number of product-like links stops changing.
+
+        Each round scrolls to the bottom of the document, waits
+        ``poll_interval_ms`` milliseconds and counts the distinct links
+        (query string removed) whose href contains ``/fr/produits/``.
+
+        Args:
+            page: Playwright page exposing ``evaluate``, ``wait_for_timeout``
+                and ``eval_on_selector_all``.
+            max_rounds: Maximum number of scroll/poll rounds.
+            poll_interval_ms: Pause after each scroll, in milliseconds.
+            required_stable_rounds: Number of consecutive rounds with an
+                unchanged count after which the page is considered settled.
+
+        Returns:
+            None. The method only waits; it gives up silently after
+            ``max_rounds`` rounds even if the count never stabilised.
+        """
+        previous_count = -1
+        stable_rounds = 0
+
+        for _ in range(max_rounds):
+            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+            await page.wait_for_timeout(poll_interval_ms)
+
+            hrefs = await page.eval_on_selector_all(
+                "a[href]",
+                """
+                els => els
+                    .map(a => a.getAttribute('href'))
+                    .filter(h => h && h.includes('/fr/produits/'))
+                """
+            )
+
+            # The same filter is applied again on the Python side so the
+            # count does not depend on what the page callback returned.
+            current_count = len({
+                urljoin("https://www.iga.ca", h.split("?")[0])
+                for h in hrefs
+                if h and "/fr/produits/" in h
+            })
+
+            if current_count == previous_count:
+                stable_rounds += 1
+            else:
+                stable_rounds = 0
+
+            previous_count = current_count
+
+            if stable_rounds >= required_stable_rounds:
+                break
+
+    async def extract_product_urls_from_current_page(self, page):
+        """Return the set of absolute product URLs linked from ``page``.
+
+        Every ``a[href]`` whose href contains ``/fr/produits/`` is read from
+        the page. Hrefs rejected by ``is_product_url`` are dropped; the others
+        lose their query string and are resolved against
+        ``https://www.iga.ca``.
+        """
+        anchor_hrefs = await page.eval_on_selector_all(
+            "a[href]",
+            """
+            els => els
+                .map(a => a.getAttribute('href'))
+                .filter(h => h && h.includes('/fr/produits/'))
+            """
+        )
+
+        # The filter runs on the raw href, before the query string is cut.
+        return {
+            urljoin("https://www.iga.ca", raw_href.split("?")[0])
+            for raw_href in anchor_hrefs
+            if self.is_product_url(raw_href)
+        }
+
+    def parse_product(self, response):
+        """Turn the JSON-LD ``Product`` record of a product page into an item.
+
+        Yields exactly one dict. When the page carries no JSON-LD Product, the
+        dict only holds the URL, the category context from ``response.meta``
+        and an ``error`` message. Otherwise it holds the descriptive, pricing
+        and nutrition fields read from the record; absent fields are ``None``
+        (``sub_category`` and ``availability`` default to ``""``).
+
+        JSON-LD allows ``offers`` to be a list and any property to be null, so
+        nested records are normalised to a dict before being read.
+        """
+        product_data = self.extract_jsonld_product(response)
+
+        if not product_data:
+            yield {
+                "url": response.url,
+                "category_name": response.meta.get("category_name"),
+                "source_category_url": response.meta.get("source_category_url"),
+                "error": "Aucun JSON-LD Product trouvé",
+            }
+            return
+
+        offers = self._first_dict(product_data.get("offers"))
+        nutrition = self._first_dict(product_data.get("nutrition"))
+        breadcrumb = self._first_dict(product_data.get("breadcrumb"))
+        brand = product_data.get("brand")
+
+        # The second-to-last breadcrumb entry is used as the sub-category,
+        # provided the trail has at least three entries.
+        sub_category = ""
+        items = breadcrumb.get("itemListElement")
+        if (
+            isinstance(items, list)
+            and len(items) >= 3
+            and isinstance(items[-2], dict)
+        ):
+            sub_category = items[-2].get("name", "")
+
+        # "image" may be a single value or a list; keep the first entry and
+        # report None (not an empty list) when there is no image at all.
+        images = product_data.get("image")
+        if isinstance(images, list):
+            image_url = images[0] if images else None
+        else:
+            image_url = images
+        if isinstance(image_url, dict):
+            # ImageObject form: the address is stored under "url".
+            image_url = image_url.get("url")
+
+        availability = offers.get("availability", "")
+        if isinstance(availability, str):
+            availability = availability.replace("https://schema.org/", "")
+
+        yield {
+            "category_name": response.meta.get("category_name"),
+            "source_category_url": response.meta.get("source_category_url"),
+            "url": response.url,
+            "name": product_data.get("name"),
+            "sku": product_data.get("sku"),
+            "brand": brand.get("name") if isinstance(brand, dict) else brand,
+            "description": product_data.get("description"),
+            "price": offers.get("price"),
+            "price_currency": offers.get("priceCurrency"),
+            "availability": availability,
+            "product_url": offers.get("url"),
+            "image_url": image_url,
+            "serving_size": nutrition.get("servingSize"),
+            "calories": nutrition.get("calories"),
+            "fat": nutrition.get("fatContent"),
+            "saturated_fat": nutrition.get("saturatedFatContent"),
+            "sodium": nutrition.get("sodiumContent"),
+            "carbohydrate": nutrition.get("carbohydrateContent"),
+            "fiber": nutrition.get("fiberContent"),
+            "sugar": nutrition.get("sugarContent"),
+            "protein": nutrition.get("proteinContent"),
+            "calcium": nutrition.get("calciumContent"),
+            "iron": nutrition.get("ironContent"),
+            "potassium": nutrition.get("potassiumContent"),
+            "sub_category": sub_category,
+        }
+
+    @staticmethod
+    def _first_dict(value):
+        """Return ``value`` if it is a dict, the first dict of a list, else {}."""
+        if isinstance(value, dict):
+            return value
+        if isinstance(value, list):
+            return next((v for v in value if isinstance(v, dict)), {})
+        return {}
+
+    def extract_jsonld_product(self, response):
+        """Return the first JSON-LD node of type ``Product`` found in the page.
+
+        Every ``<script type="application/ld+json">`` is parsed in document
+        order. A script may hold a single object, a list of objects, or an
+        object wrapping its nodes in ``@graph``; ``@type`` may be a string or
+        a list of strings. Scripts that are not valid JSON are skipped.
+
+        Returns:
+            The matching node as a dict, or ``None`` when the page has none.
+        """
+        scripts = response.css('script[type="application/ld+json"]::text').getall()
+
+        for script in scripts:
+            try:
+                data = json.loads(script)
+            except (ValueError, RecursionError):
+                # Malformed or pathologically nested script: best effort,
+                # try the next one instead of failing the whole page.
+                continue
+
+            for node in self._iter_jsonld_nodes(data):
+                if self._is_product_node(node):
+                    return node
+
+        return None
+
+    @classmethod
+    def _iter_jsonld_nodes(cls, data):
+        """Yield every dict node of a JSON-LD payload, descending into @graph."""
+        if isinstance(data, list):
+            for entry in data:
+                yield from cls._iter_jsonld_nodes(entry)
+        elif isinstance(data, dict):
+            yield data
+            yield from cls._iter_jsonld_nodes(data.get("@graph"))
+
+    @staticmethod
+    def _is_product_node(node):
+        """Tell whether a JSON-LD node declares the ``Product`` type."""
+        node_type = node.get("@type")
+        if isinstance(node_type, list):
+            return "Product" in node_type
+        return node_type == "Product"
+
+    def is_product_url(self, href: str) -> bool:
+        """Tell whether ``href`` looks like a link to a product page.
+
+        A link qualifies when it contains ``/fr/produits/`` (case-insensitive)
+        and none of the excluded tokens: category listings, cookie/consent
+        widgets and fragment links. Matching is by plain substring on the
+        whole href, so a product whose URL happens to contain one of the
+        tokens is rejected as well.
+        """
+        if not href:
+            return False
+
+        lowered = href.lower()
+        if "/fr/produits/" not in lowered:
+            return False
+
+        blocked_tokens = (
+            "categorie",
+            "cat%C3%A9gorie",
+            "catégorie",
+            "category",
+            "cookie",
+            "consent",
+            "onetrust",
+            "#",
+        )
+        # Tokens are lowered too, so the percent-encoded one still matches.
+        return not any(token.lower() in lowered for token in blocked_tokens)
diff --git a/webscraping/pyproject.toml b/webscraping/pyproject.toml
@@ -6,7 +6,11 @@ requires-python = "==3.14.*"
 dependencies = [
     "beautifulsoup4>=4.14.3",
     "lxml>=6.0.2",
+    "pandas>=3.0.1",
+    "playwright>=1.58.0",
+    "pytest-cov>=7.1.0",
     "requests>=2.32.5",
+    "scrapy-playwright>=0.0.46",
     "selenium>=4.41.0",
     "tqdm>=4.67.3",
 ]